raid5.c revision 1f403624bde3c678a166984b1e6a727a0ce06f2b
1/* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21/* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->bm_write is the number of the last batch successfully written. 31 * conf->bm_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is bm_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46#include <linux/blkdev.h> 47#include <linux/kthread.h> 48#include <linux/async_tx.h> 49#include <linux/seq_file.h> 50#include "md.h" 51#include "raid5.h" 52#include "raid6.h" 53#include "bitmap.h" 54 55/* 56 * Stripe cache 57 */ 58 59#define NR_STRIPES 256 60#define STRIPE_SIZE PAGE_SIZE 61#define STRIPE_SHIFT (PAGE_SHIFT - 9) 62#define STRIPE_SECTORS (STRIPE_SIZE>>9) 63#define IO_THRESHOLD 1 64#define BYPASS_THRESHOLD 1 65#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 66#define HASH_MASK (NR_HASH - 1) 67 68#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) 69 70/* bio's attached to a stripe+device for I/O are linked together in bi_sector 71 * order without overlap. There may be several bio's per stripe+device, and 72 * a bio could span several devices. 73 * When walking this list for a particular stripe+device, we must never proceed 74 * beyond a bio that extends past this device, as the next bio might no longer 75 * be valid. 76 * This macro is used to determine the 'next' bio in the list, given the sector 77 * of the current stripe+device 78 */ 79#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? 
(bio)->bi_next : NULL) 80/* 81 * The following can be used to debug the driver 82 */ 83#define RAID5_PARANOIA 1 84#if RAID5_PARANOIA && defined(CONFIG_SMP) 85# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) 86#else 87# define CHECK_DEVLOCK() 88#endif 89 90#ifdef DEBUG 91#define inline 92#define __inline__ 93#endif 94 95#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) 96 97#if !RAID6_USE_EMPTY_ZERO_PAGE 98/* In .bss so it's zeroed */ 99const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); 100#endif 101 102/* 103 * We maintain a biased count of active stripes in the bottom 16 bits of 104 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 105 */ 106static inline int raid5_bi_phys_segments(struct bio *bio) 107{ 108 return bio->bi_phys_segments & 0xffff; 109} 110 111static inline int raid5_bi_hw_segments(struct bio *bio) 112{ 113 return (bio->bi_phys_segments >> 16) & 0xffff; 114} 115 116static inline int raid5_dec_bi_phys_segments(struct bio *bio) 117{ 118 --bio->bi_phys_segments; 119 return raid5_bi_phys_segments(bio); 120} 121 122static inline int raid5_dec_bi_hw_segments(struct bio *bio) 123{ 124 unsigned short val = raid5_bi_hw_segments(bio); 125 126 --val; 127 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 128 return val; 129} 130 131static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 132{ 133 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 134} 135 136/* Find first data disk in a raid6 stripe */ 137static inline int raid6_d0(struct stripe_head *sh) 138{ 139 if (sh->ddf_layout) 140 /* ddf always starts from first device */ 141 return 0; 142 /* md starts just after Q block */ 143 if (sh->qd_idx == sh->disks - 1) 144 return 0; 145 else 146 return sh->qd_idx + 1; 147} 148static inline int raid6_next_disk(int disk, int raid_disks) 149{ 150 disk++; 151 return (disk < raid_disks) ? disk : 0; 152} 153 154/* When walking through the disks in a raid6, starting at raid6_d0, 155 * we need to map each disk to a 'slot', where the data disks are slot 156 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 157 * is raid_disks-1. This helper does that mapping. 
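 * For example (md layout, not ddf): with 6 devices, pd_idx == 2 and
 * qd_idx == 3, syndrome_disks is 4 and raid6_d0() returns 4; walking the
 * devices 4, 5, 0, 1 assigns data slots 0, 1, 2, 3, while device 2 (P)
 * maps to slot 4 and device 3 (Q) to slot 5.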
158 */ 159static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 160 int *count, int syndrome_disks) 161{ 162 int slot; 163 164 if (idx == sh->pd_idx) 165 return syndrome_disks; 166 if (idx == sh->qd_idx) 167 return syndrome_disks + 1; 168 slot = (*count)++; 169 return slot; 170} 171 172static void return_io(struct bio *return_bi) 173{ 174 struct bio *bi = return_bi; 175 while (bi) { 176 177 return_bi = bi->bi_next; 178 bi->bi_next = NULL; 179 bi->bi_size = 0; 180 bio_endio(bi, 0); 181 bi = return_bi; 182 } 183} 184 185static void print_raid5_conf (raid5_conf_t *conf); 186 187static int stripe_operations_active(struct stripe_head *sh) 188{ 189 return sh->check_state || sh->reconstruct_state || 190 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 191 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 192} 193 194static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 195{ 196 if (atomic_dec_and_test(&sh->count)) { 197 BUG_ON(!list_empty(&sh->lru)); 198 BUG_ON(atomic_read(&conf->active_stripes)==0); 199 if (test_bit(STRIPE_HANDLE, &sh->state)) { 200 if (test_bit(STRIPE_DELAYED, &sh->state)) { 201 list_add_tail(&sh->lru, &conf->delayed_list); 202 blk_plug_device(conf->mddev->queue); 203 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 204 sh->bm_seq - conf->seq_write > 0) { 205 list_add_tail(&sh->lru, &conf->bitmap_list); 206 blk_plug_device(conf->mddev->queue); 207 } else { 208 clear_bit(STRIPE_BIT_DELAY, &sh->state); 209 list_add_tail(&sh->lru, &conf->handle_list); 210 } 211 md_wakeup_thread(conf->mddev->thread); 212 } else { 213 BUG_ON(stripe_operations_active(sh)); 214 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 215 atomic_dec(&conf->preread_active_stripes); 216 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 217 md_wakeup_thread(conf->mddev->thread); 218 } 219 atomic_dec(&conf->active_stripes); 220 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 221 list_add_tail(&sh->lru, &conf->inactive_list); 222 wake_up(&conf->wait_for_stripe); 223 if (conf->retry_read_aligned) 224 md_wakeup_thread(conf->mddev->thread); 225 } 226 } 227 } 228} 229 230static void release_stripe(struct stripe_head *sh) 231{ 232 raid5_conf_t *conf = sh->raid_conf; 233 unsigned long flags; 234 235 spin_lock_irqsave(&conf->device_lock, flags); 236 __release_stripe(conf, sh); 237 spin_unlock_irqrestore(&conf->device_lock, flags); 238} 239 240static inline void remove_hash(struct stripe_head *sh) 241{ 242 pr_debug("remove_hash(), stripe %llu\n", 243 (unsigned long long)sh->sector); 244 245 hlist_del_init(&sh->hash); 246} 247 248static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 249{ 250 struct hlist_head *hp = stripe_hash(conf, sh->sector); 251 252 pr_debug("insert_hash(), stripe %llu\n", 253 (unsigned long long)sh->sector); 254 255 CHECK_DEVLOCK(); 256 hlist_add_head(&sh->hash, hp); 257} 258 259 260/* find an idle stripe, make sure it is unhashed, and return it. 
*/ 261static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 262{ 263 struct stripe_head *sh = NULL; 264 struct list_head *first; 265 266 CHECK_DEVLOCK(); 267 if (list_empty(&conf->inactive_list)) 268 goto out; 269 first = conf->inactive_list.next; 270 sh = list_entry(first, struct stripe_head, lru); 271 list_del_init(first); 272 remove_hash(sh); 273 atomic_inc(&conf->active_stripes); 274out: 275 return sh; 276} 277 278static void shrink_buffers(struct stripe_head *sh, int num) 279{ 280 struct page *p; 281 int i; 282 283 for (i=0; i<num ; i++) { 284 p = sh->dev[i].page; 285 if (!p) 286 continue; 287 sh->dev[i].page = NULL; 288 put_page(p); 289 } 290} 291 292static int grow_buffers(struct stripe_head *sh, int num) 293{ 294 int i; 295 296 for (i=0; i<num; i++) { 297 struct page *page; 298 299 if (!(page = alloc_page(GFP_KERNEL))) { 300 return 1; 301 } 302 sh->dev[i].page = page; 303 } 304 return 0; 305} 306 307static void raid5_build_block(struct stripe_head *sh, int i); 308static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 309 struct stripe_head *sh); 310 311static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 312{ 313 raid5_conf_t *conf = sh->raid_conf; 314 int i; 315 316 BUG_ON(atomic_read(&sh->count) != 0); 317 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 318 BUG_ON(stripe_operations_active(sh)); 319 320 CHECK_DEVLOCK(); 321 pr_debug("init_stripe called, stripe %llu\n", 322 (unsigned long long)sh->sector); 323 324 remove_hash(sh); 325 326 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 327 sh->sector = sector; 328 stripe_set_idx(sector, conf, previous, sh); 329 sh->state = 0; 330 331 332 for (i = sh->disks; i--; ) { 333 struct r5dev *dev = &sh->dev[i]; 334 335 if (dev->toread || dev->read || dev->towrite || dev->written || 336 test_bit(R5_LOCKED, &dev->flags)) { 337 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 338 (unsigned long long)sh->sector, i, dev->toread, 339 dev->read, dev->towrite, dev->written, 340 test_bit(R5_LOCKED, &dev->flags)); 341 BUG(); 342 } 343 dev->flags = 0; 344 raid5_build_block(sh, i); 345 } 346 insert_hash(conf, sh); 347} 348 349static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) 350{ 351 struct stripe_head *sh; 352 struct hlist_node *hn; 353 354 CHECK_DEVLOCK(); 355 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 356 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 357 if (sh->sector == sector && sh->disks == disks) 358 return sh; 359 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 360 return NULL; 361} 362 363static void unplug_slaves(mddev_t *mddev); 364static void raid5_unplug_device(struct request_queue *q); 365 366static struct stripe_head * 367get_active_stripe(raid5_conf_t *conf, sector_t sector, 368 int previous, int noblock) 369{ 370 struct stripe_head *sh; 371 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 372 373 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 374 375 spin_lock_irq(&conf->device_lock); 376 377 do { 378 wait_event_lock_irq(conf->wait_for_stripe, 379 conf->quiesce == 0, 380 conf->device_lock, /* nothing */); 381 sh = __find_stripe(conf, sector, disks); 382 if (!sh) { 383 if (!conf->inactive_blocked) 384 sh = get_free_stripe(conf); 385 if (noblock && sh == NULL) 386 break; 387 if (!sh) { 388 conf->inactive_blocked = 1; 389 wait_event_lock_irq(conf->wait_for_stripe, 390 !list_empty(&conf->inactive_list) && 391 (atomic_read(&conf->active_stripes) 392 < (conf->max_nr_stripes *3/4) 393 || !conf->inactive_blocked), 394 conf->device_lock, 395 raid5_unplug_device(conf->mddev->queue) 396 ); 397 conf->inactive_blocked = 0; 398 } else 399 init_stripe(sh, sector, previous); 400 } else { 401 if (atomic_read(&sh->count)) { 402 BUG_ON(!list_empty(&sh->lru)); 403 } else { 404 if (!test_bit(STRIPE_HANDLE, &sh->state)) 405 atomic_inc(&conf->active_stripes); 406 if (list_empty(&sh->lru) && 407 !test_bit(STRIPE_EXPANDING, &sh->state)) 408 BUG(); 409 list_del_init(&sh->lru); 410 } 411 } 412 } while (sh == NULL); 413 414 if (sh) 415 atomic_inc(&sh->count); 416 417 spin_unlock_irq(&conf->device_lock); 418 return sh; 419} 420 421static void 422raid5_end_read_request(struct bio *bi, int error); 423static void 424raid5_end_write_request(struct bio *bi, int error); 425 426static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 427{ 428 raid5_conf_t *conf = sh->raid_conf; 429 int i, disks = sh->disks; 430 431 might_sleep(); 432 433 for (i = disks; i--; ) { 434 int rw; 435 struct bio *bi; 436 mdk_rdev_t *rdev; 437 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 438 rw = WRITE; 439 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 440 rw = READ; 441 else 442 continue; 443 444 bi = &sh->dev[i].req; 445 446 bi->bi_rw = rw; 447 if (rw == WRITE) 448 bi->bi_end_io = raid5_end_write_request; 449 else 450 bi->bi_end_io = raid5_end_read_request; 451 452 rcu_read_lock(); 453 rdev = rcu_dereference(conf->disks[i].rdev); 454 if (rdev && test_bit(Faulty, &rdev->flags)) 455 rdev = NULL; 456 if (rdev) 457 atomic_inc(&rdev->nr_pending); 458 rcu_read_unlock(); 459 460 if (rdev) { 461 if (s->syncing || s->expanding || s->expanded) 462 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 463 464 set_bit(STRIPE_IO_STARTED, &sh->state); 465 466 bi->bi_bdev = rdev->bdev; 467 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 468 __func__, (unsigned long long)sh->sector, 469 bi->bi_rw, i); 470 atomic_inc(&sh->count); 471 bi->bi_sector = sh->sector + rdev->data_offset; 472 bi->bi_flags = 1 << BIO_UPTODATE; 473 bi->bi_vcnt = 1; 474 bi->bi_max_vecs = 1; 475 bi->bi_idx = 0; 476 bi->bi_io_vec = &sh->dev[i].vec; 477 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 478 bi->bi_io_vec[0].bv_offset = 0; 479 bi->bi_size = STRIPE_SIZE; 480 bi->bi_next = NULL; 481 if (rw == WRITE && 482 test_bit(R5_ReWrite, &sh->dev[i].flags)) 483 atomic_add(STRIPE_SECTORS, 484 &rdev->corrected_errors); 485 generic_make_request(bi); 486 } else { 487 if (rw == WRITE) 488 set_bit(STRIPE_DEGRADED, &sh->state); 489 pr_debug("skip op %ld on disc %d for sector %llu\n", 490 bi->bi_rw, i, (unsigned long long)sh->sector); 491 clear_bit(R5_LOCKED, &sh->dev[i].flags); 492 set_bit(STRIPE_HANDLE, &sh->state); 493 } 494 } 495} 496 497static struct dma_async_tx_descriptor * 498async_copy_data(int frombio, struct bio *bio, struct page *page, 499 sector_t sector, struct 
dma_async_tx_descriptor *tx) 500{ 501 struct bio_vec *bvl; 502 struct page *bio_page; 503 int i; 504 int page_offset; 505 506 if (bio->bi_sector >= sector) 507 page_offset = (signed)(bio->bi_sector - sector) * 512; 508 else 509 page_offset = (signed)(sector - bio->bi_sector) * -512; 510 bio_for_each_segment(bvl, bio, i) { 511 int len = bio_iovec_idx(bio, i)->bv_len; 512 int clen; 513 int b_offset = 0; 514 515 if (page_offset < 0) { 516 b_offset = -page_offset; 517 page_offset += b_offset; 518 len -= b_offset; 519 } 520 521 if (len > 0 && page_offset + len > STRIPE_SIZE) 522 clen = STRIPE_SIZE - page_offset; 523 else 524 clen = len; 525 526 if (clen > 0) { 527 b_offset += bio_iovec_idx(bio, i)->bv_offset; 528 bio_page = bio_iovec_idx(bio, i)->bv_page; 529 if (frombio) 530 tx = async_memcpy(page, bio_page, page_offset, 531 b_offset, clen, 532 ASYNC_TX_DEP_ACK, 533 tx, NULL, NULL); 534 else 535 tx = async_memcpy(bio_page, page, b_offset, 536 page_offset, clen, 537 ASYNC_TX_DEP_ACK, 538 tx, NULL, NULL); 539 } 540 if (clen < len) /* hit end of page */ 541 break; 542 page_offset += len; 543 } 544 545 return tx; 546} 547 548static void ops_complete_biofill(void *stripe_head_ref) 549{ 550 struct stripe_head *sh = stripe_head_ref; 551 struct bio *return_bi = NULL; 552 raid5_conf_t *conf = sh->raid_conf; 553 int i; 554 555 pr_debug("%s: stripe %llu\n", __func__, 556 (unsigned long long)sh->sector); 557 558 /* clear completed biofills */ 559 spin_lock_irq(&conf->device_lock); 560 for (i = sh->disks; i--; ) { 561 struct r5dev *dev = &sh->dev[i]; 562 563 /* acknowledge completion of a biofill operation */ 564 /* and check if we need to reply to a read request, 565 * new R5_Wantfill requests are held off until 566 * !STRIPE_BIOFILL_RUN 567 */ 568 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 569 struct bio *rbi, *rbi2; 570 571 BUG_ON(!dev->read); 572 rbi = dev->read; 573 dev->read = NULL; 574 while (rbi && rbi->bi_sector < 575 dev->sector + STRIPE_SECTORS) { 576 rbi2 = r5_next_bio(rbi, dev->sector); 577 if (!raid5_dec_bi_phys_segments(rbi)) { 578 rbi->bi_next = return_bi; 579 return_bi = rbi; 580 } 581 rbi = rbi2; 582 } 583 } 584 } 585 spin_unlock_irq(&conf->device_lock); 586 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 587 588 return_io(return_bi); 589 590 set_bit(STRIPE_HANDLE, &sh->state); 591 release_stripe(sh); 592} 593 594static void ops_run_biofill(struct stripe_head *sh) 595{ 596 struct dma_async_tx_descriptor *tx = NULL; 597 raid5_conf_t *conf = sh->raid_conf; 598 int i; 599 600 pr_debug("%s: stripe %llu\n", __func__, 601 (unsigned long long)sh->sector); 602 603 for (i = sh->disks; i--; ) { 604 struct r5dev *dev = &sh->dev[i]; 605 if (test_bit(R5_Wantfill, &dev->flags)) { 606 struct bio *rbi; 607 spin_lock_irq(&conf->device_lock); 608 dev->read = rbi = dev->toread; 609 dev->toread = NULL; 610 spin_unlock_irq(&conf->device_lock); 611 while (rbi && rbi->bi_sector < 612 dev->sector + STRIPE_SECTORS) { 613 tx = async_copy_data(0, rbi, dev->page, 614 dev->sector, tx); 615 rbi = r5_next_bio(rbi, dev->sector); 616 } 617 } 618 } 619 620 atomic_inc(&sh->count); 621 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 622 ops_complete_biofill, sh); 623} 624 625static void ops_complete_compute5(void *stripe_head_ref) 626{ 627 struct stripe_head *sh = stripe_head_ref; 628 int target = sh->ops.target; 629 struct r5dev *tgt = &sh->dev[target]; 630 631 pr_debug("%s: stripe %llu\n", __func__, 632 (unsigned long long)sh->sector); 633 634 set_bit(R5_UPTODATE, &tgt->flags); 635 
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 636 clear_bit(R5_Wantcompute, &tgt->flags); 637 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 638 if (sh->check_state == check_state_compute_run) 639 sh->check_state = check_state_compute_result; 640 set_bit(STRIPE_HANDLE, &sh->state); 641 release_stripe(sh); 642} 643 644static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 645{ 646 /* kernel stack size limits the total number of disks */ 647 int disks = sh->disks; 648 struct page *xor_srcs[disks]; 649 int target = sh->ops.target; 650 struct r5dev *tgt = &sh->dev[target]; 651 struct page *xor_dest = tgt->page; 652 int count = 0; 653 struct dma_async_tx_descriptor *tx; 654 int i; 655 656 pr_debug("%s: stripe %llu block: %d\n", 657 __func__, (unsigned long long)sh->sector, target); 658 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 659 660 for (i = disks; i--; ) 661 if (i != target) 662 xor_srcs[count++] = sh->dev[i].page; 663 664 atomic_inc(&sh->count); 665 666 if (unlikely(count == 1)) 667 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 668 0, NULL, ops_complete_compute5, sh); 669 else 670 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 671 ASYNC_TX_XOR_ZERO_DST, NULL, 672 ops_complete_compute5, sh); 673 674 return tx; 675} 676 677static void ops_complete_prexor(void *stripe_head_ref) 678{ 679 struct stripe_head *sh = stripe_head_ref; 680 681 pr_debug("%s: stripe %llu\n", __func__, 682 (unsigned long long)sh->sector); 683} 684 685static struct dma_async_tx_descriptor * 686ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 687{ 688 /* kernel stack size limits the total number of disks */ 689 int disks = sh->disks; 690 struct page *xor_srcs[disks]; 691 int count = 0, pd_idx = sh->pd_idx, i; 692 693 /* existing parity data subtracted */ 694 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 695 696 pr_debug("%s: stripe %llu\n", __func__, 697 (unsigned long long)sh->sector); 698 699 for (i = disks; i--; ) { 700 struct r5dev *dev = &sh->dev[i]; 701 /* Only process blocks that are known to be uptodate */ 702 if (test_bit(R5_Wantdrain, &dev->flags)) 703 xor_srcs[count++] = dev->page; 704 } 705 706 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 707 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 708 ops_complete_prexor, sh); 709 710 return tx; 711} 712 713static struct dma_async_tx_descriptor * 714ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 715{ 716 int disks = sh->disks; 717 int i; 718 719 pr_debug("%s: stripe %llu\n", __func__, 720 (unsigned long long)sh->sector); 721 722 for (i = disks; i--; ) { 723 struct r5dev *dev = &sh->dev[i]; 724 struct bio *chosen; 725 726 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 727 struct bio *wbi; 728 729 spin_lock(&sh->lock); 730 chosen = dev->towrite; 731 dev->towrite = NULL; 732 BUG_ON(dev->written); 733 wbi = dev->written = chosen; 734 spin_unlock(&sh->lock); 735 736 while (wbi && wbi->bi_sector < 737 dev->sector + STRIPE_SECTORS) { 738 tx = async_copy_data(1, wbi, dev->page, 739 dev->sector, tx); 740 wbi = r5_next_bio(wbi, dev->sector); 741 } 742 } 743 } 744 745 return tx; 746} 747 748static void ops_complete_postxor(void *stripe_head_ref) 749{ 750 struct stripe_head *sh = stripe_head_ref; 751 int disks = sh->disks, i, pd_idx = sh->pd_idx; 752 753 pr_debug("%s: stripe %llu\n", __func__, 754 (unsigned long long)sh->sector); 755 756 for (i = disks; i--; ) { 757 struct r5dev *dev = &sh->dev[i]; 758 if (dev->written || i == pd_idx) 
759 set_bit(R5_UPTODATE, &dev->flags); 760 } 761 762 if (sh->reconstruct_state == reconstruct_state_drain_run) 763 sh->reconstruct_state = reconstruct_state_drain_result; 764 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 765 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 766 else { 767 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 768 sh->reconstruct_state = reconstruct_state_result; 769 } 770 771 set_bit(STRIPE_HANDLE, &sh->state); 772 release_stripe(sh); 773} 774 775static void 776ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 777{ 778 /* kernel stack size limits the total number of disks */ 779 int disks = sh->disks; 780 struct page *xor_srcs[disks]; 781 782 int count = 0, pd_idx = sh->pd_idx, i; 783 struct page *xor_dest; 784 int prexor = 0; 785 unsigned long flags; 786 787 pr_debug("%s: stripe %llu\n", __func__, 788 (unsigned long long)sh->sector); 789 790 /* check if prexor is active which means only process blocks 791 * that are part of a read-modify-write (written) 792 */ 793 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 794 prexor = 1; 795 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 796 for (i = disks; i--; ) { 797 struct r5dev *dev = &sh->dev[i]; 798 if (dev->written) 799 xor_srcs[count++] = dev->page; 800 } 801 } else { 802 xor_dest = sh->dev[pd_idx].page; 803 for (i = disks; i--; ) { 804 struct r5dev *dev = &sh->dev[i]; 805 if (i != pd_idx) 806 xor_srcs[count++] = dev->page; 807 } 808 } 809 810 /* 1/ if we prexor'd then the dest is reused as a source 811 * 2/ if we did not prexor then we are redoing the parity 812 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 813 * for the synchronous xor case 814 */ 815 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 816 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 817 818 atomic_inc(&sh->count); 819 820 if (unlikely(count == 1)) { 821 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 822 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 823 flags, tx, ops_complete_postxor, sh); 824 } else 825 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 826 flags, tx, ops_complete_postxor, sh); 827} 828 829static void ops_complete_check(void *stripe_head_ref) 830{ 831 struct stripe_head *sh = stripe_head_ref; 832 833 pr_debug("%s: stripe %llu\n", __func__, 834 (unsigned long long)sh->sector); 835 836 sh->check_state = check_state_check_result; 837 set_bit(STRIPE_HANDLE, &sh->state); 838 release_stripe(sh); 839} 840 841static void ops_run_check(struct stripe_head *sh) 842{ 843 /* kernel stack size limits the total number of disks */ 844 int disks = sh->disks; 845 struct page *xor_srcs[disks]; 846 struct dma_async_tx_descriptor *tx; 847 848 int count = 0, pd_idx = sh->pd_idx, i; 849 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 850 851 pr_debug("%s: stripe %llu\n", __func__, 852 (unsigned long long)sh->sector); 853 854 for (i = disks; i--; ) { 855 struct r5dev *dev = &sh->dev[i]; 856 if (i != pd_idx) 857 xor_srcs[count++] = dev->page; 858 } 859 860 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 861 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 862 863 atomic_inc(&sh->count); 864 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 865 ops_complete_check, sh); 866} 867 868static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 869{ 870 int overlap_clear = 0, i, disks = sh->disks; 871 struct dma_async_tx_descriptor *tx = NULL; 872 873 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 874 ops_run_biofill(sh); 875 overlap_clear++; 876 } 877 878 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 879 tx = ops_run_compute5(sh); 880 /* terminate the chain if postxor is not set to be run */ 881 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 882 async_tx_ack(tx); 883 } 884 885 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 886 tx = ops_run_prexor(sh, tx); 887 888 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 889 tx = ops_run_biodrain(sh, tx); 890 overlap_clear++; 891 } 892 893 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 894 ops_run_postxor(sh, tx); 895 896 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 897 ops_run_check(sh); 898 899 if (overlap_clear) 900 for (i = disks; i--; ) { 901 struct r5dev *dev = &sh->dev[i]; 902 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 903 wake_up(&sh->raid_conf->wait_for_overlap); 904 } 905} 906 907static int grow_one_stripe(raid5_conf_t *conf) 908{ 909 struct stripe_head *sh; 910 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 911 if (!sh) 912 return 0; 913 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); 914 sh->raid_conf = conf; 915 spin_lock_init(&sh->lock); 916 917 if (grow_buffers(sh, conf->raid_disks)) { 918 shrink_buffers(sh, conf->raid_disks); 919 kmem_cache_free(conf->slab_cache, sh); 920 return 0; 921 } 922 sh->disks = conf->raid_disks; 923 /* we just created an active stripe so... 
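 * the count is set to 1 and active_stripes is raised here, then
 * release_stripe() drops both again and parks the stripe on the
 * inactive list, ready for use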
*/ 924 atomic_set(&sh->count, 1); 925 atomic_inc(&conf->active_stripes); 926 INIT_LIST_HEAD(&sh->lru); 927 release_stripe(sh); 928 return 1; 929} 930 931static int grow_stripes(raid5_conf_t *conf, int num) 932{ 933 struct kmem_cache *sc; 934 int devs = conf->raid_disks; 935 936 sprintf(conf->cache_name[0], 937 "raid%d-%s", conf->level, mdname(conf->mddev)); 938 sprintf(conf->cache_name[1], 939 "raid%d-%s-alt", conf->level, mdname(conf->mddev)); 940 conf->active_name = 0; 941 sc = kmem_cache_create(conf->cache_name[conf->active_name], 942 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 943 0, 0, NULL); 944 if (!sc) 945 return 1; 946 conf->slab_cache = sc; 947 conf->pool_size = devs; 948 while (num--) 949 if (!grow_one_stripe(conf)) 950 return 1; 951 return 0; 952} 953 954#ifdef CONFIG_MD_RAID5_RESHAPE 955static int resize_stripes(raid5_conf_t *conf, int newsize) 956{ 957 /* Make all the stripes able to hold 'newsize' devices. 958 * New slots in each stripe get 'page' set to a new page. 959 * 960 * This happens in stages: 961 * 1/ create a new kmem_cache and allocate the required number of 962 * stripe_heads. 963 * 2/ gather all the old stripe_heads and transfer the pages across 964 * to the new stripe_heads. This will have the side effect of 965 * freezing the array as once all stripe_heads have been collected, 966 * no IO will be possible. Old stripe heads are freed once their 967 * pages have been transferred over, and the old kmem_cache is 968 * freed when all stripes are done. 969 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 970 * we simply return a failure status - no need to clean anything up. 971 * 4/ allocate new pages for the new slots in the new stripe_heads. 972 * If this fails, we don't bother trying to shrink the 973 * stripe_heads down again, we just leave them as they are. 974 * As each stripe_head is processed the new one is released into 975 * active service. 976 * 977 * Once step2 is started, we cannot afford to wait for a write, 978 * so we use GFP_NOIO allocations. 979 */ 980 struct stripe_head *osh, *nsh; 981 LIST_HEAD(newstripes); 982 struct disk_info *ndisks; 983 int err; 984 struct kmem_cache *sc; 985 int i; 986 987 if (newsize <= conf->pool_size) 988 return 0; /* never bother to shrink */ 989 990 err = md_allow_write(conf->mddev); 991 if (err) 992 return err; 993 994 /* Step 1 */ 995 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 996 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 997 0, 0, NULL); 998 if (!sc) 999 return -ENOMEM; 1000 1001 for (i = conf->max_nr_stripes; i; i--) { 1002 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1003 if (!nsh) 1004 break; 1005 1006 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); 1007 1008 nsh->raid_conf = conf; 1009 spin_lock_init(&nsh->lock); 1010 1011 list_add(&nsh->lru, &newstripes); 1012 } 1013 if (i) { 1014 /* didn't get enough, give up */ 1015 while (!list_empty(&newstripes)) { 1016 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1017 list_del(&nsh->lru); 1018 kmem_cache_free(sc, nsh); 1019 } 1020 kmem_cache_destroy(sc); 1021 return -ENOMEM; 1022 } 1023 /* Step 2 - Must use GFP_NOIO now. 
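 * (an allocation that is allowed to wait for writeback could end up needing
 * to write to this array, which is about to be frozen while we hold all of
 * its stripes, and so deadlock)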
1024 * OK, we have enough stripes, start collecting inactive 1025 * stripes and copying them over 1026 */ 1027 list_for_each_entry(nsh, &newstripes, lru) { 1028 spin_lock_irq(&conf->device_lock); 1029 wait_event_lock_irq(conf->wait_for_stripe, 1030 !list_empty(&conf->inactive_list), 1031 conf->device_lock, 1032 unplug_slaves(conf->mddev) 1033 ); 1034 osh = get_free_stripe(conf); 1035 spin_unlock_irq(&conf->device_lock); 1036 atomic_set(&nsh->count, 1); 1037 for(i=0; i<conf->pool_size; i++) 1038 nsh->dev[i].page = osh->dev[i].page; 1039 for( ; i<newsize; i++) 1040 nsh->dev[i].page = NULL; 1041 kmem_cache_free(conf->slab_cache, osh); 1042 } 1043 kmem_cache_destroy(conf->slab_cache); 1044 1045 /* Step 3. 1046 * At this point, we are holding all the stripes so the array 1047 * is completely stalled, so now is a good time to resize 1048 * conf->disks. 1049 */ 1050 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1051 if (ndisks) { 1052 for (i=0; i<conf->raid_disks; i++) 1053 ndisks[i] = conf->disks[i]; 1054 kfree(conf->disks); 1055 conf->disks = ndisks; 1056 } else 1057 err = -ENOMEM; 1058 1059 /* Step 4, return new stripes to service */ 1060 while(!list_empty(&newstripes)) { 1061 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1062 list_del_init(&nsh->lru); 1063 for (i=conf->raid_disks; i < newsize; i++) 1064 if (nsh->dev[i].page == NULL) { 1065 struct page *p = alloc_page(GFP_NOIO); 1066 nsh->dev[i].page = p; 1067 if (!p) 1068 err = -ENOMEM; 1069 } 1070 release_stripe(nsh); 1071 } 1072 /* critical section pass, GFP_NOIO no longer needed */ 1073 1074 conf->slab_cache = sc; 1075 conf->active_name = 1-conf->active_name; 1076 conf->pool_size = newsize; 1077 return err; 1078} 1079#endif 1080 1081static int drop_one_stripe(raid5_conf_t *conf) 1082{ 1083 struct stripe_head *sh; 1084 1085 spin_lock_irq(&conf->device_lock); 1086 sh = get_free_stripe(conf); 1087 spin_unlock_irq(&conf->device_lock); 1088 if (!sh) 1089 return 0; 1090 BUG_ON(atomic_read(&sh->count)); 1091 shrink_buffers(sh, conf->pool_size); 1092 kmem_cache_free(conf->slab_cache, sh); 1093 atomic_dec(&conf->active_stripes); 1094 return 1; 1095} 1096 1097static void shrink_stripes(raid5_conf_t *conf) 1098{ 1099 while (drop_one_stripe(conf)) 1100 ; 1101 1102 if (conf->slab_cache) 1103 kmem_cache_destroy(conf->slab_cache); 1104 conf->slab_cache = NULL; 1105} 1106 1107static void raid5_end_read_request(struct bio * bi, int error) 1108{ 1109 struct stripe_head *sh = bi->bi_private; 1110 raid5_conf_t *conf = sh->raid_conf; 1111 int disks = sh->disks, i; 1112 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1113 char b[BDEVNAME_SIZE]; 1114 mdk_rdev_t *rdev; 1115 1116 1117 for (i=0 ; i<disks; i++) 1118 if (bi == &sh->dev[i].req) 1119 break; 1120 1121 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1122 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1123 uptodate); 1124 if (i == disks) { 1125 BUG(); 1126 return; 1127 } 1128 1129 if (uptodate) { 1130 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1131 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1132 rdev = conf->disks[i].rdev; 1133 printk_rl(KERN_INFO "raid5:%s: read error corrected" 1134 " (%lu sectors at %llu on %s)\n", 1135 mdname(conf->mddev), STRIPE_SECTORS, 1136 (unsigned long long)(sh->sector 1137 + rdev->data_offset), 1138 bdevname(rdev->bdev, b)); 1139 clear_bit(R5_ReadError, &sh->dev[i].flags); 1140 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1141 } 1142 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1143 
atomic_set(&conf->disks[i].rdev->read_errors, 0); 1144 } else { 1145 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1146 int retry = 0; 1147 rdev = conf->disks[i].rdev; 1148 1149 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1150 atomic_inc(&rdev->read_errors); 1151 if (conf->mddev->degraded) 1152 printk_rl(KERN_WARNING 1153 "raid5:%s: read error not correctable " 1154 "(sector %llu on %s).\n", 1155 mdname(conf->mddev), 1156 (unsigned long long)(sh->sector 1157 + rdev->data_offset), 1158 bdn); 1159 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1160 /* Oh, no!!! */ 1161 printk_rl(KERN_WARNING 1162 "raid5:%s: read error NOT corrected!! " 1163 "(sector %llu on %s).\n", 1164 mdname(conf->mddev), 1165 (unsigned long long)(sh->sector 1166 + rdev->data_offset), 1167 bdn); 1168 else if (atomic_read(&rdev->read_errors) 1169 > conf->max_nr_stripes) 1170 printk(KERN_WARNING 1171 "raid5:%s: Too many read errors, failing device %s.\n", 1172 mdname(conf->mddev), bdn); 1173 else 1174 retry = 1; 1175 if (retry) 1176 set_bit(R5_ReadError, &sh->dev[i].flags); 1177 else { 1178 clear_bit(R5_ReadError, &sh->dev[i].flags); 1179 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1180 md_error(conf->mddev, rdev); 1181 } 1182 } 1183 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1184 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1185 set_bit(STRIPE_HANDLE, &sh->state); 1186 release_stripe(sh); 1187} 1188 1189static void raid5_end_write_request(struct bio *bi, int error) 1190{ 1191 struct stripe_head *sh = bi->bi_private; 1192 raid5_conf_t *conf = sh->raid_conf; 1193 int disks = sh->disks, i; 1194 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1195 1196 for (i=0 ; i<disks; i++) 1197 if (bi == &sh->dev[i].req) 1198 break; 1199 1200 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1201 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1202 uptodate); 1203 if (i == disks) { 1204 BUG(); 1205 return; 1206 } 1207 1208 if (!uptodate) 1209 md_error(conf->mddev, conf->disks[i].rdev); 1210 1211 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1212 1213 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1214 set_bit(STRIPE_HANDLE, &sh->state); 1215 release_stripe(sh); 1216} 1217 1218 1219static sector_t compute_blocknr(struct stripe_head *sh, int i); 1220 1221static void raid5_build_block(struct stripe_head *sh, int i) 1222{ 1223 struct r5dev *dev = &sh->dev[i]; 1224 1225 bio_init(&dev->req); 1226 dev->req.bi_io_vec = &dev->vec; 1227 dev->req.bi_vcnt++; 1228 dev->req.bi_max_vecs++; 1229 dev->vec.bv_page = dev->page; 1230 dev->vec.bv_len = STRIPE_SIZE; 1231 dev->vec.bv_offset = 0; 1232 1233 dev->req.bi_sector = sh->sector; 1234 dev->req.bi_private = sh; 1235 1236 dev->flags = 0; 1237 dev->sector = compute_blocknr(sh, i); 1238} 1239 1240static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1241{ 1242 char b[BDEVNAME_SIZE]; 1243 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1244 pr_debug("raid5: error called\n"); 1245 1246 if (!test_bit(Faulty, &rdev->flags)) { 1247 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1248 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1249 unsigned long flags; 1250 spin_lock_irqsave(&conf->device_lock, flags); 1251 mddev->degraded++; 1252 spin_unlock_irqrestore(&conf->device_lock, flags); 1253 /* 1254 * if recovery was running, make sure it aborts. 
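 * (the resync/recovery thread checks MD_RECOVERY_INTR and stops cleanly)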
1255 */ 1256 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1257 } 1258 set_bit(Faulty, &rdev->flags); 1259 printk(KERN_ALERT 1260 "raid5: Disk failure on %s, disabling device.\n" 1261 "raid5: Operation continuing on %d devices.\n", 1262 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1263 } 1264} 1265 1266/* 1267 * Input: a 'big' sector number, 1268 * Output: index of the data and parity disk, and the sector # in them. 1269 */ 1270static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, 1271 int previous, int *dd_idx, 1272 struct stripe_head *sh) 1273{ 1274 long stripe; 1275 unsigned long chunk_number; 1276 unsigned int chunk_offset; 1277 int pd_idx, qd_idx; 1278 int ddf_layout = 0; 1279 sector_t new_sector; 1280 int sectors_per_chunk = conf->chunk_size >> 9; 1281 int raid_disks = previous ? conf->previous_raid_disks 1282 : conf->raid_disks; 1283 int data_disks = raid_disks - conf->max_degraded; 1284 1285 /* First compute the information on this sector */ 1286 1287 /* 1288 * Compute the chunk number and the sector offset inside the chunk 1289 */ 1290 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1291 chunk_number = r_sector; 1292 BUG_ON(r_sector != chunk_number); 1293 1294 /* 1295 * Compute the stripe number 1296 */ 1297 stripe = chunk_number / data_disks; 1298 1299 /* 1300 * Compute the data disk and parity disk indexes inside the stripe 1301 */ 1302 *dd_idx = chunk_number % data_disks; 1303 1304 /* 1305 * Select the parity disk based on the user selected algorithm. 1306 */ 1307 pd_idx = qd_idx = ~0; 1308 switch(conf->level) { 1309 case 4: 1310 pd_idx = data_disks; 1311 break; 1312 case 5: 1313 switch (conf->algorithm) { 1314 case ALGORITHM_LEFT_ASYMMETRIC: 1315 pd_idx = data_disks - stripe % raid_disks; 1316 if (*dd_idx >= pd_idx) 1317 (*dd_idx)++; 1318 break; 1319 case ALGORITHM_RIGHT_ASYMMETRIC: 1320 pd_idx = stripe % raid_disks; 1321 if (*dd_idx >= pd_idx) 1322 (*dd_idx)++; 1323 break; 1324 case ALGORITHM_LEFT_SYMMETRIC: 1325 pd_idx = data_disks - stripe % raid_disks; 1326 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1327 break; 1328 case ALGORITHM_RIGHT_SYMMETRIC: 1329 pd_idx = stripe % raid_disks; 1330 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1331 break; 1332 case ALGORITHM_PARITY_0: 1333 pd_idx = 0; 1334 (*dd_idx)++; 1335 break; 1336 case ALGORITHM_PARITY_N: 1337 pd_idx = data_disks; 1338 break; 1339 default: 1340 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1341 conf->algorithm); 1342 BUG(); 1343 } 1344 break; 1345 case 6: 1346 1347 switch (conf->algorithm) { 1348 case ALGORITHM_LEFT_ASYMMETRIC: 1349 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1350 qd_idx = pd_idx + 1; 1351 if (pd_idx == raid_disks-1) { 1352 (*dd_idx)++; /* Q D D D P */ 1353 qd_idx = 0; 1354 } else if (*dd_idx >= pd_idx) 1355 (*dd_idx) += 2; /* D D P Q D */ 1356 break; 1357 case ALGORITHM_RIGHT_ASYMMETRIC: 1358 pd_idx = stripe % raid_disks; 1359 qd_idx = pd_idx + 1; 1360 if (pd_idx == raid_disks-1) { 1361 (*dd_idx)++; /* Q D D D P */ 1362 qd_idx = 0; 1363 } else if (*dd_idx >= pd_idx) 1364 (*dd_idx) += 2; /* D D P Q D */ 1365 break; 1366 case ALGORITHM_LEFT_SYMMETRIC: 1367 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1368 qd_idx = (pd_idx + 1) % raid_disks; 1369 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1370 break; 1371 case ALGORITHM_RIGHT_SYMMETRIC: 1372 pd_idx = stripe % raid_disks; 1373 qd_idx = (pd_idx + 1) % raid_disks; 1374 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1375 break; 1376 1377 case ALGORITHM_PARITY_0: 1378 pd_idx = 0; 1379 
qd_idx = 1; 1380 (*dd_idx) += 2; 1381 break; 1382 case ALGORITHM_PARITY_N: 1383 pd_idx = data_disks; 1384 qd_idx = data_disks + 1; 1385 break; 1386 1387 case ALGORITHM_ROTATING_ZERO_RESTART: 1388 /* Exactly the same as RIGHT_ASYMMETRIC, but or 1389 * of blocks for computing Q is different. 1390 */ 1391 pd_idx = stripe % raid_disks; 1392 qd_idx = pd_idx + 1; 1393 if (pd_idx == raid_disks-1) { 1394 (*dd_idx)++; /* Q D D D P */ 1395 qd_idx = 0; 1396 } else if (*dd_idx >= pd_idx) 1397 (*dd_idx) += 2; /* D D P Q D */ 1398 ddf_layout = 1; 1399 break; 1400 1401 case ALGORITHM_ROTATING_N_RESTART: 1402 /* Same a left_asymmetric, by first stripe is 1403 * D D D P Q rather than 1404 * Q D D D P 1405 */ 1406 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); 1407 qd_idx = pd_idx + 1; 1408 if (pd_idx == raid_disks-1) { 1409 (*dd_idx)++; /* Q D D D P */ 1410 qd_idx = 0; 1411 } else if (*dd_idx >= pd_idx) 1412 (*dd_idx) += 2; /* D D P Q D */ 1413 ddf_layout = 1; 1414 break; 1415 1416 case ALGORITHM_ROTATING_N_CONTINUE: 1417 /* Same as left_symmetric but Q is before P */ 1418 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1419 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 1420 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1421 ddf_layout = 1; 1422 break; 1423 1424 case ALGORITHM_LEFT_ASYMMETRIC_6: 1425 /* RAID5 left_asymmetric, with Q on last device */ 1426 pd_idx = data_disks - stripe % (raid_disks-1); 1427 if (*dd_idx >= pd_idx) 1428 (*dd_idx)++; 1429 qd_idx = raid_disks - 1; 1430 break; 1431 1432 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1433 pd_idx = stripe % (raid_disks-1); 1434 if (*dd_idx >= pd_idx) 1435 (*dd_idx)++; 1436 qd_idx = raid_disks - 1; 1437 break; 1438 1439 case ALGORITHM_LEFT_SYMMETRIC_6: 1440 pd_idx = data_disks - stripe % (raid_disks-1); 1441 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1442 qd_idx = raid_disks - 1; 1443 break; 1444 1445 case ALGORITHM_RIGHT_SYMMETRIC_6: 1446 pd_idx = stripe % (raid_disks-1); 1447 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1448 qd_idx = raid_disks - 1; 1449 break; 1450 1451 case ALGORITHM_PARITY_0_6: 1452 pd_idx = 0; 1453 (*dd_idx)++; 1454 qd_idx = raid_disks - 1; 1455 break; 1456 1457 1458 default: 1459 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1460 conf->algorithm); 1461 BUG(); 1462 } 1463 break; 1464 } 1465 1466 if (sh) { 1467 sh->pd_idx = pd_idx; 1468 sh->qd_idx = qd_idx; 1469 sh->ddf_layout = ddf_layout; 1470 } 1471 /* 1472 * Finally, compute the new sector number 1473 */ 1474 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 1475 return new_sector; 1476} 1477 1478 1479static sector_t compute_blocknr(struct stripe_head *sh, int i) 1480{ 1481 raid5_conf_t *conf = sh->raid_conf; 1482 int raid_disks = sh->disks; 1483 int data_disks = raid_disks - conf->max_degraded; 1484 sector_t new_sector = sh->sector, check; 1485 int sectors_per_chunk = conf->chunk_size >> 9; 1486 sector_t stripe; 1487 int chunk_offset; 1488 int chunk_number, dummy1, dd_idx = i; 1489 sector_t r_sector; 1490 struct stripe_head sh2; 1491 1492 1493 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1494 stripe = new_sector; 1495 BUG_ON(new_sector != stripe); 1496 1497 if (i == sh->pd_idx) 1498 return 0; 1499 switch(conf->level) { 1500 case 4: break; 1501 case 5: 1502 switch (conf->algorithm) { 1503 case ALGORITHM_LEFT_ASYMMETRIC: 1504 case ALGORITHM_RIGHT_ASYMMETRIC: 1505 if (i > sh->pd_idx) 1506 i--; 1507 break; 1508 case ALGORITHM_LEFT_SYMMETRIC: 1509 case ALGORITHM_RIGHT_SYMMETRIC: 1510 if (i < sh->pd_idx) 1511 i += raid_disks; 
1512 i -= (sh->pd_idx + 1); 1513 break; 1514 case ALGORITHM_PARITY_0: 1515 i -= 1; 1516 break; 1517 case ALGORITHM_PARITY_N: 1518 break; 1519 default: 1520 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1521 conf->algorithm); 1522 BUG(); 1523 } 1524 break; 1525 case 6: 1526 if (i == sh->qd_idx) 1527 return 0; /* It is the Q disk */ 1528 switch (conf->algorithm) { 1529 case ALGORITHM_LEFT_ASYMMETRIC: 1530 case ALGORITHM_RIGHT_ASYMMETRIC: 1531 case ALGORITHM_ROTATING_ZERO_RESTART: 1532 case ALGORITHM_ROTATING_N_RESTART: 1533 if (sh->pd_idx == raid_disks-1) 1534 i--; /* Q D D D P */ 1535 else if (i > sh->pd_idx) 1536 i -= 2; /* D D P Q D */ 1537 break; 1538 case ALGORITHM_LEFT_SYMMETRIC: 1539 case ALGORITHM_RIGHT_SYMMETRIC: 1540 if (sh->pd_idx == raid_disks-1) 1541 i--; /* Q D D D P */ 1542 else { 1543 /* D D P Q D */ 1544 if (i < sh->pd_idx) 1545 i += raid_disks; 1546 i -= (sh->pd_idx + 2); 1547 } 1548 break; 1549 case ALGORITHM_PARITY_0: 1550 i -= 2; 1551 break; 1552 case ALGORITHM_PARITY_N: 1553 break; 1554 case ALGORITHM_ROTATING_N_CONTINUE: 1555 if (sh->pd_idx == 0) 1556 i--; /* P D D D Q */ 1557 else if (i > sh->pd_idx) 1558 i -= 2; /* D D Q P D */ 1559 break; 1560 case ALGORITHM_LEFT_ASYMMETRIC_6: 1561 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1562 if (i > sh->pd_idx) 1563 i--; 1564 break; 1565 case ALGORITHM_LEFT_SYMMETRIC_6: 1566 case ALGORITHM_RIGHT_SYMMETRIC_6: 1567 if (i < sh->pd_idx) 1568 i += data_disks + 1; 1569 i -= (sh->pd_idx + 1); 1570 break; 1571 case ALGORITHM_PARITY_0_6: 1572 i -= 1; 1573 break; 1574 default: 1575 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1576 conf->algorithm); 1577 BUG(); 1578 } 1579 break; 1580 } 1581 1582 chunk_number = stripe * data_disks + i; 1583 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1584 1585 check = raid5_compute_sector(conf, r_sector, 1586 (raid_disks != conf->raid_disks), 1587 &dummy1, &sh2); 1588 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1589 || sh2.qd_idx != sh->qd_idx) { 1590 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1591 return 0; 1592 } 1593 return r_sector; 1594} 1595 1596 1597 1598/* 1599 * Copy data between a page in the stripe cache, and one or more bion 1600 * The page could align with the middle of the bio, or there could be 1601 * several bion, each with several bio_vecs, which cover part of the page 1602 * Multiple bion are linked together on bi_next. There may be extras 1603 * at the end of this list. We ignore them. 
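 * For example, a bio beginning two sectors before this page gives an
 * initial page_offset of -1024, so the leading 1024 bytes of the bio are
 * skipped (b_offset) rather than copied, and copying stops once the end of
 * the STRIPE_SIZE page is reached.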
1604 */ 1605static void copy_data(int frombio, struct bio *bio, 1606 struct page *page, 1607 sector_t sector) 1608{ 1609 char *pa = page_address(page); 1610 struct bio_vec *bvl; 1611 int i; 1612 int page_offset; 1613 1614 if (bio->bi_sector >= sector) 1615 page_offset = (signed)(bio->bi_sector - sector) * 512; 1616 else 1617 page_offset = (signed)(sector - bio->bi_sector) * -512; 1618 bio_for_each_segment(bvl, bio, i) { 1619 int len = bio_iovec_idx(bio,i)->bv_len; 1620 int clen; 1621 int b_offset = 0; 1622 1623 if (page_offset < 0) { 1624 b_offset = -page_offset; 1625 page_offset += b_offset; 1626 len -= b_offset; 1627 } 1628 1629 if (len > 0 && page_offset + len > STRIPE_SIZE) 1630 clen = STRIPE_SIZE - page_offset; 1631 else clen = len; 1632 1633 if (clen > 0) { 1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 1635 if (frombio) 1636 memcpy(pa+page_offset, ba+b_offset, clen); 1637 else 1638 memcpy(ba+b_offset, pa+page_offset, clen); 1639 __bio_kunmap_atomic(ba, KM_USER0); 1640 } 1641 if (clen < len) /* hit end of page */ 1642 break; 1643 page_offset += len; 1644 } 1645} 1646 1647#define check_xor() do { \ 1648 if (count == MAX_XOR_BLOCKS) { \ 1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\ 1650 count = 0; \ 1651 } \ 1652 } while(0) 1653 1654static void compute_parity6(struct stripe_head *sh, int method) 1655{ 1656 raid5_conf_t *conf = sh->raid_conf; 1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; 1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1659 struct bio *chosen; 1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1661 void *ptrs[syndrome_disks+2]; 1662 1663 pd_idx = sh->pd_idx; 1664 qd_idx = sh->qd_idx; 1665 d0_idx = raid6_d0(sh); 1666 1667 pr_debug("compute_parity, stripe %llu, method %d\n", 1668 (unsigned long long)sh->sector, method); 1669 1670 switch(method) { 1671 case READ_MODIFY_WRITE: 1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ 1673 case RECONSTRUCT_WRITE: 1674 for (i= disks; i-- ;) 1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { 1676 chosen = sh->dev[i].towrite; 1677 sh->dev[i].towrite = NULL; 1678 1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1680 wake_up(&conf->wait_for_overlap); 1681 1682 BUG_ON(sh->dev[i].written); 1683 sh->dev[i].written = chosen; 1684 } 1685 break; 1686 case CHECK_PARITY: 1687 BUG(); /* Not implemented yet */ 1688 } 1689 1690 for (i = disks; i--;) 1691 if (sh->dev[i].written) { 1692 sector_t sector = sh->dev[i].sector; 1693 struct bio *wbi = sh->dev[i].written; 1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { 1695 copy_data(1, wbi, sh->dev[i].page, sector); 1696 wbi = r5_next_bio(wbi, sector); 1697 } 1698 1699 set_bit(R5_LOCKED, &sh->dev[i].flags); 1700 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1701 } 1702 1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ 1704 1705 for (i = 0; i < disks; i++) 1706 ptrs[i] = (void *)raid6_empty_zero_page; 1707 1708 count = 0; 1709 i = d0_idx; 1710 do { 1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1712 1713 ptrs[slot] = page_address(sh->dev[i].page); 1714 if (slot < syndrome_disks && 1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { 1716 printk(KERN_ERR "block %d/%d not uptodate " 1717 "on parity calc\n", i, count); 1718 BUG(); 1719 } 1720 1721 i = raid6_next_disk(i, disks); 1722 } while (i != d0_idx); 1723 BUG_ON(count != syndrome_disks); 1724 1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); 1726 1727 switch(method) { 1728 case RECONSTRUCT_WRITE: 
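		/* parity was just regenerated from every data block, so mark
		 * P and Q up to date and lock them for the write-out below
		 */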
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); 1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); 1733 break; 1734 case UPDATE_PARITY: 1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); 1737 break; 1738 } 1739} 1740 1741 1742/* Compute one missing block */ 1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1744{ 1745 int i, count, disks = sh->disks; 1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p; 1747 int qd_idx = sh->qd_idx; 1748 1749 pr_debug("compute_block_1, stripe %llu, idx %d\n", 1750 (unsigned long long)sh->sector, dd_idx); 1751 1752 if ( dd_idx == qd_idx ) { 1753 /* We're actually computing the Q drive */ 1754 compute_parity6(sh, UPDATE_PARITY); 1755 } else { 1756 dest = page_address(sh->dev[dd_idx].page); 1757 if (!nozero) memset(dest, 0, STRIPE_SIZE); 1758 count = 0; 1759 for (i = disks ; i--; ) { 1760 if (i == dd_idx || i == qd_idx) 1761 continue; 1762 p = page_address(sh->dev[i].page); 1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1764 ptr[count++] = p; 1765 else 1766 printk("compute_block() %d, stripe %llu, %d" 1767 " not present\n", dd_idx, 1768 (unsigned long long)sh->sector, i); 1769 1770 check_xor(); 1771 } 1772 if (count) 1773 xor_blocks(count, STRIPE_SIZE, dest, ptr); 1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1776 } 1777} 1778 1779/* Compute two missing blocks */ 1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1781{ 1782 int i, count, disks = sh->disks; 1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1784 int d0_idx = raid6_d0(sh); 1785 int faila = -1, failb = -1; 1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1787 void *ptrs[syndrome_disks+2]; 1788 1789 for (i = 0; i < disks ; i++) 1790 ptrs[i] = (void *)raid6_empty_zero_page; 1791 count = 0; 1792 i = d0_idx; 1793 do { 1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1795 1796 ptrs[slot] = page_address(sh->dev[i].page); 1797 1798 if (i == dd_idx1) 1799 faila = slot; 1800 if (i == dd_idx2) 1801 failb = slot; 1802 i = raid6_next_disk(i, disks); 1803 } while (i != d0_idx); 1804 BUG_ON(count != syndrome_disks); 1805 1806 BUG_ON(faila == failb); 1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1808 1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2, 1811 faila, failb); 1812 1813 if (failb == syndrome_disks+1) { 1814 /* Q disk is one of the missing disks */ 1815 if (faila == syndrome_disks) { 1816 /* Missing P+Q, just recompute */ 1817 compute_parity6(sh, UPDATE_PARITY); 1818 return; 1819 } else { 1820 /* We're missing D+Q; recompute D from P */ 1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? 1822 dd_idx2 : dd_idx1), 1823 0); 1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ 1825 return; 1826 } 1827 } 1828 1829 /* We're missing D+P or D+D; */ 1830 if (failb == syndrome_disks) { 1831 /* We're missing D+P. */ 1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); 1833 } else { 1834 /* We're missing D+D. 
*/ 1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, 1836 ptrs); 1837 } 1838 1839 /* Both the above update both missing blocks */ 1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); 1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); 1842} 1843 1844static void 1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1846 int rcw, int expand) 1847{ 1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1849 1850 if (rcw) { 1851 /* if we are not expanding this is a proper write request, and 1852 * there will be bios with new data to be drained into the 1853 * stripe cache 1854 */ 1855 if (!expand) { 1856 sh->reconstruct_state = reconstruct_state_drain_run; 1857 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1858 } else 1859 sh->reconstruct_state = reconstruct_state_run; 1860 1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1862 1863 for (i = disks; i--; ) { 1864 struct r5dev *dev = &sh->dev[i]; 1865 1866 if (dev->towrite) { 1867 set_bit(R5_LOCKED, &dev->flags); 1868 set_bit(R5_Wantdrain, &dev->flags); 1869 if (!expand) 1870 clear_bit(R5_UPTODATE, &dev->flags); 1871 s->locked++; 1872 } 1873 } 1874 if (s->locked + 1 == disks) 1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1877 } else { 1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1880 1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1885 1886 for (i = disks; i--; ) { 1887 struct r5dev *dev = &sh->dev[i]; 1888 if (i == pd_idx) 1889 continue; 1890 1891 if (dev->towrite && 1892 (test_bit(R5_UPTODATE, &dev->flags) || 1893 test_bit(R5_Wantcompute, &dev->flags))) { 1894 set_bit(R5_Wantdrain, &dev->flags); 1895 set_bit(R5_LOCKED, &dev->flags); 1896 clear_bit(R5_UPTODATE, &dev->flags); 1897 s->locked++; 1898 } 1899 } 1900 } 1901 1902 /* keep the parity disk locked while asynchronous operations 1903 * are in flight 1904 */ 1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1907 s->locked++; 1908 1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 1910 __func__, (unsigned long long)sh->sector, 1911 s->locked, s->ops_request); 1912} 1913 1914/* 1915 * Each stripe/dev can have one or more bion attached. 1916 * toread/towrite point to the first in a chain. 1917 * The bi_next chain must be in order. 
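 * add_stripe_bio() also bumps bi_phys_segments, which raid5 re-uses as a
 * count of the stripe_heads still referencing the bio (see
 * raid5_bi_phys_segments() above); the bio is only completed once that
 * count drops to zero.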
1918 */ 1919static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1920{ 1921 struct bio **bip; 1922 raid5_conf_t *conf = sh->raid_conf; 1923 int firstwrite=0; 1924 1925 pr_debug("adding bh b#%llu to stripe s#%llu\n", 1926 (unsigned long long)bi->bi_sector, 1927 (unsigned long long)sh->sector); 1928 1929 1930 spin_lock(&sh->lock); 1931 spin_lock_irq(&conf->device_lock); 1932 if (forwrite) { 1933 bip = &sh->dev[dd_idx].towrite; 1934 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 1935 firstwrite = 1; 1936 } else 1937 bip = &sh->dev[dd_idx].toread; 1938 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 1939 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 1940 goto overlap; 1941 bip = & (*bip)->bi_next; 1942 } 1943 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 1944 goto overlap; 1945 1946 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 1947 if (*bip) 1948 bi->bi_next = *bip; 1949 *bip = bi; 1950 bi->bi_phys_segments++; 1951 spin_unlock_irq(&conf->device_lock); 1952 spin_unlock(&sh->lock); 1953 1954 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 1955 (unsigned long long)bi->bi_sector, 1956 (unsigned long long)sh->sector, dd_idx); 1957 1958 if (conf->mddev->bitmap && firstwrite) { 1959 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 1960 STRIPE_SECTORS, 0); 1961 sh->bm_seq = conf->seq_flush+1; 1962 set_bit(STRIPE_BIT_DELAY, &sh->state); 1963 } 1964 1965 if (forwrite) { 1966 /* check if page is covered */ 1967 sector_t sector = sh->dev[dd_idx].sector; 1968 for (bi=sh->dev[dd_idx].towrite; 1969 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 1970 bi && bi->bi_sector <= sector; 1971 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 1972 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 1973 sector = bi->bi_sector + (bi->bi_size>>9); 1974 } 1975 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 1976 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 1977 } 1978 return 1; 1979 1980 overlap: 1981 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 1982 spin_unlock_irq(&conf->device_lock); 1983 spin_unlock(&sh->lock); 1984 return 0; 1985} 1986 1987static void end_reshape(raid5_conf_t *conf); 1988 1989static int page_is_zero(struct page *p) 1990{ 1991 char *a = page_address(p); 1992 return ((*(u32*)a) == 0 && 1993 memcmp(a, a+4, STRIPE_SIZE-4)==0); 1994} 1995 1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 1997 struct stripe_head *sh) 1998{ 1999 int sectors_per_chunk = conf->chunk_size >> 9; 2000 int dd_idx; 2001 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2002 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2003 2004 raid5_compute_sector(conf, 2005 stripe * (disks - conf->max_degraded) 2006 *sectors_per_chunk + chunk_offset, 2007 previous, 2008 &dd_idx, sh); 2009} 2010 2011static void 2012handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 2013 struct stripe_head_state *s, int disks, 2014 struct bio **return_bi) 2015{ 2016 int i; 2017 for (i = disks; i--; ) { 2018 struct bio *bi; 2019 int bitmap_end = 0; 2020 2021 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2022 mdk_rdev_t *rdev; 2023 rcu_read_lock(); 2024 rdev = rcu_dereference(conf->disks[i].rdev); 2025 if (rdev && test_bit(In_sync, &rdev->flags)) 2026 /* multiple read failures in one stripe */ 2027 md_error(conf->mddev, rdev); 2028 rcu_read_unlock(); 2029 } 2030 spin_lock_irq(&conf->device_lock); 2031 /* fail all writes first */ 2032 bi = sh->dev[i].towrite; 2033 sh->dev[i].towrite = NULL; 2034 if (bi) { 2035 s->to_write--; 2036 bitmap_end = 1; 2037 } 2038 2039 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2040 wake_up(&conf->wait_for_overlap); 2041 2042 while (bi && bi->bi_sector < 2043 sh->dev[i].sector + STRIPE_SECTORS) { 2044 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2045 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2046 if (!raid5_dec_bi_phys_segments(bi)) { 2047 md_write_end(conf->mddev); 2048 bi->bi_next = *return_bi; 2049 *return_bi = bi; 2050 } 2051 bi = nextbi; 2052 } 2053 /* and fail all 'written' */ 2054 bi = sh->dev[i].written; 2055 sh->dev[i].written = NULL; 2056 if (bi) bitmap_end = 1; 2057 while (bi && bi->bi_sector < 2058 sh->dev[i].sector + STRIPE_SECTORS) { 2059 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2060 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2061 if (!raid5_dec_bi_phys_segments(bi)) { 2062 md_write_end(conf->mddev); 2063 bi->bi_next = *return_bi; 2064 *return_bi = bi; 2065 } 2066 bi = bi2; 2067 } 2068 2069 /* fail any reads if this device is non-operational and 2070 * the data has not reached the cache yet. 2071 */ 2072 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2073 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2074 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2075 bi = sh->dev[i].toread; 2076 sh->dev[i].toread = NULL; 2077 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2078 wake_up(&conf->wait_for_overlap); 2079 if (bi) s->to_read--; 2080 while (bi && bi->bi_sector < 2081 sh->dev[i].sector + STRIPE_SECTORS) { 2082 struct bio *nextbi = 2083 r5_next_bio(bi, sh->dev[i].sector); 2084 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2085 if (!raid5_dec_bi_phys_segments(bi)) { 2086 bi->bi_next = *return_bi; 2087 *return_bi = bi; 2088 } 2089 bi = nextbi; 2090 } 2091 } 2092 spin_unlock_irq(&conf->device_lock); 2093 if (bitmap_end) 2094 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2095 STRIPE_SECTORS, 0, 0); 2096 } 2097 2098 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2099 if (atomic_dec_and_test(&conf->pending_full_writes)) 2100 md_wakeup_thread(conf->mddev->thread); 2101} 2102 2103/* fetch_block5 - checks the given member device to see if its data needs 2104 * to be read or computed to satisfy a request. 
2105 * 2106 * Returns 1 when no more member devices need to be checked, otherwise returns 2107 * 0 to tell the loop in handle_stripe_fill5 to continue 2108 */ 2109static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 2110 int disk_idx, int disks) 2111{ 2112 struct r5dev *dev = &sh->dev[disk_idx]; 2113 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 2114 2115 /* is the data in this block needed, and can we get it? */ 2116 if (!test_bit(R5_LOCKED, &dev->flags) && 2117 !test_bit(R5_UPTODATE, &dev->flags) && 2118 (dev->toread || 2119 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2120 s->syncing || s->expanding || 2121 (s->failed && 2122 (failed_dev->toread || 2123 (failed_dev->towrite && 2124 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 2125 /* We would like to get this block, possibly by computing it, 2126 * otherwise read it if the backing disk is insync 2127 */ 2128 if ((s->uptodate == disks - 1) && 2129 (s->failed && disk_idx == s->failed_num)) { 2130 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2131 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2132 set_bit(R5_Wantcompute, &dev->flags); 2133 sh->ops.target = disk_idx; 2134 s->req_compute = 1; 2135 /* Careful: from this point on 'uptodate' is in the eye 2136 * of raid5_run_ops which services 'compute' operations 2137 * before writes. R5_Wantcompute flags a block that will 2138 * be R5_UPTODATE by the time it is needed for a 2139 * subsequent operation. 2140 */ 2141 s->uptodate++; 2142 return 1; /* uptodate + compute == disks */ 2143 } else if (test_bit(R5_Insync, &dev->flags)) { 2144 set_bit(R5_LOCKED, &dev->flags); 2145 set_bit(R5_Wantread, &dev->flags); 2146 s->locked++; 2147 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 2148 s->syncing); 2149 } 2150 } 2151 2152 return 0; 2153} 2154 2155/** 2156 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 
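 *
 * Nothing is scheduled while a compute, check or reconstruct operation is
 * still running on the stripe; STRIPE_HANDLE is always set so the stripe
 * is revisited once those operations complete.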
2157 */ 2158static void handle_stripe_fill5(struct stripe_head *sh, 2159 struct stripe_head_state *s, int disks) 2160{ 2161 int i; 2162 2163 /* look for blocks to read/compute, skip this if a compute 2164 * is already in flight, or if the stripe contents are in the 2165 * midst of changing due to a write 2166 */ 2167 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2168 !sh->reconstruct_state) 2169 for (i = disks; i--; ) 2170 if (fetch_block5(sh, s, i, disks)) 2171 break; 2172 set_bit(STRIPE_HANDLE, &sh->state); 2173} 2174 2175static void handle_stripe_fill6(struct stripe_head *sh, 2176 struct stripe_head_state *s, struct r6_state *r6s, 2177 int disks) 2178{ 2179 int i; 2180 for (i = disks; i--; ) { 2181 struct r5dev *dev = &sh->dev[i]; 2182 if (!test_bit(R5_LOCKED, &dev->flags) && 2183 !test_bit(R5_UPTODATE, &dev->flags) && 2184 (dev->toread || (dev->towrite && 2185 !test_bit(R5_OVERWRITE, &dev->flags)) || 2186 s->syncing || s->expanding || 2187 (s->failed >= 1 && 2188 (sh->dev[r6s->failed_num[0]].toread || 2189 s->to_write)) || 2190 (s->failed >= 2 && 2191 (sh->dev[r6s->failed_num[1]].toread || 2192 s->to_write)))) { 2193 /* we would like to get this block, possibly 2194 * by computing it, but we might not be able to 2195 */ 2196 if ((s->uptodate == disks - 1) && 2197 (s->failed && (i == r6s->failed_num[0] || 2198 i == r6s->failed_num[1]))) { 2199 pr_debug("Computing stripe %llu block %d\n", 2200 (unsigned long long)sh->sector, i); 2201 compute_block_1(sh, i, 0); 2202 s->uptodate++; 2203 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2204 /* Computing 2-failure is *very* expensive; only 2205 * do it if failed >= 2 2206 */ 2207 int other; 2208 for (other = disks; other--; ) { 2209 if (other == i) 2210 continue; 2211 if (!test_bit(R5_UPTODATE, 2212 &sh->dev[other].flags)) 2213 break; 2214 } 2215 BUG_ON(other < 0); 2216 pr_debug("Computing stripe %llu blocks %d,%d\n", 2217 (unsigned long long)sh->sector, 2218 i, other); 2219 compute_block_2(sh, i, other); 2220 s->uptodate += 2; 2221 } else if (test_bit(R5_Insync, &dev->flags)) { 2222 set_bit(R5_LOCKED, &dev->flags); 2223 set_bit(R5_Wantread, &dev->flags); 2224 s->locked++; 2225 pr_debug("Reading block %d (sync=%d)\n", 2226 i, s->syncing); 2227 } 2228 } 2229 } 2230 set_bit(STRIPE_HANDLE, &sh->state); 2231} 2232 2233 2234/* handle_stripe_clean_event 2235 * any written block on an uptodate or failed drive can be returned. 2236 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2237 * never LOCKED, so we don't need to test 'failed' directly. 
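 *
 * Completed write bios are moved onto the return_bi list for the caller
 * to end, and bitmap_endwrite() is called once the device has no further
 * writes pending (written drained and towrite empty).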
2238 */ 2239static void handle_stripe_clean_event(raid5_conf_t *conf, 2240 struct stripe_head *sh, int disks, struct bio **return_bi) 2241{ 2242 int i; 2243 struct r5dev *dev; 2244 2245 for (i = disks; i--; ) 2246 if (sh->dev[i].written) { 2247 dev = &sh->dev[i]; 2248 if (!test_bit(R5_LOCKED, &dev->flags) && 2249 test_bit(R5_UPTODATE, &dev->flags)) { 2250 /* We can return any write requests */ 2251 struct bio *wbi, *wbi2; 2252 int bitmap_end = 0; 2253 pr_debug("Return write for disc %d\n", i); 2254 spin_lock_irq(&conf->device_lock); 2255 wbi = dev->written; 2256 dev->written = NULL; 2257 while (wbi && wbi->bi_sector < 2258 dev->sector + STRIPE_SECTORS) { 2259 wbi2 = r5_next_bio(wbi, dev->sector); 2260 if (!raid5_dec_bi_phys_segments(wbi)) { 2261 md_write_end(conf->mddev); 2262 wbi->bi_next = *return_bi; 2263 *return_bi = wbi; 2264 } 2265 wbi = wbi2; 2266 } 2267 if (dev->towrite == NULL) 2268 bitmap_end = 1; 2269 spin_unlock_irq(&conf->device_lock); 2270 if (bitmap_end) 2271 bitmap_endwrite(conf->mddev->bitmap, 2272 sh->sector, 2273 STRIPE_SECTORS, 2274 !test_bit(STRIPE_DEGRADED, &sh->state), 2275 0); 2276 } 2277 } 2278 2279 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2280 if (atomic_dec_and_test(&conf->pending_full_writes)) 2281 md_wakeup_thread(conf->mddev->thread); 2282} 2283 2284static void handle_stripe_dirtying5(raid5_conf_t *conf, 2285 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2286{ 2287 int rmw = 0, rcw = 0, i; 2288 for (i = disks; i--; ) { 2289 /* would I have to read this buffer for read_modify_write */ 2290 struct r5dev *dev = &sh->dev[i]; 2291 if ((dev->towrite || i == sh->pd_idx) && 2292 !test_bit(R5_LOCKED, &dev->flags) && 2293 !(test_bit(R5_UPTODATE, &dev->flags) || 2294 test_bit(R5_Wantcompute, &dev->flags))) { 2295 if (test_bit(R5_Insync, &dev->flags)) 2296 rmw++; 2297 else 2298 rmw += 2*disks; /* cannot read it */ 2299 } 2300 /* Would I have to read this buffer for reconstruct_write */ 2301 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2302 !test_bit(R5_LOCKED, &dev->flags) && 2303 !(test_bit(R5_UPTODATE, &dev->flags) || 2304 test_bit(R5_Wantcompute, &dev->flags))) { 2305 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2306 else 2307 rcw += 2*disks; 2308 } 2309 } 2310 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2311 (unsigned long long)sh->sector, rmw, rcw); 2312 set_bit(STRIPE_HANDLE, &sh->state); 2313 if (rmw < rcw && rmw > 0) 2314 /* prefer read-modify-write, but need to get some data */ 2315 for (i = disks; i--; ) { 2316 struct r5dev *dev = &sh->dev[i]; 2317 if ((dev->towrite || i == sh->pd_idx) && 2318 !test_bit(R5_LOCKED, &dev->flags) && 2319 !(test_bit(R5_UPTODATE, &dev->flags) || 2320 test_bit(R5_Wantcompute, &dev->flags)) && 2321 test_bit(R5_Insync, &dev->flags)) { 2322 if ( 2323 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2324 pr_debug("Read_old block " 2325 "%d for r-m-w\n", i); 2326 set_bit(R5_LOCKED, &dev->flags); 2327 set_bit(R5_Wantread, &dev->flags); 2328 s->locked++; 2329 } else { 2330 set_bit(STRIPE_DELAYED, &sh->state); 2331 set_bit(STRIPE_HANDLE, &sh->state); 2332 } 2333 } 2334 } 2335 if (rcw <= rmw && rcw > 0) 2336 /* want reconstruct write, but need to get some data */ 2337 for (i = disks; i--; ) { 2338 struct r5dev *dev = &sh->dev[i]; 2339 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2340 i != sh->pd_idx && 2341 !test_bit(R5_LOCKED, &dev->flags) && 2342 !(test_bit(R5_UPTODATE, &dev->flags) || 2343 test_bit(R5_Wantcompute, &dev->flags)) && 2344 test_bit(R5_Insync, &dev->flags)) { 2345 if ( 2346 
test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2347 pr_debug("Read_old block " 2348 "%d for Reconstruct\n", i); 2349 set_bit(R5_LOCKED, &dev->flags); 2350 set_bit(R5_Wantread, &dev->flags); 2351 s->locked++; 2352 } else { 2353 set_bit(STRIPE_DELAYED, &sh->state); 2354 set_bit(STRIPE_HANDLE, &sh->state); 2355 } 2356 } 2357 } 2358 /* now if nothing is locked, and if we have enough data, 2359 * we can start a write request 2360 */ 2361 /* since handle_stripe can be called at any time we need to handle the 2362 * case where a compute block operation has been submitted and then a 2363 * subsequent call wants to start a write request. raid5_run_ops only 2364 * handles the case where compute block and postxor are requested 2365 * simultaneously. If this is not the case then new writes need to be 2366 * held off until the compute completes. 2367 */ 2368 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2369 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2370 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2371 schedule_reconstruction5(sh, s, rcw == 0, 0); 2372} 2373 2374static void handle_stripe_dirtying6(raid5_conf_t *conf, 2375 struct stripe_head *sh, struct stripe_head_state *s, 2376 struct r6_state *r6s, int disks) 2377{ 2378 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2379 int qd_idx = r6s->qd_idx; 2380 for (i = disks; i--; ) { 2381 struct r5dev *dev = &sh->dev[i]; 2382 /* Would I have to read this buffer for reconstruct_write */ 2383 if (!test_bit(R5_OVERWRITE, &dev->flags) 2384 && i != pd_idx && i != qd_idx 2385 && (!test_bit(R5_LOCKED, &dev->flags) 2386 ) && 2387 !test_bit(R5_UPTODATE, &dev->flags)) { 2388 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2389 else { 2390 pr_debug("raid6: must_compute: " 2391 "disk %d flags=%#lx\n", i, dev->flags); 2392 must_compute++; 2393 } 2394 } 2395 } 2396 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", 2397 (unsigned long long)sh->sector, rcw, must_compute); 2398 set_bit(STRIPE_HANDLE, &sh->state); 2399 2400 if (rcw > 0) 2401 /* want reconstruct write, but need to get some data */ 2402 for (i = disks; i--; ) { 2403 struct r5dev *dev = &sh->dev[i]; 2404 if (!test_bit(R5_OVERWRITE, &dev->flags) 2405 && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) 2406 && !test_bit(R5_LOCKED, &dev->flags) && 2407 !test_bit(R5_UPTODATE, &dev->flags) && 2408 test_bit(R5_Insync, &dev->flags)) { 2409 if ( 2410 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2411 pr_debug("Read_old stripe %llu " 2412 "block %d for Reconstruct\n", 2413 (unsigned long long)sh->sector, i); 2414 set_bit(R5_LOCKED, &dev->flags); 2415 set_bit(R5_Wantread, &dev->flags); 2416 s->locked++; 2417 } else { 2418 pr_debug("Request delayed stripe %llu " 2419 "block %d for Reconstruct\n", 2420 (unsigned long long)sh->sector, i); 2421 set_bit(STRIPE_DELAYED, &sh->state); 2422 set_bit(STRIPE_HANDLE, &sh->state); 2423 } 2424 } 2425 } 2426 /* now if nothing is locked, and if we have enough data, we can start a 2427 * write request 2428 */ 2429 if (s->locked == 0 && rcw == 0 && 2430 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2431 if (must_compute > 0) { 2432 /* We have failed blocks and need to compute them */ 2433 switch (s->failed) { 2434 case 0: 2435 BUG(); 2436 case 1: 2437 compute_block_1(sh, r6s->failed_num[0], 0); 2438 break; 2439 case 2: 2440 compute_block_2(sh, r6s->failed_num[0], 2441 r6s->failed_num[1]); 2442 break; 2443 default: /* This request should have been failed? 
*/ 2444 BUG(); 2445 } 2446 } 2447 2448 pr_debug("Computing parity for stripe %llu\n", 2449 (unsigned long long)sh->sector); 2450 compute_parity6(sh, RECONSTRUCT_WRITE); 2451 /* now every locked buffer is ready to be written */ 2452 for (i = disks; i--; ) 2453 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 2454 pr_debug("Writing stripe %llu block %d\n", 2455 (unsigned long long)sh->sector, i); 2456 s->locked++; 2457 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2458 } 2459 if (s->locked == disks) 2460 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2461 atomic_inc(&conf->pending_full_writes); 2462 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ 2463 set_bit(STRIPE_INSYNC, &sh->state); 2464 2465 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2466 atomic_dec(&conf->preread_active_stripes); 2467 if (atomic_read(&conf->preread_active_stripes) < 2468 IO_THRESHOLD) 2469 md_wakeup_thread(conf->mddev->thread); 2470 } 2471 } 2472} 2473 2474static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2475 struct stripe_head_state *s, int disks) 2476{ 2477 struct r5dev *dev = NULL; 2478 2479 set_bit(STRIPE_HANDLE, &sh->state); 2480 2481 switch (sh->check_state) { 2482 case check_state_idle: 2483 /* start a new check operation if there are no failures */ 2484 if (s->failed == 0) { 2485 BUG_ON(s->uptodate != disks); 2486 sh->check_state = check_state_run; 2487 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2488 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2489 s->uptodate--; 2490 break; 2491 } 2492 dev = &sh->dev[s->failed_num]; 2493 /* fall through */ 2494 case check_state_compute_result: 2495 sh->check_state = check_state_idle; 2496 if (!dev) 2497 dev = &sh->dev[sh->pd_idx]; 2498 2499 /* check that a write has not made the stripe insync */ 2500 if (test_bit(STRIPE_INSYNC, &sh->state)) 2501 break; 2502 2503 /* either failed parity check, or recovery is happening */ 2504 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2505 BUG_ON(s->uptodate != disks); 2506 2507 set_bit(R5_LOCKED, &dev->flags); 2508 s->locked++; 2509 set_bit(R5_Wantwrite, &dev->flags); 2510 2511 clear_bit(STRIPE_DEGRADED, &sh->state); 2512 set_bit(STRIPE_INSYNC, &sh->state); 2513 break; 2514 case check_state_run: 2515 break; /* we will be called again upon completion */ 2516 case check_state_check_result: 2517 sh->check_state = check_state_idle; 2518 2519 /* if a failure occurred during the check operation, leave 2520 * STRIPE_INSYNC not set and let the stripe be handled again 2521 */ 2522 if (s->failed) 2523 break; 2524 2525 /* handle a successful check operation, if parity is correct 2526 * we are done. Otherwise update the mismatch count and repair 2527 * parity if !MD_RECOVERY_CHECK 2528 */ 2529 if (sh->ops.zero_sum_result == 0) 2530 /* parity is correct (on disc, 2531 * not in buffer any more) 2532 */ 2533 set_bit(STRIPE_INSYNC, &sh->state); 2534 else { 2535 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2536 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2537 /* don't try to repair!! 
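 * (MD_RECOVERY_CHECK means a repair-free "check" pass was requested,
 * so we only account the mismatch and leave the parity as it is)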
*/ 2538 set_bit(STRIPE_INSYNC, &sh->state); 2539 else { 2540 sh->check_state = check_state_compute_run; 2541 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2542 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2543 set_bit(R5_Wantcompute, 2544 &sh->dev[sh->pd_idx].flags); 2545 sh->ops.target = sh->pd_idx; 2546 s->uptodate++; 2547 } 2548 } 2549 break; 2550 case check_state_compute_run: 2551 break; 2552 default: 2553 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2554 __func__, sh->check_state, 2555 (unsigned long long) sh->sector); 2556 BUG(); 2557 } 2558} 2559 2560 2561static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2562 struct stripe_head_state *s, 2563 struct r6_state *r6s, struct page *tmp_page, 2564 int disks) 2565{ 2566 int update_p = 0, update_q = 0; 2567 struct r5dev *dev; 2568 int pd_idx = sh->pd_idx; 2569 int qd_idx = r6s->qd_idx; 2570 2571 set_bit(STRIPE_HANDLE, &sh->state); 2572 2573 BUG_ON(s->failed > 2); 2574 BUG_ON(s->uptodate < disks); 2575 /* Want to check and possibly repair P and Q. 2576 * However there could be one 'failed' device, in which 2577 * case we can only check one of them, possibly using the 2578 * other to generate missing data 2579 */ 2580 2581 /* If !tmp_page, we cannot do the calculations, 2582 * but as we have set STRIPE_HANDLE, we will soon be called 2583 * by stripe_handle with a tmp_page - just wait until then. 2584 */ 2585 if (tmp_page) { 2586 if (s->failed == r6s->q_failed) { 2587 /* The only possible failed device holds 'Q', so it 2588 * makes sense to check P (If anything else were failed, 2589 * we would have used P to recreate it). 2590 */ 2591 compute_block_1(sh, pd_idx, 1); 2592 if (!page_is_zero(sh->dev[pd_idx].page)) { 2593 compute_block_1(sh, pd_idx, 0); 2594 update_p = 1; 2595 } 2596 } 2597 if (!r6s->q_failed && s->failed < 2) { 2598 /* q is not failed, and we didn't use it to generate 2599 * anything, so it makes sense to check it 2600 */ 2601 memcpy(page_address(tmp_page), 2602 page_address(sh->dev[qd_idx].page), 2603 STRIPE_SIZE); 2604 compute_parity6(sh, UPDATE_PARITY); 2605 if (memcmp(page_address(tmp_page), 2606 page_address(sh->dev[qd_idx].page), 2607 STRIPE_SIZE) != 0) { 2608 clear_bit(STRIPE_INSYNC, &sh->state); 2609 update_q = 1; 2610 } 2611 } 2612 if (update_p || update_q) { 2613 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2614 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2615 /* don't try to repair!! 
*/ 2616 update_p = update_q = 0; 2617 } 2618 2619 /* now write out any block on a failed drive, 2620 * or P or Q if they need it 2621 */ 2622 2623 if (s->failed == 2) { 2624 dev = &sh->dev[r6s->failed_num[1]]; 2625 s->locked++; 2626 set_bit(R5_LOCKED, &dev->flags); 2627 set_bit(R5_Wantwrite, &dev->flags); 2628 } 2629 if (s->failed >= 1) { 2630 dev = &sh->dev[r6s->failed_num[0]]; 2631 s->locked++; 2632 set_bit(R5_LOCKED, &dev->flags); 2633 set_bit(R5_Wantwrite, &dev->flags); 2634 } 2635 2636 if (update_p) { 2637 dev = &sh->dev[pd_idx]; 2638 s->locked++; 2639 set_bit(R5_LOCKED, &dev->flags); 2640 set_bit(R5_Wantwrite, &dev->flags); 2641 } 2642 if (update_q) { 2643 dev = &sh->dev[qd_idx]; 2644 s->locked++; 2645 set_bit(R5_LOCKED, &dev->flags); 2646 set_bit(R5_Wantwrite, &dev->flags); 2647 } 2648 clear_bit(STRIPE_DEGRADED, &sh->state); 2649 2650 set_bit(STRIPE_INSYNC, &sh->state); 2651 } 2652} 2653 2654static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2655 struct r6_state *r6s) 2656{ 2657 int i; 2658 2659 /* We have read all the blocks in this stripe and now we need to 2660 * copy some of them into a target stripe for expand. 2661 */ 2662 struct dma_async_tx_descriptor *tx = NULL; 2663 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2664 for (i = 0; i < sh->disks; i++) 2665 if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { 2666 int dd_idx, j; 2667 struct stripe_head *sh2; 2668 2669 sector_t bn = compute_blocknr(sh, i); 2670 sector_t s = raid5_compute_sector(conf, bn, 0, 2671 &dd_idx, NULL); 2672 sh2 = get_active_stripe(conf, s, 0, 1); 2673 if (sh2 == NULL) 2674 /* so far only the early blocks of this stripe 2675 * have been requested. When later blocks 2676 * get requested, we will try again 2677 */ 2678 continue; 2679 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2680 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2681 /* must have already done this block */ 2682 release_stripe(sh2); 2683 continue; 2684 } 2685 2686 /* place all the copies on one channel */ 2687 tx = async_memcpy(sh2->dev[dd_idx].page, 2688 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2689 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2690 2691 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2692 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2693 for (j = 0; j < conf->raid_disks; j++) 2694 if (j != sh2->pd_idx && 2695 (!r6s || j != sh2->qd_idx) && 2696 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2697 break; 2698 if (j == conf->raid_disks) { 2699 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2700 set_bit(STRIPE_HANDLE, &sh2->state); 2701 } 2702 release_stripe(sh2); 2703 2704 } 2705 /* done submitting copies, wait for them to complete */ 2706 if (tx) { 2707 async_tx_ack(tx); 2708 dma_wait_for_async_tx(tx); 2709 } 2710} 2711 2712 2713/* 2714 * handle_stripe - do things to a stripe. 2715 * 2716 * We lock the stripe and then examine the state of various bits 2717 * to see what needs to be done. 2718 * Possible results: 2719 * return some read request which now have data 2720 * return some write requests which are safely on disc 2721 * schedule a read on some buffers 2722 * schedule a write of some buffers 2723 * return confirmation of parity correctness 2724 * 2725 * buffers are taken off read_list or write_list, and bh_cache buffers 2726 * get BH_Lock set before the stripe lock is released. 
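 *
 * handle_stripe5() and handle_stripe6() below return false only when they
 * had to skip the stripe because a member device was Blocked; they wait
 * for that device (md_wait_for_blocked_rdev) before returning, so the
 * caller can simply call again.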
2727 * 2728 */ 2729 2730static bool handle_stripe5(struct stripe_head *sh) 2731{ 2732 raid5_conf_t *conf = sh->raid_conf; 2733 int disks = sh->disks, i; 2734 struct bio *return_bi = NULL; 2735 struct stripe_head_state s; 2736 struct r5dev *dev; 2737 mdk_rdev_t *blocked_rdev = NULL; 2738 int prexor; 2739 2740 memset(&s, 0, sizeof(s)); 2741 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 2742 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 2743 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 2744 sh->reconstruct_state); 2745 2746 spin_lock(&sh->lock); 2747 clear_bit(STRIPE_HANDLE, &sh->state); 2748 clear_bit(STRIPE_DELAYED, &sh->state); 2749 2750 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2751 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2752 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2753 2754 /* Now to look around and see what can be done */ 2755 rcu_read_lock(); 2756 for (i=disks; i--; ) { 2757 mdk_rdev_t *rdev; 2758 struct r5dev *dev = &sh->dev[i]; 2759 clear_bit(R5_Insync, &dev->flags); 2760 2761 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 2762 "written %p\n", i, dev->flags, dev->toread, dev->read, 2763 dev->towrite, dev->written); 2764 2765 /* maybe we can request a biofill operation 2766 * 2767 * new wantfill requests are only permitted while 2768 * ops_complete_biofill is guaranteed to be inactive 2769 */ 2770 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2771 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 2772 set_bit(R5_Wantfill, &dev->flags); 2773 2774 /* now count some things */ 2775 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 2776 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 2777 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 2778 2779 if (test_bit(R5_Wantfill, &dev->flags)) 2780 s.to_fill++; 2781 else if (dev->toread) 2782 s.to_read++; 2783 if (dev->towrite) { 2784 s.to_write++; 2785 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2786 s.non_overwrite++; 2787 } 2788 if (dev->written) 2789 s.written++; 2790 rdev = rcu_dereference(conf->disks[i].rdev); 2791 if (blocked_rdev == NULL && 2792 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 2793 blocked_rdev = rdev; 2794 atomic_inc(&rdev->nr_pending); 2795 } 2796 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2797 /* The ReadError flag will just be confusing now */ 2798 clear_bit(R5_ReadError, &dev->flags); 2799 clear_bit(R5_ReWrite, &dev->flags); 2800 } 2801 if (!rdev || !test_bit(In_sync, &rdev->flags) 2802 || test_bit(R5_ReadError, &dev->flags)) { 2803 s.failed++; 2804 s.failed_num = i; 2805 } else 2806 set_bit(R5_Insync, &dev->flags); 2807 } 2808 rcu_read_unlock(); 2809 2810 if (unlikely(blocked_rdev)) { 2811 if (s.syncing || s.expanding || s.expanded || 2812 s.to_write || s.written) { 2813 set_bit(STRIPE_HANDLE, &sh->state); 2814 goto unlock; 2815 } 2816 /* There is nothing for the blocked_rdev to block */ 2817 rdev_dec_pending(blocked_rdev, conf->mddev); 2818 blocked_rdev = NULL; 2819 } 2820 2821 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 2822 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 2823 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 2824 } 2825 2826 pr_debug("locked=%d uptodate=%d to_read=%d" 2827 " to_write=%d failed=%d failed_num=%d\n", 2828 s.locked, s.uptodate, s.to_read, s.to_write, 2829 s.failed, s.failed_num); 2830 /* check if the array has lost two devices and, if so, some requests might 2831 * need to be failed 2832 */ 2833 if (s.failed > 1 && s.to_read+s.to_write+s.written) 
2834 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 2835 if (s.failed > 1 && s.syncing) { 2836 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2837 clear_bit(STRIPE_SYNCING, &sh->state); 2838 s.syncing = 0; 2839 } 2840 2841 /* might be able to return some write requests if the parity block 2842 * is safe, or on a failed drive 2843 */ 2844 dev = &sh->dev[sh->pd_idx]; 2845 if ( s.written && 2846 ((test_bit(R5_Insync, &dev->flags) && 2847 !test_bit(R5_LOCKED, &dev->flags) && 2848 test_bit(R5_UPTODATE, &dev->flags)) || 2849 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2850 handle_stripe_clean_event(conf, sh, disks, &return_bi); 2851 2852 /* Now we might consider reading some blocks, either to check/generate 2853 * parity, or to satisfy requests 2854 * or to load a block that is being partially written. 2855 */ 2856 if (s.to_read || s.non_overwrite || 2857 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 2858 handle_stripe_fill5(sh, &s, disks); 2859 2860 /* Now we check to see if any write operations have recently 2861 * completed 2862 */ 2863 prexor = 0; 2864 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 2865 prexor = 1; 2866 if (sh->reconstruct_state == reconstruct_state_drain_result || 2867 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 2868 sh->reconstruct_state = reconstruct_state_idle; 2869 2870 /* All the 'written' buffers and the parity block are ready to 2871 * be written back to disk 2872 */ 2873 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 2874 for (i = disks; i--; ) { 2875 dev = &sh->dev[i]; 2876 if (test_bit(R5_LOCKED, &dev->flags) && 2877 (i == sh->pd_idx || dev->written)) { 2878 pr_debug("Writing block %d\n", i); 2879 set_bit(R5_Wantwrite, &dev->flags); 2880 if (prexor) 2881 continue; 2882 if (!test_bit(R5_Insync, &dev->flags) || 2883 (i == sh->pd_idx && s.failed == 0)) 2884 set_bit(STRIPE_INSYNC, &sh->state); 2885 } 2886 } 2887 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2888 atomic_dec(&conf->preread_active_stripes); 2889 if (atomic_read(&conf->preread_active_stripes) < 2890 IO_THRESHOLD) 2891 md_wakeup_thread(conf->mddev->thread); 2892 } 2893 } 2894 2895 /* Now to consider new write requests and what else, if anything 2896 * should be read. We do not handle new writes when: 2897 * 1/ A 'write' operation (copy+xor) is already in flight. 2898 * 2/ A 'check' operation is in flight, as it may clobber the parity 2899 * block. 2900 */ 2901 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 2902 handle_stripe_dirtying5(conf, sh, &s, disks); 2903 2904 /* maybe we need to check and possibly fix the parity for this stripe 2905 * Any reads will already have been scheduled, so we just see if enough 2906 * data is available. The parity check is held off while parity 2907 * dependent operations are in flight. 
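 *
 * Roughly, the check state machine runs
 *	idle -> run -> check_result
 * and on a mismatch (unless MD_RECOVERY_CHECK, which only counts it)
 *	-> compute_run -> compute_result -> rewrite parity
 * before STRIPE_INSYNC is finally set.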
2908 */ 2909 if (sh->check_state || 2910 (s.syncing && s.locked == 0 && 2911 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 2912 !test_bit(STRIPE_INSYNC, &sh->state))) 2913 handle_parity_checks5(conf, sh, &s, disks); 2914 2915 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2916 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2917 clear_bit(STRIPE_SYNCING, &sh->state); 2918 } 2919 2920 /* If the failed drive is just a ReadError, then we might need to progress 2921 * the repair/check process 2922 */ 2923 if (s.failed == 1 && !conf->mddev->ro && 2924 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 2925 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 2926 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 2927 ) { 2928 dev = &sh->dev[s.failed_num]; 2929 if (!test_bit(R5_ReWrite, &dev->flags)) { 2930 set_bit(R5_Wantwrite, &dev->flags); 2931 set_bit(R5_ReWrite, &dev->flags); 2932 set_bit(R5_LOCKED, &dev->flags); 2933 s.locked++; 2934 } else { 2935 /* let's read it back */ 2936 set_bit(R5_Wantread, &dev->flags); 2937 set_bit(R5_LOCKED, &dev->flags); 2938 s.locked++; 2939 } 2940 } 2941 2942 /* Finish reconstruct operations initiated by the expansion process */ 2943 if (sh->reconstruct_state == reconstruct_state_result) { 2944 sh->reconstruct_state = reconstruct_state_idle; 2945 clear_bit(STRIPE_EXPANDING, &sh->state); 2946 for (i = conf->raid_disks; i--; ) { 2947 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2948 set_bit(R5_LOCKED, &sh->dev[i].flags); 2949 s.locked++; 2950 } 2951 } 2952 2953 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2954 !sh->reconstruct_state) { 2955 /* Need to write out all blocks after computing parity */ 2956 sh->disks = conf->raid_disks; 2957 stripe_set_idx(sh->sector, conf, 0, sh); 2958 schedule_reconstruction5(sh, &s, 1, 1); 2959 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 2960 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2961 atomic_dec(&conf->reshape_stripes); 2962 wake_up(&conf->wait_for_overlap); 2963 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2964 } 2965 2966 if (s.expanding && s.locked == 0 && 2967 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 2968 handle_stripe_expansion(conf, sh, NULL); 2969 2970 unlock: 2971 spin_unlock(&sh->lock); 2972 2973 /* wait for this device to become unblocked */ 2974 if (unlikely(blocked_rdev)) 2975 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2976 2977 if (s.ops_request) 2978 raid5_run_ops(sh, s.ops_request); 2979 2980 ops_run_io(sh, &s); 2981 2982 return_io(return_bi); 2983 2984 return blocked_rdev == NULL; 2985} 2986 2987static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2988{ 2989 raid5_conf_t *conf = sh->raid_conf; 2990 int disks = sh->disks; 2991 struct bio *return_bi = NULL; 2992 int i, pd_idx = sh->pd_idx; 2993 struct stripe_head_state s; 2994 struct r6_state r6s; 2995 struct r5dev *dev, *pdev, *qdev; 2996 mdk_rdev_t *blocked_rdev = NULL; 2997 2998 r6s.qd_idx = sh->qd_idx; 2999 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3000 "pd_idx=%d, qd_idx=%d\n", 3001 (unsigned long long)sh->sector, sh->state, 3002 atomic_read(&sh->count), pd_idx, r6s.qd_idx); 3003 memset(&s, 0, sizeof(s)); 3004 3005 spin_lock(&sh->lock); 3006 clear_bit(STRIPE_HANDLE, &sh->state); 3007 clear_bit(STRIPE_DELAYED, &sh->state); 3008 3009 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3010 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3011 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3012 /* Now to look around and see what can 
be done */ 3013 3014 rcu_read_lock(); 3015 for (i=disks; i--; ) { 3016 mdk_rdev_t *rdev; 3017 dev = &sh->dev[i]; 3018 clear_bit(R5_Insync, &dev->flags); 3019 3020 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3021 i, dev->flags, dev->toread, dev->towrite, dev->written); 3022 /* maybe we can reply to a read */ 3023 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3024 struct bio *rbi, *rbi2; 3025 pr_debug("Return read for disc %d\n", i); 3026 spin_lock_irq(&conf->device_lock); 3027 rbi = dev->toread; 3028 dev->toread = NULL; 3029 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3030 wake_up(&conf->wait_for_overlap); 3031 spin_unlock_irq(&conf->device_lock); 3032 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { 3033 copy_data(0, rbi, dev->page, dev->sector); 3034 rbi2 = r5_next_bio(rbi, dev->sector); 3035 spin_lock_irq(&conf->device_lock); 3036 if (!raid5_dec_bi_phys_segments(rbi)) { 3037 rbi->bi_next = return_bi; 3038 return_bi = rbi; 3039 } 3040 spin_unlock_irq(&conf->device_lock); 3041 rbi = rbi2; 3042 } 3043 } 3044 3045 /* now count some things */ 3046 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3047 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3048 3049 3050 if (dev->toread) 3051 s.to_read++; 3052 if (dev->towrite) { 3053 s.to_write++; 3054 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3055 s.non_overwrite++; 3056 } 3057 if (dev->written) 3058 s.written++; 3059 rdev = rcu_dereference(conf->disks[i].rdev); 3060 if (blocked_rdev == NULL && 3061 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3062 blocked_rdev = rdev; 3063 atomic_inc(&rdev->nr_pending); 3064 } 3065 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3066 /* The ReadError flag will just be confusing now */ 3067 clear_bit(R5_ReadError, &dev->flags); 3068 clear_bit(R5_ReWrite, &dev->flags); 3069 } 3070 if (!rdev || !test_bit(In_sync, &rdev->flags) 3071 || test_bit(R5_ReadError, &dev->flags)) { 3072 if (s.failed < 2) 3073 r6s.failed_num[s.failed] = i; 3074 s.failed++; 3075 } else 3076 set_bit(R5_Insync, &dev->flags); 3077 } 3078 rcu_read_unlock(); 3079 3080 if (unlikely(blocked_rdev)) { 3081 if (s.syncing || s.expanding || s.expanded || 3082 s.to_write || s.written) { 3083 set_bit(STRIPE_HANDLE, &sh->state); 3084 goto unlock; 3085 } 3086 /* There is nothing for the blocked_rdev to block */ 3087 rdev_dec_pending(blocked_rdev, conf->mddev); 3088 blocked_rdev = NULL; 3089 } 3090 3091 pr_debug("locked=%d uptodate=%d to_read=%d" 3092 " to_write=%d failed=%d failed_num=%d,%d\n", 3093 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3094 r6s.failed_num[0], r6s.failed_num[1]); 3095 /* check if the array has lost >2 devices and, if so, some requests 3096 * might need to be failed 3097 */ 3098 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3099 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3100 if (s.failed > 2 && s.syncing) { 3101 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3102 clear_bit(STRIPE_SYNCING, &sh->state); 3103 s.syncing = 0; 3104 } 3105 3106 /* 3107 * might be able to return some write requests if the parity blocks 3108 * are safe, or on a failed drive 3109 */ 3110 pdev = &sh->dev[pd_idx]; 3111 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3112 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3113 qdev = &sh->dev[r6s.qd_idx]; 3114 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) 3115 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); 3116 3117 if ( s.written && 3118 ( r6s.p_failed || ((test_bit(R5_Insync, 
&pdev->flags) 3119 && !test_bit(R5_LOCKED, &pdev->flags) 3120 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3121 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3122 && !test_bit(R5_LOCKED, &qdev->flags) 3123 && test_bit(R5_UPTODATE, &qdev->flags))))) 3124 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3125 3126 /* Now we might consider reading some blocks, either to check/generate 3127 * parity, or to satisfy requests 3128 * or to load a block that is being partially written. 3129 */ 3130 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3131 (s.syncing && (s.uptodate < disks)) || s.expanding) 3132 handle_stripe_fill6(sh, &s, &r6s, disks); 3133 3134 /* now to consider writing and what else, if anything should be read */ 3135 if (s.to_write) 3136 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3137 3138 /* maybe we need to check and possibly fix the parity for this stripe 3139 * Any reads will already have been scheduled, so we just see if enough 3140 * data is available 3141 */ 3142 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3143 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3144 3145 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3146 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3147 clear_bit(STRIPE_SYNCING, &sh->state); 3148 } 3149 3150 /* If the failed drives are just a ReadError, then we might need 3151 * to progress the repair/check process 3152 */ 3153 if (s.failed <= 2 && !conf->mddev->ro) 3154 for (i = 0; i < s.failed; i++) { 3155 dev = &sh->dev[r6s.failed_num[i]]; 3156 if (test_bit(R5_ReadError, &dev->flags) 3157 && !test_bit(R5_LOCKED, &dev->flags) 3158 && test_bit(R5_UPTODATE, &dev->flags) 3159 ) { 3160 if (!test_bit(R5_ReWrite, &dev->flags)) { 3161 set_bit(R5_Wantwrite, &dev->flags); 3162 set_bit(R5_ReWrite, &dev->flags); 3163 set_bit(R5_LOCKED, &dev->flags); 3164 } else { 3165 /* let's read it back */ 3166 set_bit(R5_Wantread, &dev->flags); 3167 set_bit(R5_LOCKED, &dev->flags); 3168 } 3169 } 3170 } 3171 3172 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3173 /* Need to write out all blocks after computing P&Q */ 3174 sh->disks = conf->raid_disks; 3175 stripe_set_idx(sh->sector, conf, 0, sh); 3176 compute_parity6(sh, RECONSTRUCT_WRITE); 3177 for (i = conf->raid_disks ; i-- ; ) { 3178 set_bit(R5_LOCKED, &sh->dev[i].flags); 3179 s.locked++; 3180 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3181 } 3182 clear_bit(STRIPE_EXPANDING, &sh->state); 3183 } else if (s.expanded) { 3184 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3185 atomic_dec(&conf->reshape_stripes); 3186 wake_up(&conf->wait_for_overlap); 3187 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3188 } 3189 3190 if (s.expanding && s.locked == 0 && 3191 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3192 handle_stripe_expansion(conf, sh, &r6s); 3193 3194 unlock: 3195 spin_unlock(&sh->lock); 3196 3197 /* wait for this device to become unblocked */ 3198 if (unlikely(blocked_rdev)) 3199 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3200 3201 ops_run_io(sh, &s); 3202 3203 return_io(return_bi); 3204 3205 return blocked_rdev == NULL; 3206} 3207 3208/* returns true if the stripe was handled */ 3209static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3210{ 3211 if (sh->raid_conf->level == 6) 3212 return handle_stripe6(sh, tmp_page); 3213 else 3214 return handle_stripe5(sh); 3215} 3216 3217 3218 3219static void raid5_activate_delayed(raid5_conf_t *conf) 3220{ 3221 if 
(atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3222 while (!list_empty(&conf->delayed_list)) { 3223 struct list_head *l = conf->delayed_list.next; 3224 struct stripe_head *sh; 3225 sh = list_entry(l, struct stripe_head, lru); 3226 list_del_init(l); 3227 clear_bit(STRIPE_DELAYED, &sh->state); 3228 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3229 atomic_inc(&conf->preread_active_stripes); 3230 list_add_tail(&sh->lru, &conf->hold_list); 3231 } 3232 } else 3233 blk_plug_device(conf->mddev->queue); 3234} 3235 3236static void activate_bit_delay(raid5_conf_t *conf) 3237{ 3238 /* device_lock is held */ 3239 struct list_head head; 3240 list_add(&head, &conf->bitmap_list); 3241 list_del_init(&conf->bitmap_list); 3242 while (!list_empty(&head)) { 3243 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3244 list_del_init(&sh->lru); 3245 atomic_inc(&sh->count); 3246 __release_stripe(conf, sh); 3247 } 3248} 3249 3250static void unplug_slaves(mddev_t *mddev) 3251{ 3252 raid5_conf_t *conf = mddev_to_conf(mddev); 3253 int i; 3254 3255 rcu_read_lock(); 3256 for (i=0; i<mddev->raid_disks; i++) { 3257 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3258 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3259 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3260 3261 atomic_inc(&rdev->nr_pending); 3262 rcu_read_unlock(); 3263 3264 blk_unplug(r_queue); 3265 3266 rdev_dec_pending(rdev, mddev); 3267 rcu_read_lock(); 3268 } 3269 } 3270 rcu_read_unlock(); 3271} 3272 3273static void raid5_unplug_device(struct request_queue *q) 3274{ 3275 mddev_t *mddev = q->queuedata; 3276 raid5_conf_t *conf = mddev_to_conf(mddev); 3277 unsigned long flags; 3278 3279 spin_lock_irqsave(&conf->device_lock, flags); 3280 3281 if (blk_remove_plug(q)) { 3282 conf->seq_flush++; 3283 raid5_activate_delayed(conf); 3284 } 3285 md_wakeup_thread(mddev->thread); 3286 3287 spin_unlock_irqrestore(&conf->device_lock, flags); 3288 3289 unplug_slaves(mddev); 3290} 3291 3292static int raid5_congested(void *data, int bits) 3293{ 3294 mddev_t *mddev = data; 3295 raid5_conf_t *conf = mddev_to_conf(mddev); 3296 3297 /* No difference between reads and writes. Just check 3298 * how busy the stripe_cache is 3299 */ 3300 if (conf->inactive_blocked) 3301 return 1; 3302 if (conf->quiesce) 3303 return 1; 3304 if (list_empty_careful(&conf->inactive_list)) 3305 return 1; 3306 3307 return 0; 3308} 3309 3310/* We want read requests to align with chunks where possible, 3311 * but write requests don't need to. 
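 *
 * raid5_mergeable_bvec() below does this by only accepting, for a READ,
 * the bytes left before the next chunk boundary.  An illustrative sketch
 * of the arithmetic, assuming chunk_sectors is a power of two:
 *
 *	max = (chunk_sectors -
 *	       ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 *	if (max < 0)
 *		max = 0;
 *
 * e.g. a 64k chunk (128 sectors) with a bio already holding 120 sectors
 * from the chunk start leaves at most 8 sectors == 4k for the new bvec.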
3312 */ 3313static int raid5_mergeable_bvec(struct request_queue *q, 3314 struct bvec_merge_data *bvm, 3315 struct bio_vec *biovec) 3316{ 3317 mddev_t *mddev = q->queuedata; 3318 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3319 int max; 3320 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3321 unsigned int bio_sectors = bvm->bi_size >> 9; 3322 3323 if ((bvm->bi_rw & 1) == WRITE) 3324 return biovec->bv_len; /* always allow writes to be mergeable */ 3325 3326 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3327 if (max < 0) max = 0; 3328 if (max <= biovec->bv_len && bio_sectors == 0) 3329 return biovec->bv_len; 3330 else 3331 return max; 3332} 3333 3334 3335static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3336{ 3337 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3338 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3339 unsigned int bio_sectors = bio->bi_size >> 9; 3340 3341 return chunk_sectors >= 3342 ((sector & (chunk_sectors - 1)) + bio_sectors); 3343} 3344 3345/* 3346 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3347 * later sampled by raid5d. 3348 */ 3349static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3350{ 3351 unsigned long flags; 3352 3353 spin_lock_irqsave(&conf->device_lock, flags); 3354 3355 bi->bi_next = conf->retry_read_aligned_list; 3356 conf->retry_read_aligned_list = bi; 3357 3358 spin_unlock_irqrestore(&conf->device_lock, flags); 3359 md_wakeup_thread(conf->mddev->thread); 3360} 3361 3362 3363static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3364{ 3365 struct bio *bi; 3366 3367 bi = conf->retry_read_aligned; 3368 if (bi) { 3369 conf->retry_read_aligned = NULL; 3370 return bi; 3371 } 3372 bi = conf->retry_read_aligned_list; 3373 if(bi) { 3374 conf->retry_read_aligned_list = bi->bi_next; 3375 bi->bi_next = NULL; 3376 /* 3377 * this sets the active strip count to 1 and the processed 3378 * strip count to zero (upper 8 bits) 3379 */ 3380 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3381 } 3382 3383 return bi; 3384} 3385 3386 3387/* 3388 * The "raid5_align_endio" should check if the read succeeded and if it 3389 * did, call bio_endio on the original bio (having bio_put the new bio 3390 * first). 3391 * If the read failed.. 
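 * the original bio is instead handed to add_bio_to_retry() so that raid5d
 * can resubmit it through the normal stripe cache path.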
3392 */ 3393static void raid5_align_endio(struct bio *bi, int error) 3394{ 3395 struct bio* raid_bi = bi->bi_private; 3396 mddev_t *mddev; 3397 raid5_conf_t *conf; 3398 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3399 mdk_rdev_t *rdev; 3400 3401 bio_put(bi); 3402 3403 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3404 conf = mddev_to_conf(mddev); 3405 rdev = (void*)raid_bi->bi_next; 3406 raid_bi->bi_next = NULL; 3407 3408 rdev_dec_pending(rdev, conf->mddev); 3409 3410 if (!error && uptodate) { 3411 bio_endio(raid_bi, 0); 3412 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3413 wake_up(&conf->wait_for_stripe); 3414 return; 3415 } 3416 3417 3418 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3419 3420 add_bio_to_retry(raid_bi, conf); 3421} 3422 3423static int bio_fits_rdev(struct bio *bi) 3424{ 3425 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3426 3427 if ((bi->bi_size>>9) > q->max_sectors) 3428 return 0; 3429 blk_recount_segments(q, bi); 3430 if (bi->bi_phys_segments > q->max_phys_segments) 3431 return 0; 3432 3433 if (q->merge_bvec_fn) 3434 /* it's too hard to apply the merge_bvec_fn at this stage, 3435 * just just give up 3436 */ 3437 return 0; 3438 3439 return 1; 3440} 3441 3442 3443static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3444{ 3445 mddev_t *mddev = q->queuedata; 3446 raid5_conf_t *conf = mddev_to_conf(mddev); 3447 unsigned int dd_idx; 3448 struct bio* align_bi; 3449 mdk_rdev_t *rdev; 3450 3451 if (!in_chunk_boundary(mddev, raid_bio)) { 3452 pr_debug("chunk_aligned_read : non aligned\n"); 3453 return 0; 3454 } 3455 /* 3456 * use bio_clone to make a copy of the bio 3457 */ 3458 align_bi = bio_clone(raid_bio, GFP_NOIO); 3459 if (!align_bi) 3460 return 0; 3461 /* 3462 * set bi_end_io to a new function, and set bi_private to the 3463 * original bio. 3464 */ 3465 align_bi->bi_end_io = raid5_align_endio; 3466 align_bi->bi_private = raid_bio; 3467 /* 3468 * compute position 3469 */ 3470 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3471 0, 3472 &dd_idx, NULL); 3473 3474 rcu_read_lock(); 3475 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3476 if (rdev && test_bit(In_sync, &rdev->flags)) { 3477 atomic_inc(&rdev->nr_pending); 3478 rcu_read_unlock(); 3479 raid_bio->bi_next = (void*)rdev; 3480 align_bi->bi_bdev = rdev->bdev; 3481 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3482 align_bi->bi_sector += rdev->data_offset; 3483 3484 if (!bio_fits_rdev(align_bi)) { 3485 /* too big in some way */ 3486 bio_put(align_bi); 3487 rdev_dec_pending(rdev, mddev); 3488 return 0; 3489 } 3490 3491 spin_lock_irq(&conf->device_lock); 3492 wait_event_lock_irq(conf->wait_for_stripe, 3493 conf->quiesce == 0, 3494 conf->device_lock, /* nothing */); 3495 atomic_inc(&conf->active_aligned_reads); 3496 spin_unlock_irq(&conf->device_lock); 3497 3498 generic_make_request(align_bi); 3499 return 1; 3500 } else { 3501 rcu_read_unlock(); 3502 bio_put(align_bi); 3503 return 0; 3504 } 3505} 3506 3507/* __get_priority_stripe - get the next stripe to process 3508 * 3509 * Full stripe writes are allowed to pass preread active stripes up until 3510 * the bypass_threshold is exceeded. In general the bypass_count 3511 * increments when the handle_list is handled before the hold_list; however, it 3512 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3513 * stripe with in flight i/o. The bypass_count will be reset when the 3514 * head of the hold_list has changed, i.e. 
the head was promoted to the 3515 * handle_list. 3516 */ 3517static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3518{ 3519 struct stripe_head *sh; 3520 3521 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3522 __func__, 3523 list_empty(&conf->handle_list) ? "empty" : "busy", 3524 list_empty(&conf->hold_list) ? "empty" : "busy", 3525 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3526 3527 if (!list_empty(&conf->handle_list)) { 3528 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3529 3530 if (list_empty(&conf->hold_list)) 3531 conf->bypass_count = 0; 3532 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3533 if (conf->hold_list.next == conf->last_hold) 3534 conf->bypass_count++; 3535 else { 3536 conf->last_hold = conf->hold_list.next; 3537 conf->bypass_count -= conf->bypass_threshold; 3538 if (conf->bypass_count < 0) 3539 conf->bypass_count = 0; 3540 } 3541 } 3542 } else if (!list_empty(&conf->hold_list) && 3543 ((conf->bypass_threshold && 3544 conf->bypass_count > conf->bypass_threshold) || 3545 atomic_read(&conf->pending_full_writes) == 0)) { 3546 sh = list_entry(conf->hold_list.next, 3547 typeof(*sh), lru); 3548 conf->bypass_count -= conf->bypass_threshold; 3549 if (conf->bypass_count < 0) 3550 conf->bypass_count = 0; 3551 } else 3552 return NULL; 3553 3554 list_del_init(&sh->lru); 3555 atomic_inc(&sh->count); 3556 BUG_ON(atomic_read(&sh->count) != 1); 3557 return sh; 3558} 3559 3560static int make_request(struct request_queue *q, struct bio * bi) 3561{ 3562 mddev_t *mddev = q->queuedata; 3563 raid5_conf_t *conf = mddev_to_conf(mddev); 3564 int dd_idx; 3565 sector_t new_sector; 3566 sector_t logical_sector, last_sector; 3567 struct stripe_head *sh; 3568 const int rw = bio_data_dir(bi); 3569 int cpu, remaining; 3570 3571 if (unlikely(bio_barrier(bi))) { 3572 bio_endio(bi, -EOPNOTSUPP); 3573 return 0; 3574 } 3575 3576 md_write_start(mddev, bi); 3577 3578 cpu = part_stat_lock(); 3579 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 3580 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 3581 bio_sectors(bi)); 3582 part_stat_unlock(); 3583 3584 if (rw == READ && 3585 mddev->reshape_position == MaxSector && 3586 chunk_aligned_read(q,bi)) 3587 return 0; 3588 3589 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3590 last_sector = bi->bi_sector + (bi->bi_size>>9); 3591 bi->bi_next = NULL; 3592 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3593 3594 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3595 DEFINE_WAIT(w); 3596 int disks, data_disks; 3597 int previous; 3598 3599 retry: 3600 previous = 0; 3601 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3602 if (likely(conf->expand_progress == MaxSector)) 3603 disks = conf->raid_disks; 3604 else { 3605 /* spinlock is needed as expand_progress may be 3606 * 64bit on a 32bit platform, and so it might be 3607 * possible to see a half-updated value 3608 * Ofcourse expand_progress could change after 3609 * the lock is dropped, so once we get a reference 3610 * to the stripe that we think it is, we will have 3611 * to check again. 
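 *
 * (on 32bit a 64bit store is two separate word writes, so an unlocked
 * reader could see e.g. the new low word paired with the old high word)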
3612 */ 3613 spin_lock_irq(&conf->device_lock); 3614 disks = conf->raid_disks; 3615 if (logical_sector >= conf->expand_progress) { 3616 disks = conf->previous_raid_disks; 3617 previous = 1; 3618 } else { 3619 if (logical_sector >= conf->expand_lo) { 3620 spin_unlock_irq(&conf->device_lock); 3621 schedule(); 3622 goto retry; 3623 } 3624 } 3625 spin_unlock_irq(&conf->device_lock); 3626 } 3627 data_disks = disks - conf->max_degraded; 3628 3629 new_sector = raid5_compute_sector(conf, logical_sector, 3630 previous, 3631 &dd_idx, NULL); 3632 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3633 (unsigned long long)new_sector, 3634 (unsigned long long)logical_sector); 3635 3636 sh = get_active_stripe(conf, new_sector, previous, 3637 (bi->bi_rw&RWA_MASK)); 3638 if (sh) { 3639 if (unlikely(conf->expand_progress != MaxSector)) { 3640 /* expansion might have moved on while waiting for a 3641 * stripe, so we must do the range check again. 3642 * Expansion could still move past after this 3643 * test, but as we are holding a reference to 3644 * 'sh', we know that if that happens, 3645 * STRIPE_EXPANDING will get set and the expansion 3646 * won't proceed until we finish with the stripe. 3647 */ 3648 int must_retry = 0; 3649 spin_lock_irq(&conf->device_lock); 3650 if (logical_sector < conf->expand_progress && 3651 disks == conf->previous_raid_disks) 3652 /* mismatch, need to try again */ 3653 must_retry = 1; 3654 spin_unlock_irq(&conf->device_lock); 3655 if (must_retry) { 3656 release_stripe(sh); 3657 goto retry; 3658 } 3659 } 3660 /* FIXME what if we get a false positive because these 3661 * are being updated. 3662 */ 3663 if (logical_sector >= mddev->suspend_lo && 3664 logical_sector < mddev->suspend_hi) { 3665 release_stripe(sh); 3666 schedule(); 3667 goto retry; 3668 } 3669 3670 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3671 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3672 /* Stripe is busy expanding or 3673 * add failed due to overlap. Flush everything 3674 * and wait a while 3675 */ 3676 raid5_unplug_device(mddev->queue); 3677 release_stripe(sh); 3678 schedule(); 3679 goto retry; 3680 } 3681 finish_wait(&conf->wait_for_overlap, &w); 3682 set_bit(STRIPE_HANDLE, &sh->state); 3683 clear_bit(STRIPE_DELAYED, &sh->state); 3684 release_stripe(sh); 3685 } else { 3686 /* cannot get stripe for read-ahead, just give-up */ 3687 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3688 finish_wait(&conf->wait_for_overlap, &w); 3689 break; 3690 } 3691 3692 } 3693 spin_lock_irq(&conf->device_lock); 3694 remaining = raid5_dec_bi_phys_segments(bi); 3695 spin_unlock_irq(&conf->device_lock); 3696 if (remaining == 0) { 3697 3698 if ( rw == WRITE ) 3699 md_write_end(mddev); 3700 3701 bio_endio(bi, 0); 3702 } 3703 return 0; 3704} 3705 3706static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 3707{ 3708 /* reshaping is quite different to recovery/resync so it is 3709 * handled quite separately ... here. 3710 * 3711 * On each call to sync_request, we gather one chunk worth of 3712 * destination stripes and flag them as expanding. 3713 * Then we find all the source stripes and request reads. 3714 * As the reads complete, handle_stripe will copy the data 3715 * into the destination stripe and release that stripe. 
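 *
 * (With a 64k chunk, for example, that is 128 sectors of destination
 * stripes per call, i.e. 16 stripe_heads when pages are 4k.)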
3716 */ 3717 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3718 struct stripe_head *sh; 3719 sector_t first_sector, last_sector; 3720 int raid_disks = conf->previous_raid_disks; 3721 int data_disks = raid_disks - conf->max_degraded; 3722 int new_data_disks = conf->raid_disks - conf->max_degraded; 3723 int i; 3724 int dd_idx; 3725 sector_t writepos, safepos, gap; 3726 3727 if (sector_nr == 0 && 3728 conf->expand_progress != 0) { 3729 /* restarting in the middle, skip the initial sectors */ 3730 sector_nr = conf->expand_progress; 3731 sector_div(sector_nr, new_data_disks); 3732 *skipped = 1; 3733 return sector_nr; 3734 } 3735 3736 /* we update the metadata when there is more than 3Meg 3737 * in the block range (that is rather arbitrary, should 3738 * probably be time based) or when the data about to be 3739 * copied would over-write the source of the data at 3740 * the front of the range. 3741 * i.e. one new_stripe forward from expand_progress new_maps 3742 * to after where expand_lo old_maps to 3743 */ 3744 writepos = conf->expand_progress + 3745 conf->chunk_size/512*(new_data_disks); 3746 sector_div(writepos, new_data_disks); 3747 safepos = conf->expand_lo; 3748 sector_div(safepos, data_disks); 3749 gap = conf->expand_progress - conf->expand_lo; 3750 3751 if (writepos >= safepos || 3752 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3753 /* Cannot proceed until we've updated the superblock... */ 3754 wait_event(conf->wait_for_overlap, 3755 atomic_read(&conf->reshape_stripes)==0); 3756 mddev->reshape_position = conf->expand_progress; 3757 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3758 md_wakeup_thread(mddev->thread); 3759 wait_event(mddev->sb_wait, mddev->flags == 0 || 3760 kthread_should_stop()); 3761 spin_lock_irq(&conf->device_lock); 3762 conf->expand_lo = mddev->reshape_position; 3763 spin_unlock_irq(&conf->device_lock); 3764 wake_up(&conf->wait_for_overlap); 3765 } 3766 3767 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 3768 int j; 3769 int skipped = 0; 3770 sh = get_active_stripe(conf, sector_nr+i, 0, 0); 3771 set_bit(STRIPE_EXPANDING, &sh->state); 3772 atomic_inc(&conf->reshape_stripes); 3773 /* If any of this stripe is beyond the end of the old 3774 * array, then we need to zero those blocks 3775 */ 3776 for (j=sh->disks; j--;) { 3777 sector_t s; 3778 if (j == sh->pd_idx) 3779 continue; 3780 if (conf->level == 6 && 3781 j == sh->qd_idx) 3782 continue; 3783 s = compute_blocknr(sh, j); 3784 if (s < mddev->array_sectors) { 3785 skipped = 1; 3786 continue; 3787 } 3788 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 3789 set_bit(R5_Expanded, &sh->dev[j].flags); 3790 set_bit(R5_UPTODATE, &sh->dev[j].flags); 3791 } 3792 if (!skipped) { 3793 set_bit(STRIPE_EXPAND_READY, &sh->state); 3794 set_bit(STRIPE_HANDLE, &sh->state); 3795 } 3796 release_stripe(sh); 3797 } 3798 spin_lock_irq(&conf->device_lock); 3799 conf->expand_progress = (sector_nr + i) * new_data_disks; 3800 spin_unlock_irq(&conf->device_lock); 3801 /* Ok, those stripe are ready. We can start scheduling 3802 * reads on the source stripes. 3803 * The source stripes are determined by mapping the first and last 3804 * block on the destination stripes. 
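 *
 * Both mappings below pass previous == 1, i.e. they use the old layout,
 * because the source data still lives at its pre-reshape location.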
3805 */ 3806 first_sector = 3807 raid5_compute_sector(conf, sector_nr*(new_data_disks), 3808 1, &dd_idx, NULL); 3809 last_sector = 3810 raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) 3811 *(new_data_disks) - 1), 3812 1, &dd_idx, NULL); 3813 if (last_sector >= mddev->dev_sectors) 3814 last_sector = mddev->dev_sectors - 1; 3815 while (first_sector <= last_sector) { 3816 sh = get_active_stripe(conf, first_sector, 1, 0); 3817 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3818 set_bit(STRIPE_HANDLE, &sh->state); 3819 release_stripe(sh); 3820 first_sector += STRIPE_SECTORS; 3821 } 3822 /* If this takes us to the resync_max point where we have to pause, 3823 * then we need to write out the superblock. 3824 */ 3825 sector_nr += conf->chunk_size>>9; 3826 if (sector_nr >= mddev->resync_max) { 3827 /* Cannot proceed until we've updated the superblock... */ 3828 wait_event(conf->wait_for_overlap, 3829 atomic_read(&conf->reshape_stripes) == 0); 3830 mddev->reshape_position = conf->expand_progress; 3831 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3832 md_wakeup_thread(mddev->thread); 3833 wait_event(mddev->sb_wait, 3834 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 3835 || kthread_should_stop()); 3836 spin_lock_irq(&conf->device_lock); 3837 conf->expand_lo = mddev->reshape_position; 3838 spin_unlock_irq(&conf->device_lock); 3839 wake_up(&conf->wait_for_overlap); 3840 } 3841 return conf->chunk_size>>9; 3842} 3843 3844/* FIXME go_faster isn't used */ 3845static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 3846{ 3847 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3848 struct stripe_head *sh; 3849 sector_t max_sector = mddev->dev_sectors; 3850 int sync_blocks; 3851 int still_degraded = 0; 3852 int i; 3853 3854 if (sector_nr >= max_sector) { 3855 /* just being told to finish up .. nothing much to do */ 3856 unplug_slaves(mddev); 3857 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3858 end_reshape(conf); 3859 return 0; 3860 } 3861 3862 if (mddev->curr_resync < max_sector) /* aborted */ 3863 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3864 &sync_blocks, 1); 3865 else /* completed sync */ 3866 conf->fullsync = 0; 3867 bitmap_close_sync(mddev->bitmap); 3868 3869 return 0; 3870 } 3871 3872 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3873 return reshape_request(mddev, sector_nr, skipped); 3874 3875 /* No need to check resync_max as we never do more than one 3876 * stripe, and as resync_max will always be on a chunk boundary, 3877 * if the check in md_do_sync didn't fire, there is no chance 3878 * of overstepping resync_max here 3879 */ 3880 3881 /* if there is too many failed drives and we are trying 3882 * to resync, then assert that we are finished, because there is 3883 * nothing we can do. 
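 *
 * For example a RAID6 (max_degraded == 2) that has already lost two
 * devices has no redundancy left to rebuild from, so the remaining
 * mddev->dev_sectors - sector_nr sectors are simply reported back with
 * *skipped set and md_do_sync treats the pass as complete.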
3884 */ 3885 if (mddev->degraded >= conf->max_degraded && 3886 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3887 sector_t rv = mddev->dev_sectors - sector_nr; 3888 *skipped = 1; 3889 return rv; 3890 } 3891 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 3892 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 3893 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 3894 /* we can skip this block, and probably more */ 3895 sync_blocks /= STRIPE_SECTORS; 3896 *skipped = 1; 3897 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 3898 } 3899 3900 3901 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 3902 3903 sh = get_active_stripe(conf, sector_nr, 0, 1); 3904 if (sh == NULL) { 3905 sh = get_active_stripe(conf, sector_nr, 0, 0); 3906 /* make sure we don't swamp the stripe cache if someone else 3907 * is trying to get access 3908 */ 3909 schedule_timeout_uninterruptible(1); 3910 } 3911 /* Need to check if array will still be degraded after recovery/resync 3912 * We don't need to check the 'failed' flag as when that gets set, 3913 * recovery aborts. 3914 */ 3915 for (i=0; i<mddev->raid_disks; i++) 3916 if (conf->disks[i].rdev == NULL) 3917 still_degraded = 1; 3918 3919 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 3920 3921 spin_lock(&sh->lock); 3922 set_bit(STRIPE_SYNCING, &sh->state); 3923 clear_bit(STRIPE_INSYNC, &sh->state); 3924 spin_unlock(&sh->lock); 3925 3926 /* wait for any blocked device to be handled */ 3927 while(unlikely(!handle_stripe(sh, NULL))) 3928 ; 3929 release_stripe(sh); 3930 3931 return STRIPE_SECTORS; 3932} 3933 3934static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) 3935{ 3936 /* We may not be able to submit a whole bio at once as there 3937 * may not be enough stripe_heads available. 3938 * We cannot pre-allocate enough stripe_heads as we may need 3939 * more than exist in the cache (if we allow ever large chunks). 3940 * So we do one stripe head at a time and record in 3941 * ->bi_hw_segments how many have been done. 3942 * 3943 * We *know* that this entire raid_bio is in one chunk, so 3944 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
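 *
 * If the stripe cache runs dry part way through, the position is saved
 * and the bio parked for raid5d to resume later, roughly:
 *
 *	raid5_set_bi_hw_segments(raid_bio, scnt);
 *	conf->retry_read_aligned = raid_bio;
 *	return handled;
 *
 * raid5d re-submits it via remove_bio_from_retry(), and the
 * scnt < raid5_bi_hw_segments() test below then skips the slices that
 * were already handled on the earlier pass.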
3945 */ 3946 struct stripe_head *sh; 3947 int dd_idx; 3948 sector_t sector, logical_sector, last_sector; 3949 int scnt = 0; 3950 int remaining; 3951 int handled = 0; 3952 3953 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3954 sector = raid5_compute_sector(conf, logical_sector, 3955 0, &dd_idx, NULL); 3956 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 3957 3958 for (; logical_sector < last_sector; 3959 logical_sector += STRIPE_SECTORS, 3960 sector += STRIPE_SECTORS, 3961 scnt++) { 3962 3963 if (scnt < raid5_bi_hw_segments(raid_bio)) 3964 /* already done this stripe */ 3965 continue; 3966 3967 sh = get_active_stripe(conf, sector, 0, 1); 3968 3969 if (!sh) { 3970 /* failed to get a stripe - must wait */ 3971 raid5_set_bi_hw_segments(raid_bio, scnt); 3972 conf->retry_read_aligned = raid_bio; 3973 return handled; 3974 } 3975 3976 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 3977 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 3978 release_stripe(sh); 3979 raid5_set_bi_hw_segments(raid_bio, scnt); 3980 conf->retry_read_aligned = raid_bio; 3981 return handled; 3982 } 3983 3984 handle_stripe(sh, NULL); 3985 release_stripe(sh); 3986 handled++; 3987 } 3988 spin_lock_irq(&conf->device_lock); 3989 remaining = raid5_dec_bi_phys_segments(raid_bio); 3990 spin_unlock_irq(&conf->device_lock); 3991 if (remaining == 0) 3992 bio_endio(raid_bio, 0); 3993 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3994 wake_up(&conf->wait_for_stripe); 3995 return handled; 3996} 3997 3998 3999 4000/* 4001 * This is our raid5 kernel thread. 4002 * 4003 * We scan the hash table for stripes which can be handled now. 4004 * During the scan, completed stripes are saved for us by the interrupt 4005 * handler, so that they will not have to wait for our next wakeup. 
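 *
 * conf->device_lock is held while choosing work but dropped around each
 * heavyweight step, so one pass of the loop below is roughly:
 *
 *	spin_lock_irq(&conf->device_lock);
 *	loop:
 *		unplug the bitmap if a flush batch has closed	(lock dropped)
 *		re-submit any parked aligned reads		(lock dropped)
 *		sh = __get_priority_stripe(conf);
 *		if (!sh) break;
 *		handle_stripe(sh, conf->spare_page);		(lock dropped)
 *		release_stripe(sh);
 *	spin_unlock_irq(&conf->device_lock);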
4006 */ 4007static void raid5d(mddev_t *mddev) 4008{ 4009 struct stripe_head *sh; 4010 raid5_conf_t *conf = mddev_to_conf(mddev); 4011 int handled; 4012 4013 pr_debug("+++ raid5d active\n"); 4014 4015 md_check_recovery(mddev); 4016 4017 handled = 0; 4018 spin_lock_irq(&conf->device_lock); 4019 while (1) { 4020 struct bio *bio; 4021 4022 if (conf->seq_flush != conf->seq_write) { 4023 int seq = conf->seq_flush; 4024 spin_unlock_irq(&conf->device_lock); 4025 bitmap_unplug(mddev->bitmap); 4026 spin_lock_irq(&conf->device_lock); 4027 conf->seq_write = seq; 4028 activate_bit_delay(conf); 4029 } 4030 4031 while ((bio = remove_bio_from_retry(conf))) { 4032 int ok; 4033 spin_unlock_irq(&conf->device_lock); 4034 ok = retry_aligned_read(conf, bio); 4035 spin_lock_irq(&conf->device_lock); 4036 if (!ok) 4037 break; 4038 handled++; 4039 } 4040 4041 sh = __get_priority_stripe(conf); 4042 4043 if (!sh) 4044 break; 4045 spin_unlock_irq(&conf->device_lock); 4046 4047 handled++; 4048 handle_stripe(sh, conf->spare_page); 4049 release_stripe(sh); 4050 4051 spin_lock_irq(&conf->device_lock); 4052 } 4053 pr_debug("%d stripes handled\n", handled); 4054 4055 spin_unlock_irq(&conf->device_lock); 4056 4057 async_tx_issue_pending_all(); 4058 unplug_slaves(mddev); 4059 4060 pr_debug("--- raid5d inactive\n"); 4061} 4062 4063static ssize_t 4064raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4065{ 4066 raid5_conf_t *conf = mddev_to_conf(mddev); 4067 if (conf) 4068 return sprintf(page, "%d\n", conf->max_nr_stripes); 4069 else 4070 return 0; 4071} 4072 4073static ssize_t 4074raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4075{ 4076 raid5_conf_t *conf = mddev_to_conf(mddev); 4077 unsigned long new; 4078 int err; 4079 4080 if (len >= PAGE_SIZE) 4081 return -EINVAL; 4082 if (!conf) 4083 return -ENODEV; 4084 4085 if (strict_strtoul(page, 10, &new)) 4086 return -EINVAL; 4087 if (new <= 16 || new > 32768) 4088 return -EINVAL; 4089 while (new < conf->max_nr_stripes) { 4090 if (drop_one_stripe(conf)) 4091 conf->max_nr_stripes--; 4092 else 4093 break; 4094 } 4095 err = md_allow_write(mddev); 4096 if (err) 4097 return err; 4098 while (new > conf->max_nr_stripes) { 4099 if (grow_one_stripe(conf)) 4100 conf->max_nr_stripes++; 4101 else break; 4102 } 4103 return len; 4104} 4105 4106static struct md_sysfs_entry 4107raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4108 raid5_show_stripe_cache_size, 4109 raid5_store_stripe_cache_size); 4110 4111static ssize_t 4112raid5_show_preread_threshold(mddev_t *mddev, char *page) 4113{ 4114 raid5_conf_t *conf = mddev_to_conf(mddev); 4115 if (conf) 4116 return sprintf(page, "%d\n", conf->bypass_threshold); 4117 else 4118 return 0; 4119} 4120 4121static ssize_t 4122raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4123{ 4124 raid5_conf_t *conf = mddev_to_conf(mddev); 4125 unsigned long new; 4126 if (len >= PAGE_SIZE) 4127 return -EINVAL; 4128 if (!conf) 4129 return -ENODEV; 4130 4131 if (strict_strtoul(page, 10, &new)) 4132 return -EINVAL; 4133 if (new > conf->max_nr_stripes) 4134 return -EINVAL; 4135 conf->bypass_threshold = new; 4136 return len; 4137} 4138 4139static struct md_sysfs_entry 4140raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4141 S_IRUGO | S_IWUSR, 4142 raid5_show_preread_threshold, 4143 raid5_store_preread_threshold); 4144 4145static ssize_t 4146stripe_cache_active_show(mddev_t *mddev, char *page) 4147{ 4148 raid5_conf_t *conf = mddev_to_conf(mddev); 4149 if (conf) 4150 return 
sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4151 else 4152 return 0; 4153} 4154 4155static struct md_sysfs_entry 4156raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4157 4158static struct attribute *raid5_attrs[] = { 4159 &raid5_stripecache_size.attr, 4160 &raid5_stripecache_active.attr, 4161 &raid5_preread_bypass_threshold.attr, 4162 NULL, 4163}; 4164static struct attribute_group raid5_attrs_group = { 4165 .name = NULL, 4166 .attrs = raid5_attrs, 4167}; 4168 4169static sector_t 4170raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4171{ 4172 raid5_conf_t *conf = mddev_to_conf(mddev); 4173 4174 if (!sectors) 4175 sectors = mddev->dev_sectors; 4176 if (!raid_disks) 4177 raid_disks = conf->previous_raid_disks; 4178 4179 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4180 return sectors * (raid_disks - conf->max_degraded); 4181} 4182 4183static raid5_conf_t *setup_conf(mddev_t *mddev) 4184{ 4185 raid5_conf_t *conf; 4186 int raid_disk, memory; 4187 mdk_rdev_t *rdev; 4188 struct disk_info *disk; 4189 4190 if (mddev->new_level != 5 4191 && mddev->new_level != 4 4192 && mddev->new_level != 6) { 4193 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4194 mdname(mddev), mddev->new_level); 4195 return ERR_PTR(-EIO); 4196 } 4197 if ((mddev->new_level == 5 4198 && !algorithm_valid_raid5(mddev->new_layout)) || 4199 (mddev->new_level == 6 4200 && !algorithm_valid_raid6(mddev->new_layout))) { 4201 printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4202 mdname(mddev), mddev->new_layout); 4203 return ERR_PTR(-EIO); 4204 } 4205 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4206 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4207 mdname(mddev), mddev->raid_disks); 4208 return ERR_PTR(-EINVAL); 4209 } 4210 4211 if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { 4212 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4213 mddev->new_chunk, mdname(mddev)); 4214 return ERR_PTR(-EINVAL); 4215 } 4216 4217 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4218 if (conf == NULL) 4219 goto abort; 4220 4221 conf->raid_disks = mddev->raid_disks; 4222 if (mddev->reshape_position == MaxSector) 4223 conf->previous_raid_disks = mddev->raid_disks; 4224 else 4225 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4226 4227 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4228 GFP_KERNEL); 4229 if (!conf->disks) 4230 goto abort; 4231 4232 conf->mddev = mddev; 4233 4234 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4235 goto abort; 4236 4237 if (mddev->new_level == 6) { 4238 conf->spare_page = alloc_page(GFP_KERNEL); 4239 if (!conf->spare_page) 4240 goto abort; 4241 } 4242 spin_lock_init(&conf->device_lock); 4243 init_waitqueue_head(&conf->wait_for_stripe); 4244 init_waitqueue_head(&conf->wait_for_overlap); 4245 INIT_LIST_HEAD(&conf->handle_list); 4246 INIT_LIST_HEAD(&conf->hold_list); 4247 INIT_LIST_HEAD(&conf->delayed_list); 4248 INIT_LIST_HEAD(&conf->bitmap_list); 4249 INIT_LIST_HEAD(&conf->inactive_list); 4250 atomic_set(&conf->active_stripes, 0); 4251 atomic_set(&conf->preread_active_stripes, 0); 4252 atomic_set(&conf->active_aligned_reads, 0); 4253 conf->bypass_threshold = BYPASS_THRESHOLD; 4254 4255 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4256 4257 list_for_each_entry(rdev, &mddev->disks, same_set) { 4258 raid_disk = rdev->raid_disk; 4259 if (raid_disk >= conf->raid_disks 4260 || raid_disk < 0) 4261 continue; 4262 disk = conf->disks + 
raid_disk; 4263 4264 disk->rdev = rdev; 4265 4266 if (test_bit(In_sync, &rdev->flags)) { 4267 char b[BDEVNAME_SIZE]; 4268 printk(KERN_INFO "raid5: device %s operational as raid" 4269 " disk %d\n", bdevname(rdev->bdev,b), 4270 raid_disk); 4271 } else 4272 /* Cannot rely on bitmap to complete recovery */ 4273 conf->fullsync = 1; 4274 } 4275 4276 conf->chunk_size = mddev->new_chunk; 4277 conf->level = mddev->new_level; 4278 if (conf->level == 6) 4279 conf->max_degraded = 2; 4280 else 4281 conf->max_degraded = 1; 4282 conf->algorithm = mddev->new_layout; 4283 conf->max_nr_stripes = NR_STRIPES; 4284 conf->expand_progress = mddev->reshape_position; 4285 4286 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4287 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4288 if (grow_stripes(conf, conf->max_nr_stripes)) { 4289 printk(KERN_ERR 4290 "raid5: couldn't allocate %dkB for buffers\n", memory); 4291 goto abort; 4292 } else 4293 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4294 memory, mdname(mddev)); 4295 4296 conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); 4297 if (!conf->thread) { 4298 printk(KERN_ERR 4299 "raid5: couldn't allocate thread for %s\n", 4300 mdname(mddev)); 4301 goto abort; 4302 } 4303 4304 return conf; 4305 4306 abort: 4307 if (conf) { 4308 shrink_stripes(conf); 4309 safe_put_page(conf->spare_page); 4310 kfree(conf->disks); 4311 kfree(conf->stripe_hashtbl); 4312 kfree(conf); 4313 return ERR_PTR(-EIO); 4314 } else 4315 return ERR_PTR(-ENOMEM); 4316} 4317 4318static int run(mddev_t *mddev) 4319{ 4320 raid5_conf_t *conf; 4321 int working_disks = 0; 4322 mdk_rdev_t *rdev; 4323 4324 if (mddev->reshape_position != MaxSector) { 4325 /* Check that we can continue the reshape. 4326 * Currently only disks can change, it must 4327 * increase, and we must be past the point where 4328 * a stripe over-writes itself 4329 */ 4330 sector_t here_new, here_old; 4331 int old_disks; 4332 int max_degraded = (mddev->level == 5 ? 1 : 2); 4333 4334 if (mddev->new_level != mddev->level || 4335 mddev->new_layout != mddev->layout || 4336 mddev->new_chunk != mddev->chunk_size) { 4337 printk(KERN_ERR "raid5: %s: unsupported reshape " 4338 "required - aborting.\n", 4339 mdname(mddev)); 4340 return -EINVAL; 4341 } 4342 if (mddev->delta_disks <= 0) { 4343 printk(KERN_ERR "raid5: %s: unsupported reshape " 4344 "(reduce disks) required - aborting.\n", 4345 mdname(mddev)); 4346 return -EINVAL; 4347 } 4348 old_disks = mddev->raid_disks - mddev->delta_disks; 4349 /* reshape_position must be on a new-stripe boundary, and one 4350 * further up in new geometry must map after here in old 4351 * geometry. 
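 *
 * e.g. growing a 3-device RAID5 with 64K chunks (128 sectors) to 4
 * devices: one new-geometry stripe spans 3*128 = 384 array sectors and
 * one old-geometry stripe spans 2*128 = 256.  A saved reshape_position
 * of 768 gives here_new = 768/384 = 2 and here_old = 768/256 = 3; the
 * first division is exact and 2 < 3, so the chunk-row about to be
 * written (row 2 on every device) lies below the first row that may
 * still need to be read (row 3), and the reshape may continue.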
4352 */ 4353 here_new = mddev->reshape_position; 4354 if (sector_div(here_new, (mddev->chunk_size>>9)* 4355 (mddev->raid_disks - max_degraded))) { 4356 printk(KERN_ERR "raid5: reshape_position not " 4357 "on a stripe boundary\n"); 4358 return -EINVAL; 4359 } 4360 /* here_new is the stripe we will write to */ 4361 here_old = mddev->reshape_position; 4362 sector_div(here_old, (mddev->chunk_size>>9)* 4363 (old_disks-max_degraded)); 4364 /* here_old is the first stripe that we might need to read 4365 * from */ 4366 if (here_new >= here_old) { 4367 /* Reading from the same stripe as writing to - bad */ 4368 printk(KERN_ERR "raid5: reshape_position too early for " 4369 "auto-recovery - aborting.\n"); 4370 return -EINVAL; 4371 } 4372 printk(KERN_INFO "raid5: reshape will continue\n"); 4373 /* OK, we should be able to continue; */ 4374 } else { 4375 BUG_ON(mddev->level != mddev->new_level); 4376 BUG_ON(mddev->layout != mddev->new_layout); 4377 BUG_ON(mddev->chunk_size != mddev->new_chunk); 4378 BUG_ON(mddev->delta_disks != 0); 4379 } 4380 4381 if (mddev->private == NULL) 4382 conf = setup_conf(mddev); 4383 else 4384 conf = mddev->private; 4385 4386 if (IS_ERR(conf)) 4387 return PTR_ERR(conf); 4388 4389 mddev->thread = conf->thread; 4390 conf->thread = NULL; 4391 mddev->private = conf; 4392 4393 /* 4394 * 0 for a fully functional array, 1 or 2 for a degraded array. 4395 */ 4396 list_for_each_entry(rdev, &mddev->disks, same_set) 4397 if (rdev->raid_disk >= 0 && 4398 test_bit(In_sync, &rdev->flags)) 4399 working_disks++; 4400 4401 mddev->degraded = conf->raid_disks - working_disks; 4402 4403 if (mddev->degraded > conf->max_degraded) { 4404 printk(KERN_ERR "raid5: not enough operational devices for %s" 4405 " (%d/%d failed)\n", 4406 mdname(mddev), mddev->degraded, conf->raid_disks); 4407 goto abort; 4408 } 4409 4410 /* device size must be a multiple of chunk size */ 4411 mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); 4412 mddev->resync_max_sectors = mddev->dev_sectors; 4413 4414 if (mddev->degraded > 0 && 4415 mddev->recovery_cp != MaxSector) { 4416 if (mddev->ok_start_degraded) 4417 printk(KERN_WARNING 4418 "raid5: starting dirty degraded array: %s" 4419 "- data corruption possible.\n", 4420 mdname(mddev)); 4421 else { 4422 printk(KERN_ERR 4423 "raid5: cannot start dirty degraded array for %s\n", 4424 mdname(mddev)); 4425 goto abort; 4426 } 4427 } 4428 4429 if (mddev->degraded == 0) 4430 printk("raid5: raid level %d set %s active with %d out of %d" 4431 " devices, algorithm %d\n", conf->level, mdname(mddev), 4432 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4433 conf->algorithm); 4434 else 4435 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 4436 " out of %d devices, algorithm %d\n", conf->level, 4437 mdname(mddev), mddev->raid_disks - mddev->degraded, 4438 mddev->raid_disks, conf->algorithm); 4439 4440 print_raid5_conf(conf); 4441 4442 if (conf->expand_progress != MaxSector) { 4443 printk("...ok start reshape thread\n"); 4444 conf->expand_lo = conf->expand_progress; 4445 atomic_set(&conf->reshape_stripes, 0); 4446 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4447 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4448 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4449 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4450 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4451 "%s_reshape"); 4452 } 4453 4454 /* read-ahead size must cover two whole stripes, which is 4455 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 4456 */ 4457 { 4458 int 
data_disks = conf->previous_raid_disks - conf->max_degraded; 4459 int stripe = data_disks * 4460 (mddev->chunk_size / PAGE_SIZE); 4461 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4462 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4463 } 4464 4465 /* Ok, everything is just fine now */ 4466 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 4467 printk(KERN_WARNING 4468 "raid5: failed to create sysfs attributes for %s\n", 4469 mdname(mddev)); 4470 4471 mddev->queue->queue_lock = &conf->device_lock; 4472 4473 mddev->queue->unplug_fn = raid5_unplug_device; 4474 mddev->queue->backing_dev_info.congested_data = mddev; 4475 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4476 4477 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 4478 4479 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4480 4481 return 0; 4482abort: 4483 md_unregister_thread(mddev->thread); 4484 mddev->thread = NULL; 4485 if (conf) { 4486 shrink_stripes(conf); 4487 print_raid5_conf(conf); 4488 safe_put_page(conf->spare_page); 4489 kfree(conf->disks); 4490 kfree(conf->stripe_hashtbl); 4491 kfree(conf); 4492 } 4493 mddev->private = NULL; 4494 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4495 return -EIO; 4496} 4497 4498 4499 4500static int stop(mddev_t *mddev) 4501{ 4502 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4503 4504 md_unregister_thread(mddev->thread); 4505 mddev->thread = NULL; 4506 shrink_stripes(conf); 4507 kfree(conf->stripe_hashtbl); 4508 mddev->queue->backing_dev_info.congested_fn = NULL; 4509 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4510 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4511 kfree(conf->disks); 4512 kfree(conf); 4513 mddev->private = NULL; 4514 return 0; 4515} 4516 4517#ifdef DEBUG 4518static void print_sh(struct seq_file *seq, struct stripe_head *sh) 4519{ 4520 int i; 4521 4522 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 4523 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 4524 seq_printf(seq, "sh %llu, count %d.\n", 4525 (unsigned long long)sh->sector, atomic_read(&sh->count)); 4526 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 4527 for (i = 0; i < sh->disks; i++) { 4528 seq_printf(seq, "(cache%d: %p %ld) ", 4529 i, sh->dev[i].page, sh->dev[i].flags); 4530 } 4531 seq_printf(seq, "\n"); 4532} 4533 4534static void printall(struct seq_file *seq, raid5_conf_t *conf) 4535{ 4536 struct stripe_head *sh; 4537 struct hlist_node *hn; 4538 int i; 4539 4540 spin_lock_irq(&conf->device_lock); 4541 for (i = 0; i < NR_HASH; i++) { 4542 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 4543 if (sh->raid_conf != conf) 4544 continue; 4545 print_sh(seq, sh); 4546 } 4547 } 4548 spin_unlock_irq(&conf->device_lock); 4549} 4550#endif 4551 4552static void status(struct seq_file *seq, mddev_t *mddev) 4553{ 4554 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4555 int i; 4556 4557 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); 4558 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 4559 for (i = 0; i < conf->raid_disks; i++) 4560 seq_printf (seq, "%s", 4561 conf->disks[i].rdev && 4562 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 4563 seq_printf (seq, "]"); 4564#ifdef DEBUG 4565 seq_printf (seq, "\n"); 4566 printall(seq, conf); 4567#endif 4568} 4569 4570static void print_raid5_conf (raid5_conf_t *conf) 4571{ 4572 int i; 4573 struct disk_info *tmp; 4574 4575 printk("RAID5 conf printout:\n"); 4576 if (!conf) { 4577 printk("(conf==NULL)\n"); 4578 return; 4579 } 4580 printk(" --- rd:%d wd:%d\n", conf->raid_disks, 4581 conf->raid_disks - conf->mddev->degraded); 4582 4583 for (i = 0; i < conf->raid_disks; i++) { 4584 char b[BDEVNAME_SIZE]; 4585 tmp = conf->disks + i; 4586 if (tmp->rdev) 4587 printk(" disk %d, o:%d, dev:%s\n", 4588 i, !test_bit(Faulty, &tmp->rdev->flags), 4589 bdevname(tmp->rdev->bdev,b)); 4590 } 4591} 4592 4593static int raid5_spare_active(mddev_t *mddev) 4594{ 4595 int i; 4596 raid5_conf_t *conf = mddev->private; 4597 struct disk_info *tmp; 4598 4599 for (i = 0; i < conf->raid_disks; i++) { 4600 tmp = conf->disks + i; 4601 if (tmp->rdev 4602 && !test_bit(Faulty, &tmp->rdev->flags) 4603 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 4604 unsigned long flags; 4605 spin_lock_irqsave(&conf->device_lock, flags); 4606 mddev->degraded--; 4607 spin_unlock_irqrestore(&conf->device_lock, flags); 4608 } 4609 } 4610 print_raid5_conf(conf); 4611 return 0; 4612} 4613 4614static int raid5_remove_disk(mddev_t *mddev, int number) 4615{ 4616 raid5_conf_t *conf = mddev->private; 4617 int err = 0; 4618 mdk_rdev_t *rdev; 4619 struct disk_info *p = conf->disks + number; 4620 4621 print_raid5_conf(conf); 4622 rdev = p->rdev; 4623 if (rdev) { 4624 if (test_bit(In_sync, &rdev->flags) || 4625 atomic_read(&rdev->nr_pending)) { 4626 err = -EBUSY; 4627 goto abort; 4628 } 4629 /* Only remove non-faulty devices if recovery 4630 * isn't possible. 4631 */ 4632 if (!test_bit(Faulty, &rdev->flags) && 4633 mddev->degraded <= conf->max_degraded) { 4634 err = -EBUSY; 4635 goto abort; 4636 } 4637 p->rdev = NULL; 4638 synchronize_rcu(); 4639 if (atomic_read(&rdev->nr_pending)) { 4640 /* lost the race, try later */ 4641 err = -EBUSY; 4642 p->rdev = rdev; 4643 } 4644 } 4645abort: 4646 4647 print_raid5_conf(conf); 4648 return err; 4649} 4650 4651static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4652{ 4653 raid5_conf_t *conf = mddev->private; 4654 int err = -EEXIST; 4655 int disk; 4656 struct disk_info *p; 4657 int first = 0; 4658 int last = conf->raid_disks - 1; 4659 4660 if (mddev->degraded > conf->max_degraded) 4661 /* no point adding a device */ 4662 return -EINVAL; 4663 4664 if (rdev->raid_disk >= 0) 4665 first = last = rdev->raid_disk; 4666 4667 /* 4668 * find the disk ... but prefer rdev->saved_raid_disk 4669 * if possible. 4670 */ 4671 if (rdev->saved_raid_disk >= 0 && 4672 rdev->saved_raid_disk >= first && 4673 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4674 disk = rdev->saved_raid_disk; 4675 else 4676 disk = first; 4677 for ( ; disk <= last ; disk++) 4678 if ((p=conf->disks + disk)->rdev == NULL) { 4679 clear_bit(In_sync, &rdev->flags); 4680 rdev->raid_disk = disk; 4681 err = 0; 4682 if (rdev->saved_raid_disk != disk) 4683 conf->fullsync = 1; 4684 rcu_assign_pointer(p->rdev, rdev); 4685 break; 4686 } 4687 print_raid5_conf(conf); 4688 return err; 4689} 4690 4691static int raid5_resize(mddev_t *mddev, sector_t sectors) 4692{ 4693 /* no resync is happening, and there is enough space 4694 * on all devices, so we can resize. 4695 * We need to make sure resync covers any new space. 
4696 * If the array is shrinking we should possibly wait until 4697 * any io in the removed space completes, but it hardly seems 4698 * worth it. 4699 */ 4700 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4701 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 4702 mddev->raid_disks)); 4703 set_capacity(mddev->gendisk, mddev->array_sectors); 4704 mddev->changed = 1; 4705 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 4706 mddev->recovery_cp = mddev->dev_sectors; 4707 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4708 } 4709 mddev->dev_sectors = sectors; 4710 mddev->resync_max_sectors = sectors; 4711 return 0; 4712} 4713 4714#ifdef CONFIG_MD_RAID5_RESHAPE 4715static int raid5_check_reshape(mddev_t *mddev) 4716{ 4717 raid5_conf_t *conf = mddev_to_conf(mddev); 4718 int err; 4719 4720 if (mddev->delta_disks < 0 || 4721 mddev->new_level != mddev->level) 4722 return -EINVAL; /* Cannot shrink array or change level yet */ 4723 if (mddev->delta_disks == 0) 4724 return 0; /* nothing to do */ 4725 if (mddev->bitmap) 4726 /* Cannot grow a bitmap yet */ 4727 return -EBUSY; 4728 4729 /* Can only proceed if there are plenty of stripe_heads. 4730 * We need a minimum of one full stripe,, and for sensible progress 4731 * it is best to have about 4 times that. 4732 * If we require 4 times, then the default 256 4K stripe_heads will 4733 * allow for chunk sizes up to 256K, which is probably OK. 4734 * If the chunk size is greater, user-space should request more 4735 * stripe_heads first. 4736 */ 4737 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || 4738 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { 4739 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 4740 (mddev->chunk_size / STRIPE_SIZE)*4); 4741 return -ENOSPC; 4742 } 4743 4744 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 4745 if (err) 4746 return err; 4747 4748 if (mddev->degraded > conf->max_degraded) 4749 return -EINVAL; 4750 /* looks like we might be able to manage this */ 4751 return 0; 4752} 4753 4754static int raid5_start_reshape(mddev_t *mddev) 4755{ 4756 raid5_conf_t *conf = mddev_to_conf(mddev); 4757 mdk_rdev_t *rdev; 4758 int spares = 0; 4759 int added_devices = 0; 4760 unsigned long flags; 4761 4762 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4763 return -EBUSY; 4764 4765 list_for_each_entry(rdev, &mddev->disks, same_set) 4766 if (rdev->raid_disk < 0 && 4767 !test_bit(Faulty, &rdev->flags)) 4768 spares++; 4769 4770 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 4771 /* Not enough devices even to make a degraded array 4772 * of that size 4773 */ 4774 return -EINVAL; 4775 4776 atomic_set(&conf->reshape_stripes, 0); 4777 spin_lock_irq(&conf->device_lock); 4778 conf->previous_raid_disks = conf->raid_disks; 4779 conf->raid_disks += mddev->delta_disks; 4780 conf->expand_progress = 0; 4781 conf->expand_lo = 0; 4782 spin_unlock_irq(&conf->device_lock); 4783 4784 /* Add some new drives, as many as will fit. 4785 * We know there are enough to make the newly sized array work. 
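 *
 * For example, growing from 4 to 6 devices (delta_disks == 2) with one
 * spare and a currently fully-working array passes the check above,
 * since spares - degraded = 1 is not less than
 * delta_disks - max_degraded = 1; after that spare is activated below,
 * mddev->degraded becomes (6 - 4) - 1 = 1, so the grown array runs
 * degraded by one device until a further spare can be added and
 * recovered.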
4786 */ 4787 list_for_each_entry(rdev, &mddev->disks, same_set) 4788 if (rdev->raid_disk < 0 && 4789 !test_bit(Faulty, &rdev->flags)) { 4790 if (raid5_add_disk(mddev, rdev) == 0) { 4791 char nm[20]; 4792 set_bit(In_sync, &rdev->flags); 4793 added_devices++; 4794 rdev->recovery_offset = 0; 4795 sprintf(nm, "rd%d", rdev->raid_disk); 4796 if (sysfs_create_link(&mddev->kobj, 4797 &rdev->kobj, nm)) 4798 printk(KERN_WARNING 4799 "raid5: failed to create " 4800 " link %s for %s\n", 4801 nm, mdname(mddev)); 4802 } else 4803 break; 4804 } 4805 4806 spin_lock_irqsave(&conf->device_lock, flags); 4807 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; 4808 spin_unlock_irqrestore(&conf->device_lock, flags); 4809 mddev->raid_disks = conf->raid_disks; 4810 mddev->reshape_position = 0; 4811 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4812 4813 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4814 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4815 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4816 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4817 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4818 "%s_reshape"); 4819 if (!mddev->sync_thread) { 4820 mddev->recovery = 0; 4821 spin_lock_irq(&conf->device_lock); 4822 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 4823 conf->expand_progress = MaxSector; 4824 spin_unlock_irq(&conf->device_lock); 4825 return -EAGAIN; 4826 } 4827 md_wakeup_thread(mddev->sync_thread); 4828 md_new_event(mddev); 4829 return 0; 4830} 4831#endif 4832 4833static void end_reshape(raid5_conf_t *conf) 4834{ 4835 struct block_device *bdev; 4836 4837 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4838 mddev_t *mddev = conf->mddev; 4839 4840 md_set_array_sectors(mddev, raid5_size(mddev, 0, 4841 conf->raid_disks)); 4842 set_capacity(mddev->gendisk, mddev->array_sectors); 4843 mddev->changed = 1; 4844 conf->previous_raid_disks = conf->raid_disks; 4845 4846 bdev = bdget_disk(conf->mddev->gendisk, 0); 4847 if (bdev) { 4848 mutex_lock(&bdev->bd_inode->i_mutex); 4849 i_size_write(bdev->bd_inode, 4850 (loff_t)conf->mddev->array_sectors << 9); 4851 mutex_unlock(&bdev->bd_inode->i_mutex); 4852 bdput(bdev); 4853 } 4854 spin_lock_irq(&conf->device_lock); 4855 conf->expand_progress = MaxSector; 4856 spin_unlock_irq(&conf->device_lock); 4857 conf->mddev->reshape_position = MaxSector; 4858 4859 /* read-ahead size must cover two whole stripes, which is 4860 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 4861 */ 4862 { 4863 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4864 int stripe = data_disks * 4865 (conf->mddev->chunk_size / PAGE_SIZE); 4866 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4867 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4868 } 4869 } 4870} 4871 4872static void raid5_quiesce(mddev_t *mddev, int state) 4873{ 4874 raid5_conf_t *conf = mddev_to_conf(mddev); 4875 4876 switch(state) { 4877 case 2: /* resume for a suspend */ 4878 wake_up(&conf->wait_for_overlap); 4879 break; 4880 4881 case 1: /* stop all writes */ 4882 spin_lock_irq(&conf->device_lock); 4883 conf->quiesce = 1; 4884 wait_event_lock_irq(conf->wait_for_stripe, 4885 atomic_read(&conf->active_stripes) == 0 && 4886 atomic_read(&conf->active_aligned_reads) == 0, 4887 conf->device_lock, /* nothing */); 4888 spin_unlock_irq(&conf->device_lock); 4889 break; 4890 4891 case 0: /* re-enable writes */ 4892 spin_lock_irq(&conf->device_lock); 4893 conf->quiesce = 0; 4894 
wake_up(&conf->wait_for_stripe); 4895 wake_up(&conf->wait_for_overlap); 4896 spin_unlock_irq(&conf->device_lock); 4897 break; 4898 } 4899} 4900 4901 4902static void *raid5_takeover_raid1(mddev_t *mddev) 4903{ 4904 int chunksect; 4905 4906 if (mddev->raid_disks != 2 || 4907 mddev->degraded > 1) 4908 return ERR_PTR(-EINVAL); 4909 4910 /* Should check if there are write-behind devices? */ 4911 4912 chunksect = 64*2; /* 64K by default */ 4913 4914 /* The array must be an exact multiple of chunksize */ 4915 while (chunksect && (mddev->array_sectors & (chunksect-1))) 4916 chunksect >>= 1; 4917 4918 if ((chunksect<<9) < STRIPE_SIZE) 4919 /* array size does not allow a suitable chunk size */ 4920 return ERR_PTR(-EINVAL); 4921 4922 mddev->new_level = 5; 4923 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 4924 mddev->new_chunk = chunksect << 9; 4925 4926 return setup_conf(mddev); 4927} 4928 4929static void *raid5_takeover_raid6(mddev_t *mddev) 4930{ 4931 int new_layout; 4932 4933 switch (mddev->layout) { 4934 case ALGORITHM_LEFT_ASYMMETRIC_6: 4935 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 4936 break; 4937 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4938 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 4939 break; 4940 case ALGORITHM_LEFT_SYMMETRIC_6: 4941 new_layout = ALGORITHM_LEFT_SYMMETRIC; 4942 break; 4943 case ALGORITHM_RIGHT_SYMMETRIC_6: 4944 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 4945 break; 4946 case ALGORITHM_PARITY_0_6: 4947 new_layout = ALGORITHM_PARITY_0; 4948 break; 4949 case ALGORITHM_PARITY_N: 4950 new_layout = ALGORITHM_PARITY_N; 4951 break; 4952 default: 4953 return ERR_PTR(-EINVAL); 4954 } 4955 mddev->new_level = 5; 4956 mddev->new_layout = new_layout; 4957 mddev->delta_disks = -1; 4958 mddev->raid_disks -= 1; 4959 return setup_conf(mddev); 4960} 4961 4962 4963static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) 4964{ 4965 /* Currently the layout and chunk size can only be changed 4966 * for a 2-drive raid array, as in that case no data shuffling 4967 * is required. 4968 * Later we might validate these and set new_* so a reshape 4969 * can complete the change. 4970 */ 4971 raid5_conf_t *conf = mddev_to_conf(mddev); 4972 4973 if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) 4974 return -EINVAL; 4975 if (new_chunk > 0) { 4976 if (new_chunk & (new_chunk-1)) 4977 /* not a power of 2 */ 4978 return -EINVAL; 4979 if (new_chunk < PAGE_SIZE) 4980 return -EINVAL; 4981 if (mddev->array_sectors & ((new_chunk>>9)-1)) 4982 /* not factor of array size */ 4983 return -EINVAL; 4984 } 4985 4986 /* They look valid */ 4987 4988 if (mddev->raid_disks != 2) 4989 return -EINVAL; 4990 4991 if (new_layout >= 0) { 4992 conf->algorithm = new_layout; 4993 mddev->layout = mddev->new_layout = new_layout; 4994 } 4995 if (new_chunk > 0) { 4996 conf->chunk_size = new_chunk; 4997 mddev->chunk_size = mddev->new_chunk = new_chunk; 4998 } 4999 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5000 md_wakeup_thread(mddev->thread); 5001 return 0; 5002} 5003 5004static void *raid5_takeover(mddev_t *mddev) 5005{ 5006 /* raid5 can take over: 5007 * raid0 - if all devices are the same - make it a raid4 layout 5008 * raid1 - if there are two drives. We need to know the chunk size 5009 * raid4 - trivial - just use a raid4 layout. 
5010 * raid6 - Providing it is a *_6 layout 5011 * 5012 * For now, just do raid1 5013 */ 5014 5015 if (mddev->level == 1) 5016 return raid5_takeover_raid1(mddev); 5017 if (mddev->level == 4) { 5018 mddev->new_layout = ALGORITHM_PARITY_N; 5019 mddev->new_level = 5; 5020 return setup_conf(mddev); 5021 } 5022 if (mddev->level == 6) 5023 return raid5_takeover_raid6(mddev); 5024 5025 return ERR_PTR(-EINVAL); 5026} 5027 5028 5029static struct mdk_personality raid5_personality; 5030 5031static void *raid6_takeover(mddev_t *mddev) 5032{ 5033 /* Currently can only take over a raid5. We map the 5034 * personality to an equivalent raid6 personality 5035 * with the Q block at the end. 5036 */ 5037 int new_layout; 5038 5039 if (mddev->pers != &raid5_personality) 5040 return ERR_PTR(-EINVAL); 5041 if (mddev->degraded > 1) 5042 return ERR_PTR(-EINVAL); 5043 if (mddev->raid_disks > 253) 5044 return ERR_PTR(-EINVAL); 5045 if (mddev->raid_disks < 3) 5046 return ERR_PTR(-EINVAL); 5047 5048 switch (mddev->layout) { 5049 case ALGORITHM_LEFT_ASYMMETRIC: 5050 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 5051 break; 5052 case ALGORITHM_RIGHT_ASYMMETRIC: 5053 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 5054 break; 5055 case ALGORITHM_LEFT_SYMMETRIC: 5056 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 5057 break; 5058 case ALGORITHM_RIGHT_SYMMETRIC: 5059 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 5060 break; 5061 case ALGORITHM_PARITY_0: 5062 new_layout = ALGORITHM_PARITY_0_6; 5063 break; 5064 case ALGORITHM_PARITY_N: 5065 new_layout = ALGORITHM_PARITY_N; 5066 break; 5067 default: 5068 return ERR_PTR(-EINVAL); 5069 } 5070 mddev->new_level = 6; 5071 mddev->new_layout = new_layout; 5072 mddev->delta_disks = 1; 5073 mddev->raid_disks += 1; 5074 return setup_conf(mddev); 5075} 5076 5077 5078static struct mdk_personality raid6_personality = 5079{ 5080 .name = "raid6", 5081 .level = 6, 5082 .owner = THIS_MODULE, 5083 .make_request = make_request, 5084 .run = run, 5085 .stop = stop, 5086 .status = status, 5087 .error_handler = error, 5088 .hot_add_disk = raid5_add_disk, 5089 .hot_remove_disk= raid5_remove_disk, 5090 .spare_active = raid5_spare_active, 5091 .sync_request = sync_request, 5092 .resize = raid5_resize, 5093 .size = raid5_size, 5094#ifdef CONFIG_MD_RAID5_RESHAPE 5095 .check_reshape = raid5_check_reshape, 5096 .start_reshape = raid5_start_reshape, 5097#endif 5098 .quiesce = raid5_quiesce, 5099 .takeover = raid6_takeover, 5100}; 5101static struct mdk_personality raid5_personality = 5102{ 5103 .name = "raid5", 5104 .level = 5, 5105 .owner = THIS_MODULE, 5106 .make_request = make_request, 5107 .run = run, 5108 .stop = stop, 5109 .status = status, 5110 .error_handler = error, 5111 .hot_add_disk = raid5_add_disk, 5112 .hot_remove_disk= raid5_remove_disk, 5113 .spare_active = raid5_spare_active, 5114 .sync_request = sync_request, 5115 .resize = raid5_resize, 5116 .size = raid5_size, 5117#ifdef CONFIG_MD_RAID5_RESHAPE 5118 .check_reshape = raid5_check_reshape, 5119 .start_reshape = raid5_start_reshape, 5120#endif 5121 .quiesce = raid5_quiesce, 5122 .takeover = raid5_takeover, 5123 .reconfig = raid5_reconfig, 5124}; 5125 5126static struct mdk_personality raid4_personality = 5127{ 5128 .name = "raid4", 5129 .level = 4, 5130 .owner = THIS_MODULE, 5131 .make_request = make_request, 5132 .run = run, 5133 .stop = stop, 5134 .status = status, 5135 .error_handler = error, 5136 .hot_add_disk = raid5_add_disk, 5137 .hot_remove_disk= raid5_remove_disk, 5138 .spare_active = raid5_spare_active, 5139 .sync_request = sync_request, 5140 .resize 
= raid5_resize, 5141 .size = raid5_size, 5142#ifdef CONFIG_MD_RAID5_RESHAPE 5143 .check_reshape = raid5_check_reshape, 5144 .start_reshape = raid5_start_reshape, 5145#endif 5146 .quiesce = raid5_quiesce, 5147}; 5148 5149static int __init raid5_init(void) 5150{ 5151 int e; 5152 5153 e = raid6_select_algo(); 5154 if ( e ) 5155 return e; 5156 register_md_personality(&raid6_personality); 5157 register_md_personality(&raid5_personality); 5158 register_md_personality(&raid4_personality); 5159 return 0; 5160} 5161 5162static void raid5_exit(void) 5163{ 5164 unregister_md_personality(&raid6_personality); 5165 unregister_md_personality(&raid5_personality); 5166 unregister_md_personality(&raid4_personality); 5167} 5168 5169module_init(raid5_init); 5170module_exit(raid5_exit); 5171MODULE_LICENSE("GPL"); 5172MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5173MODULE_ALIAS("md-raid5"); 5174MODULE_ALIAS("md-raid4"); 5175MODULE_ALIAS("md-level-5"); 5176MODULE_ALIAS("md-level-4"); 5177MODULE_ALIAS("md-personality-8"); /* RAID6 */ 5178MODULE_ALIAS("md-raid6"); 5179MODULE_ALIAS("md-level-6"); 5180 5181/* This used to be two separate modules, they were: */ 5182MODULE_ALIAS("raid5"); 5183MODULE_ALIAS("raid6"); 5184