raid5.c revision cc371e66e340f35eed8dc4651c7c18e754c7fb26
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 *	   Copyright (C) 1999, 2000 Ingo Molnar
5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches.  Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
32 *    new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 *   we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment seq_flush, thus closing the current
39 *   batch.
40 * When we notice that seq_flush > seq_write, we write out all pending updates
41 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
45
46#include <linux/module.h>
47#include <linux/slab.h>
48#include <linux/highmem.h>
49#include <linux/bitops.h>
50#include <linux/kthread.h>
51#include <asm/atomic.h>
52#include "raid6.h"
53
54#include <linux/raid/bitmap.h>
55#include <linux/async_tx.h>
56
57/*
58 * Stripe cache
59 */
60
61#define NR_STRIPES		256
62#define STRIPE_SIZE		PAGE_SIZE
63#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
64#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
65#define	IO_THRESHOLD		1
66#define BYPASS_THRESHOLD	1
67#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
68#define HASH_MASK		(NR_HASH - 1)
69
70#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
71
72/* bio's attached to a stripe+device for I/O are linked together in bi_sector
73 * order without overlap.  There may be several bio's per stripe+device, and
74 * a bio could span several devices.
75 * When walking this list for a particular stripe+device, we must never proceed
76 * beyond a bio that extends past this device, as the next bio might no longer
77 * be valid.
78 * This macro is used to determine the 'next' bio in the list, given the sector
79 * of the current stripe+device
80 */
81#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
82/*
83 * The following can be used to debug the driver
84 */
85#define RAID5_PARANOIA	1
86#if RAID5_PARANOIA && defined(CONFIG_SMP)
87# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
88#else
89# define CHECK_DEVLOCK()
90#endif
91
92#ifdef DEBUG
93#define inline
94#define __inline__
95#endif
96
97#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
98
99#if !RAID6_USE_EMPTY_ZERO_PAGE
100/* In .bss so it's zeroed */
101const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
102#endif
103
104static inline int raid6_next_disk(int disk, int raid_disks)
105{
106	disk++;
107	return (disk < raid_disks) ? disk : 0;
108}
109
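/*
 * Complete a chain of bios back to their owners: each bio is unlinked
 * from the chain and its bi_end_io is called with 0 or -EIO depending
 * on BIO_UPTODATE.
 */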
110static void return_io(struct bio *return_bi)
111{
112	struct bio *bi = return_bi;
113	while (bi) {
114
115		return_bi = bi->bi_next;
116		bi->bi_next = NULL;
117		bi->bi_size = 0;
118		bi->bi_end_io(bi,
119			      test_bit(BIO_UPTODATE, &bi->bi_flags)
120			        ? 0 : -EIO);
121		bi = return_bi;
122	}
123}
124
125static void print_raid5_conf (raid5_conf_t *conf);
126
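/*
 * Drop a reference to a stripe (conf->device_lock must be held).  When
 * the count reaches zero the stripe is moved to the appropriate list:
 * delayed_list or bitmap_list if it must wait, handle_list if it still
 * needs service, otherwise back to inactive_list for reuse (waking any
 * waiters on wait_for_stripe).
 */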
127static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
128{
129	if (atomic_dec_and_test(&sh->count)) {
130		BUG_ON(!list_empty(&sh->lru));
131		BUG_ON(atomic_read(&conf->active_stripes)==0);
132		if (test_bit(STRIPE_HANDLE, &sh->state)) {
133			if (test_bit(STRIPE_DELAYED, &sh->state)) {
134				list_add_tail(&sh->lru, &conf->delayed_list);
135				blk_plug_device(conf->mddev->queue);
136			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
137				   sh->bm_seq - conf->seq_write > 0) {
138				list_add_tail(&sh->lru, &conf->bitmap_list);
139				blk_plug_device(conf->mddev->queue);
140			} else {
141				clear_bit(STRIPE_BIT_DELAY, &sh->state);
142				list_add_tail(&sh->lru, &conf->handle_list);
143			}
144			md_wakeup_thread(conf->mddev->thread);
145		} else {
146			BUG_ON(sh->ops.pending);
147			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
148				atomic_dec(&conf->preread_active_stripes);
149				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
150					md_wakeup_thread(conf->mddev->thread);
151			}
152			atomic_dec(&conf->active_stripes);
153			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
154				list_add_tail(&sh->lru, &conf->inactive_list);
155				wake_up(&conf->wait_for_stripe);
156				if (conf->retry_read_aligned)
157					md_wakeup_thread(conf->mddev->thread);
158			}
159		}
160	}
161}
162static void release_stripe(struct stripe_head *sh)
163{
164	raid5_conf_t *conf = sh->raid_conf;
165	unsigned long flags;
166
167	spin_lock_irqsave(&conf->device_lock, flags);
168	__release_stripe(conf, sh);
169	spin_unlock_irqrestore(&conf->device_lock, flags);
170}
171
172static inline void remove_hash(struct stripe_head *sh)
173{
174	pr_debug("remove_hash(), stripe %llu\n",
175		(unsigned long long)sh->sector);
176
177	hlist_del_init(&sh->hash);
178}
179
180static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
181{
182	struct hlist_head *hp = stripe_hash(conf, sh->sector);
183
184	pr_debug("insert_hash(), stripe %llu\n",
185		(unsigned long long)sh->sector);
186
187	CHECK_DEVLOCK();
188	hlist_add_head(&sh->hash, hp);
189}
190
191
192/* find an idle stripe, make sure it is unhashed, and return it. */
193static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
194{
195	struct stripe_head *sh = NULL;
196	struct list_head *first;
197
198	CHECK_DEVLOCK();
199	if (list_empty(&conf->inactive_list))
200		goto out;
201	first = conf->inactive_list.next;
202	sh = list_entry(first, struct stripe_head, lru);
203	list_del_init(first);
204	remove_hash(sh);
205	atomic_inc(&conf->active_stripes);
206out:
207	return sh;
208}
209
210static void shrink_buffers(struct stripe_head *sh, int num)
211{
212	struct page *p;
213	int i;
214
215	for (i=0; i<num ; i++) {
216		p = sh->dev[i].page;
217		if (!p)
218			continue;
219		sh->dev[i].page = NULL;
220		put_page(p);
221	}
222}
223
224static int grow_buffers(struct stripe_head *sh, int num)
225{
226	int i;
227
228	for (i=0; i<num; i++) {
229		struct page *page;
230
231		if (!(page = alloc_page(GFP_KERNEL))) {
232			return 1;
233		}
234		sh->dev[i].page = page;
235	}
236	return 0;
237}
238
239static void raid5_build_block (struct stripe_head *sh, int i);
240
241static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
242{
243	raid5_conf_t *conf = sh->raid_conf;
244	int i;
245
246	BUG_ON(atomic_read(&sh->count) != 0);
247	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
248	BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
249
250	CHECK_DEVLOCK();
251	pr_debug("init_stripe called, stripe %llu\n",
252		(unsigned long long)sh->sector);
253
254	remove_hash(sh);
255
256	sh->sector = sector;
257	sh->pd_idx = pd_idx;
258	sh->state = 0;
259
260	sh->disks = disks;
261
262	for (i = sh->disks; i--; ) {
263		struct r5dev *dev = &sh->dev[i];
264
265		if (dev->toread || dev->read || dev->towrite || dev->written ||
266		    test_bit(R5_LOCKED, &dev->flags)) {
267			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
268			       (unsigned long long)sh->sector, i, dev->toread,
269			       dev->read, dev->towrite, dev->written,
270			       test_bit(R5_LOCKED, &dev->flags));
271			BUG();
272		}
273		dev->flags = 0;
274		raid5_build_block(sh, i);
275	}
276	insert_hash(conf, sh);
277}
278
279static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
280{
281	struct stripe_head *sh;
282	struct hlist_node *hn;
283
284	CHECK_DEVLOCK();
285	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
286	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
287		if (sh->sector == sector && sh->disks == disks)
288			return sh;
289	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
290	return NULL;
291}
292
293static void unplug_slaves(mddev_t *mddev);
294static void raid5_unplug_device(struct request_queue *q);
295
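/*
 * Find or allocate the stripe_head covering 'sector'.  The stripe is
 * looked up in the hash table and, failing that, initialised from the
 * inactive list.  Unless 'noblock' is set this may sleep until the
 * array is unquiesced and an inactive stripe becomes available.
 */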
296static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
297					     int pd_idx, int noblock)
298{
299	struct stripe_head *sh;
300
301	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
302
303	spin_lock_irq(&conf->device_lock);
304
305	do {
306		wait_event_lock_irq(conf->wait_for_stripe,
307				    conf->quiesce == 0,
308				    conf->device_lock, /* nothing */);
309		sh = __find_stripe(conf, sector, disks);
310		if (!sh) {
311			if (!conf->inactive_blocked)
312				sh = get_free_stripe(conf);
313			if (noblock && sh == NULL)
314				break;
315			if (!sh) {
316				conf->inactive_blocked = 1;
317				wait_event_lock_irq(conf->wait_for_stripe,
318						    !list_empty(&conf->inactive_list) &&
319						    (atomic_read(&conf->active_stripes)
320						     < (conf->max_nr_stripes *3/4)
321						     || !conf->inactive_blocked),
322						    conf->device_lock,
323						    raid5_unplug_device(conf->mddev->queue)
324					);
325				conf->inactive_blocked = 0;
326			} else
327				init_stripe(sh, sector, pd_idx, disks);
328		} else {
329			if (atomic_read(&sh->count)) {
330				BUG_ON(!list_empty(&sh->lru));
331			} else {
332				if (!test_bit(STRIPE_HANDLE, &sh->state))
333					atomic_inc(&conf->active_stripes);
334				if (list_empty(&sh->lru) &&
335				    !test_bit(STRIPE_EXPANDING, &sh->state))
336					BUG();
337				list_del_init(&sh->lru);
338			}
339		}
340	} while (sh == NULL);
341
342	if (sh)
343		atomic_inc(&sh->count);
344
345	spin_unlock_irq(&conf->device_lock);
346	return sh;
347}
348
349/* test_and_ack_op() ensures that we only dequeue an operation once */
350#define test_and_ack_op(op, pend) \
351do {							\
352	if (test_bit(op, &sh->ops.pending) &&		\
353		!test_bit(op, &sh->ops.complete)) {	\
354		if (test_and_set_bit(op, &sh->ops.ack)) \
355			clear_bit(op, &pend);		\
356		else					\
357			ack++;				\
358	} else						\
359		clear_bit(op, &pend);			\
360} while (0)
361
362/* find new work to run, do not resubmit work that is already
363 * in flight
364 */
365static unsigned long get_stripe_work(struct stripe_head *sh)
366{
367	unsigned long pending;
368	int ack = 0;
369
370	pending = sh->ops.pending;
371
372	test_and_ack_op(STRIPE_OP_BIOFILL, pending);
373	test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
374	test_and_ack_op(STRIPE_OP_PREXOR, pending);
375	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
376	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
377	test_and_ack_op(STRIPE_OP_CHECK, pending);
378	if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
379		ack++;
380
381	sh->ops.count -= ack;
382	if (unlikely(sh->ops.count < 0)) {
383		printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
384			"ops.complete: %#lx\n", pending, sh->ops.pending,
385			sh->ops.ack, sh->ops.complete);
386		BUG();
387	}
388
389	return pending;
390}
391
392static void
393raid5_end_read_request(struct bio *bi, int error);
394static void
395raid5_end_write_request(struct bio *bi, int error);
396
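/*
 * Issue the pending reads/writes for this stripe: each device flagged
 * R5_Wantread or R5_Wantwrite has its embedded bio (dev->req) pointed
 * at one STRIPE_SIZE page and submitted via generic_make_request().
 * Missing or faulty devices are skipped and the stripe is queued for
 * further handling instead.
 */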
397static void ops_run_io(struct stripe_head *sh)
398{
399	raid5_conf_t *conf = sh->raid_conf;
400	int i, disks = sh->disks;
401
402	might_sleep();
403
404	set_bit(STRIPE_IO_STARTED, &sh->state);
405	for (i = disks; i--; ) {
406		int rw;
407		struct bio *bi;
408		mdk_rdev_t *rdev;
409		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
410			rw = WRITE;
411		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
412			rw = READ;
413		else
414			continue;
415
416		bi = &sh->dev[i].req;
417
418		bi->bi_rw = rw;
419		if (rw == WRITE)
420			bi->bi_end_io = raid5_end_write_request;
421		else
422			bi->bi_end_io = raid5_end_read_request;
423
424		rcu_read_lock();
425		rdev = rcu_dereference(conf->disks[i].rdev);
426		if (rdev && test_bit(Faulty, &rdev->flags))
427			rdev = NULL;
428		if (rdev)
429			atomic_inc(&rdev->nr_pending);
430		rcu_read_unlock();
431
432		if (rdev) {
433			if (test_bit(STRIPE_SYNCING, &sh->state) ||
434				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
435				test_bit(STRIPE_EXPAND_READY, &sh->state))
436				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
437
438			bi->bi_bdev = rdev->bdev;
439			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
440				__func__, (unsigned long long)sh->sector,
441				bi->bi_rw, i);
442			atomic_inc(&sh->count);
443			bi->bi_sector = sh->sector + rdev->data_offset;
444			bi->bi_flags = 1 << BIO_UPTODATE;
445			bi->bi_vcnt = 1;
446			bi->bi_max_vecs = 1;
447			bi->bi_idx = 0;
448			bi->bi_io_vec = &sh->dev[i].vec;
449			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
450			bi->bi_io_vec[0].bv_offset = 0;
451			bi->bi_size = STRIPE_SIZE;
452			bi->bi_next = NULL;
453			if (rw == WRITE &&
454			    test_bit(R5_ReWrite, &sh->dev[i].flags))
455				atomic_add(STRIPE_SECTORS,
456					&rdev->corrected_errors);
457			generic_make_request(bi);
458		} else {
459			if (rw == WRITE)
460				set_bit(STRIPE_DEGRADED, &sh->state);
461			pr_debug("skip op %ld on disc %d for sector %llu\n",
462				bi->bi_rw, i, (unsigned long long)sh->sector);
463			clear_bit(R5_LOCKED, &sh->dev[i].flags);
464			set_bit(STRIPE_HANDLE, &sh->state);
465		}
466	}
467}
468
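/*
 * Copy between a bio and a stripe cache page using the async_tx API,
 * handling bios that only partially overlap the page.  'frombio'
 * selects the direction; the last descriptor is returned so further
 * operations can be chained on it.
 */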
469static struct dma_async_tx_descriptor *
470async_copy_data(int frombio, struct bio *bio, struct page *page,
471	sector_t sector, struct dma_async_tx_descriptor *tx)
472{
473	struct bio_vec *bvl;
474	struct page *bio_page;
475	int i;
476	int page_offset;
477
478	if (bio->bi_sector >= sector)
479		page_offset = (signed)(bio->bi_sector - sector) * 512;
480	else
481		page_offset = (signed)(sector - bio->bi_sector) * -512;
482	bio_for_each_segment(bvl, bio, i) {
483		int len = bio_iovec_idx(bio, i)->bv_len;
484		int clen;
485		int b_offset = 0;
486
487		if (page_offset < 0) {
488			b_offset = -page_offset;
489			page_offset += b_offset;
490			len -= b_offset;
491		}
492
493		if (len > 0 && page_offset + len > STRIPE_SIZE)
494			clen = STRIPE_SIZE - page_offset;
495		else
496			clen = len;
497
498		if (clen > 0) {
499			b_offset += bio_iovec_idx(bio, i)->bv_offset;
500			bio_page = bio_iovec_idx(bio, i)->bv_page;
501			if (frombio)
502				tx = async_memcpy(page, bio_page, page_offset,
503					b_offset, clen,
504					ASYNC_TX_DEP_ACK,
505					tx, NULL, NULL);
506			else
507				tx = async_memcpy(bio_page, page, b_offset,
508					page_offset, clen,
509					ASYNC_TX_DEP_ACK,
510					tx, NULL, NULL);
511		}
512		if (clen < len) /* hit end of page */
513			break;
514		page_offset +=  len;
515	}
516
517	return tx;
518}
519
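/*
 * Biofill: satisfy queued read bios straight from up-to-date stripe
 * cache pages.  ops_run_biofill() starts the copies for every device
 * flagged R5_Wantfill; ops_complete_biofill() runs as the async_tx
 * callback and completes the read bios.
 */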
520static void ops_complete_biofill(void *stripe_head_ref)
521{
522	struct stripe_head *sh = stripe_head_ref;
523	struct bio *return_bi = NULL;
524	raid5_conf_t *conf = sh->raid_conf;
525	int i;
526
527	pr_debug("%s: stripe %llu\n", __func__,
528		(unsigned long long)sh->sector);
529
530	/* clear completed biofills */
531	for (i = sh->disks; i--; ) {
532		struct r5dev *dev = &sh->dev[i];
533
534		/* acknowledge completion of a biofill operation */
535		/* and check if we need to reply to a read request,
536		 * new R5_Wantfill requests are held off until
537		 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
538		 */
539		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
540			struct bio *rbi, *rbi2;
541
542			/* The access to dev->read is outside of the
543			 * spin_lock_irq(&conf->device_lock), but is protected
544			 * by the STRIPE_OP_BIOFILL pending bit
545			 */
546			BUG_ON(!dev->read);
547			rbi = dev->read;
548			dev->read = NULL;
549			while (rbi && rbi->bi_sector <
550				dev->sector + STRIPE_SECTORS) {
551				rbi2 = r5_next_bio(rbi, dev->sector);
552				spin_lock_irq(&conf->device_lock);
553				if (--rbi->bi_phys_segments == 0) {
554					rbi->bi_next = return_bi;
555					return_bi = rbi;
556				}
557				spin_unlock_irq(&conf->device_lock);
558				rbi = rbi2;
559			}
560		}
561	}
562	set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
563
564	return_io(return_bi);
565
566	set_bit(STRIPE_HANDLE, &sh->state);
567	release_stripe(sh);
568}
569
570static void ops_run_biofill(struct stripe_head *sh)
571{
572	struct dma_async_tx_descriptor *tx = NULL;
573	raid5_conf_t *conf = sh->raid_conf;
574	int i;
575
576	pr_debug("%s: stripe %llu\n", __func__,
577		(unsigned long long)sh->sector);
578
579	for (i = sh->disks; i--; ) {
580		struct r5dev *dev = &sh->dev[i];
581		if (test_bit(R5_Wantfill, &dev->flags)) {
582			struct bio *rbi;
583			spin_lock_irq(&conf->device_lock);
584			dev->read = rbi = dev->toread;
585			dev->toread = NULL;
586			spin_unlock_irq(&conf->device_lock);
587			while (rbi && rbi->bi_sector <
588				dev->sector + STRIPE_SECTORS) {
589				tx = async_copy_data(0, rbi, dev->page,
590					dev->sector, tx);
591				rbi = r5_next_bio(rbi, dev->sector);
592			}
593		}
594	}
595
596	atomic_inc(&sh->count);
597	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
598		ops_complete_biofill, sh);
599}
600
601static void ops_complete_compute5(void *stripe_head_ref)
602{
603	struct stripe_head *sh = stripe_head_ref;
604	int target = sh->ops.target;
605	struct r5dev *tgt = &sh->dev[target];
606
607	pr_debug("%s: stripe %llu\n", __func__,
608		(unsigned long long)sh->sector);
609
610	set_bit(R5_UPTODATE, &tgt->flags);
611	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
612	clear_bit(R5_Wantcompute, &tgt->flags);
613	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
614	set_bit(STRIPE_HANDLE, &sh->state);
615	release_stripe(sh);
616}
617
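/*
 * Rebuild the block at sh->ops.target by xor-ing together every other
 * block in the stripe (RAID-5 single-failure reconstruction), using
 * async_xor (or async_memcpy when only one source remains).
 */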
618static struct dma_async_tx_descriptor *
619ops_run_compute5(struct stripe_head *sh, unsigned long pending)
620{
621	/* kernel stack size limits the total number of disks */
622	int disks = sh->disks;
623	struct page *xor_srcs[disks];
624	int target = sh->ops.target;
625	struct r5dev *tgt = &sh->dev[target];
626	struct page *xor_dest = tgt->page;
627	int count = 0;
628	struct dma_async_tx_descriptor *tx;
629	int i;
630
631	pr_debug("%s: stripe %llu block: %d\n",
632		__func__, (unsigned long long)sh->sector, target);
633	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
634
635	for (i = disks; i--; )
636		if (i != target)
637			xor_srcs[count++] = sh->dev[i].page;
638
639	atomic_inc(&sh->count);
640
641	if (unlikely(count == 1))
642		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
643			0, NULL, ops_complete_compute5, sh);
644	else
645		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
646			ASYNC_TX_XOR_ZERO_DST, NULL,
647			ops_complete_compute5, sh);
648
649	/* ack now if postxor is not set to be run */
650	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
651		async_tx_ack(tx);
652
653	return tx;
654}
655
656static void ops_complete_prexor(void *stripe_head_ref)
657{
658	struct stripe_head *sh = stripe_head_ref;
659
660	pr_debug("%s: stripe %llu\n", __func__,
661		(unsigned long long)sh->sector);
662
663	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
664}
665
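/*
 * Read-modify-write step 1: xor the old contents of the blocks about
 * to be rewritten (R5_Wantprexor) into the existing parity block,
 * effectively subtracting them from the parity.
 */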
666static struct dma_async_tx_descriptor *
667ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
668{
669	/* kernel stack size limits the total number of disks */
670	int disks = sh->disks;
671	struct page *xor_srcs[disks];
672	int count = 0, pd_idx = sh->pd_idx, i;
673
674	/* existing parity data subtracted */
675	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
676
677	pr_debug("%s: stripe %llu\n", __func__,
678		(unsigned long long)sh->sector);
679
680	for (i = disks; i--; ) {
681		struct r5dev *dev = &sh->dev[i];
682		/* Only process blocks that are known to be uptodate */
683		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
684			xor_srcs[count++] = dev->page;
685	}
686
687	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
688		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
689		ops_complete_prexor, sh);
690
691	return tx;
692}
693
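/*
 * Copy ("drain") queued write bios into the stripe cache, moving each
 * chosen dev->towrite chain to dev->written.  When a prexor is active
 * only the read-modify-write blocks are drained.
 */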
694static struct dma_async_tx_descriptor *
695ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
696		 unsigned long pending)
697{
698	int disks = sh->disks;
699	int pd_idx = sh->pd_idx, i;
700
701	/* check if prexor is active which means only process blocks
702	 * that are part of a read-modify-write (Wantprexor)
703	 */
704	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
705
706	pr_debug("%s: stripe %llu\n", __func__,
707		(unsigned long long)sh->sector);
708
709	for (i = disks; i--; ) {
710		struct r5dev *dev = &sh->dev[i];
711		struct bio *chosen;
712		int towrite;
713
714		towrite = 0;
715		if (prexor) { /* rmw */
716			if (dev->towrite &&
717			    test_bit(R5_Wantprexor, &dev->flags))
718				towrite = 1;
719		} else { /* rcw */
720			if (i != pd_idx && dev->towrite &&
721				test_bit(R5_LOCKED, &dev->flags))
722				towrite = 1;
723		}
724
725		if (towrite) {
726			struct bio *wbi;
727
728			spin_lock(&sh->lock);
729			chosen = dev->towrite;
730			dev->towrite = NULL;
731			BUG_ON(dev->written);
732			wbi = dev->written = chosen;
733			spin_unlock(&sh->lock);
734
735			while (wbi && wbi->bi_sector <
736				dev->sector + STRIPE_SECTORS) {
737				tx = async_copy_data(1, wbi, dev->page,
738					dev->sector, tx);
739				wbi = r5_next_bio(wbi, dev->sector);
740			}
741		}
742	}
743
744	return tx;
745}
746
747static void ops_complete_postxor(void *stripe_head_ref)
748{
749	struct stripe_head *sh = stripe_head_ref;
750
751	pr_debug("%s: stripe %llu\n", __func__,
752		(unsigned long long)sh->sector);
753
754	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
755	set_bit(STRIPE_HANDLE, &sh->state);
756	release_stripe(sh);
757}
758
759static void ops_complete_write(void *stripe_head_ref)
760{
761	struct stripe_head *sh = stripe_head_ref;
762	int disks = sh->disks, i, pd_idx = sh->pd_idx;
763
764	pr_debug("%s: stripe %llu\n", __func__,
765		(unsigned long long)sh->sector);
766
767	for (i = disks; i--; ) {
768		struct r5dev *dev = &sh->dev[i];
769		if (dev->written || i == pd_idx)
770			set_bit(R5_UPTODATE, &dev->flags);
771	}
772
773	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
774	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
775
776	set_bit(STRIPE_HANDLE, &sh->state);
777	release_stripe(sh);
778}
779
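/*
 * Generate the new parity block: xor the drained write data into the
 * prexor'd parity for a read-modify-write, or xor all data blocks from
 * scratch for a reconstruct-write.  The completion callback depends on
 * whether this postxor is part of a write (biodrain pending).
 */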
780static void
781ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
782		unsigned long pending)
783{
784	/* kernel stack size limits the total number of disks */
785	int disks = sh->disks;
786	struct page *xor_srcs[disks];
787
788	int count = 0, pd_idx = sh->pd_idx, i;
789	struct page *xor_dest;
790	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
791	unsigned long flags;
792	dma_async_tx_callback callback;
793
794	pr_debug("%s: stripe %llu\n", __func__,
795		(unsigned long long)sh->sector);
796
797	/* check if prexor is active which means only process blocks
798	 * that are part of a read-modify-write (written)
799	 */
800	if (prexor) {
801		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
802		for (i = disks; i--; ) {
803			struct r5dev *dev = &sh->dev[i];
804			if (dev->written)
805				xor_srcs[count++] = dev->page;
806		}
807	} else {
808		xor_dest = sh->dev[pd_idx].page;
809		for (i = disks; i--; ) {
810			struct r5dev *dev = &sh->dev[i];
811			if (i != pd_idx)
812				xor_srcs[count++] = dev->page;
813		}
814	}
815
816	/* check whether this postxor is part of a write */
817	callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
818		ops_complete_write : ops_complete_postxor;
819
820	/* 1/ if we prexor'd then the dest is reused as a source
821	 * 2/ if we did not prexor then we are redoing the parity
822	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
823	 * for the synchronous xor case
824	 */
825	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
826		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
827
828	atomic_inc(&sh->count);
829
830	if (unlikely(count == 1)) {
831		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
832		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
833			flags, tx, callback, sh);
834	} else
835		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
836			flags, tx, callback, sh);
837}
838
839static void ops_complete_check(void *stripe_head_ref)
840{
841	struct stripe_head *sh = stripe_head_ref;
842	int pd_idx = sh->pd_idx;
843
844	pr_debug("%s: stripe %llu\n", __func__,
845		(unsigned long long)sh->sector);
846
847	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
848		sh->ops.zero_sum_result == 0)
849		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
850
851	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
852	set_bit(STRIPE_HANDLE, &sh->state);
853	release_stripe(sh);
854}
855
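/*
 * Verify parity by computing an xor zero-sum over the parity block and
 * all data blocks; the result is examined in ops_complete_check().
 */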
856static void ops_run_check(struct stripe_head *sh)
857{
858	/* kernel stack size limits the total number of disks */
859	int disks = sh->disks;
860	struct page *xor_srcs[disks];
861	struct dma_async_tx_descriptor *tx;
862
863	int count = 0, pd_idx = sh->pd_idx, i;
864	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
865
866	pr_debug("%s: stripe %llu\n", __func__,
867		(unsigned long long)sh->sector);
868
869	for (i = disks; i--; ) {
870		struct r5dev *dev = &sh->dev[i];
871		if (i != pd_idx)
872			xor_srcs[count++] = dev->page;
873	}
874
875	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
876		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
877
878	if (tx)
879		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
880	else
881		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
882
883	atomic_inc(&sh->count);
884	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
885		ops_complete_check, sh);
886}
887
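/*
 * Execute the operations that get_stripe_work() acknowledged, in
 * dependency order (biofill, compute, prexor, biodrain, postxor,
 * check, io), chaining them together via async_tx descriptors.
 */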
888static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
889{
890	int overlap_clear = 0, i, disks = sh->disks;
891	struct dma_async_tx_descriptor *tx = NULL;
892
893	if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
894		ops_run_biofill(sh);
895		overlap_clear++;
896	}
897
898	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
899		tx = ops_run_compute5(sh, pending);
900
901	if (test_bit(STRIPE_OP_PREXOR, &pending))
902		tx = ops_run_prexor(sh, tx);
903
904	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
905		tx = ops_run_biodrain(sh, tx, pending);
906		overlap_clear++;
907	}
908
909	if (test_bit(STRIPE_OP_POSTXOR, &pending))
910		ops_run_postxor(sh, tx, pending);
911
912	if (test_bit(STRIPE_OP_CHECK, &pending))
913		ops_run_check(sh);
914
915	if (test_bit(STRIPE_OP_IO, &pending))
916		ops_run_io(sh);
917
918	if (overlap_clear)
919		for (i = disks; i--; ) {
920			struct r5dev *dev = &sh->dev[i];
921			if (test_and_clear_bit(R5_Overlap, &dev->flags))
922				wake_up(&sh->raid_conf->wait_for_overlap);
923		}
924}
925
926static int grow_one_stripe(raid5_conf_t *conf)
927{
928	struct stripe_head *sh;
929	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
930	if (!sh)
931		return 0;
932	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
933	sh->raid_conf = conf;
934	spin_lock_init(&sh->lock);
935
936	if (grow_buffers(sh, conf->raid_disks)) {
937		shrink_buffers(sh, conf->raid_disks);
938		kmem_cache_free(conf->slab_cache, sh);
939		return 0;
940	}
941	sh->disks = conf->raid_disks;
942	/* we just created an active stripe so... */
943	atomic_set(&sh->count, 1);
944	atomic_inc(&conf->active_stripes);
945	INIT_LIST_HEAD(&sh->lru);
946	release_stripe(sh);
947	return 1;
948}
949
950static int grow_stripes(raid5_conf_t *conf, int num)
951{
952	struct kmem_cache *sc;
953	int devs = conf->raid_disks;
954
955	sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
956	sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
957	conf->active_name = 0;
958	sc = kmem_cache_create(conf->cache_name[conf->active_name],
959			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
960			       0, 0, NULL);
961	if (!sc)
962		return 1;
963	conf->slab_cache = sc;
964	conf->pool_size = devs;
965	while (num--)
966		if (!grow_one_stripe(conf))
967			return 1;
968	return 0;
969}
970
971#ifdef CONFIG_MD_RAID5_RESHAPE
972static int resize_stripes(raid5_conf_t *conf, int newsize)
973{
974	/* Make all the stripes able to hold 'newsize' devices.
975	 * New slots in each stripe get 'page' set to a new page.
976	 *
977	 * This happens in stages:
978	 * 1/ create a new kmem_cache and allocate the required number of
979	 *    stripe_heads.
980 * 2/ gather all the old stripe_heads and transfer the pages across
981	 *    to the new stripe_heads.  This will have the side effect of
982	 *    freezing the array as once all stripe_heads have been collected,
983	 *    no IO will be possible.  Old stripe heads are freed once their
984	 *    pages have been transferred over, and the old kmem_cache is
985	 *    freed when all stripes are done.
986 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
987 *    we simply return a failure status - no need to clean anything up.
988	 * 4/ allocate new pages for the new slots in the new stripe_heads.
989 *    If this fails, we don't bother trying to shrink the
990	 *    stripe_heads down again, we just leave them as they are.
991	 *    As each stripe_head is processed the new one is released into
992	 *    active service.
993	 *
994	 * Once step2 is started, we cannot afford to wait for a write,
995	 * so we use GFP_NOIO allocations.
996	 */
997	struct stripe_head *osh, *nsh;
998	LIST_HEAD(newstripes);
999	struct disk_info *ndisks;
1000	int err = 0;
1001	struct kmem_cache *sc;
1002	int i;
1003
1004	if (newsize <= conf->pool_size)
1005		return 0; /* never bother to shrink */
1006
1007	md_allow_write(conf->mddev);
1008
1009	/* Step 1 */
1010	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1011			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1012			       0, 0, NULL);
1013	if (!sc)
1014		return -ENOMEM;
1015
1016	for (i = conf->max_nr_stripes; i; i--) {
1017		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1018		if (!nsh)
1019			break;
1020
1021		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1022
1023		nsh->raid_conf = conf;
1024		spin_lock_init(&nsh->lock);
1025
1026		list_add(&nsh->lru, &newstripes);
1027	}
1028	if (i) {
1029		/* didn't get enough, give up */
1030		while (!list_empty(&newstripes)) {
1031			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1032			list_del(&nsh->lru);
1033			kmem_cache_free(sc, nsh);
1034		}
1035		kmem_cache_destroy(sc);
1036		return -ENOMEM;
1037	}
1038	/* Step 2 - Must use GFP_NOIO now.
1039	 * OK, we have enough stripes, start collecting inactive
1040	 * stripes and copying them over
1041	 */
1042	list_for_each_entry(nsh, &newstripes, lru) {
1043		spin_lock_irq(&conf->device_lock);
1044		wait_event_lock_irq(conf->wait_for_stripe,
1045				    !list_empty(&conf->inactive_list),
1046				    conf->device_lock,
1047				    unplug_slaves(conf->mddev)
1048			);
1049		osh = get_free_stripe(conf);
1050		spin_unlock_irq(&conf->device_lock);
1051		atomic_set(&nsh->count, 1);
1052		for(i=0; i<conf->pool_size; i++)
1053			nsh->dev[i].page = osh->dev[i].page;
1054		for( ; i<newsize; i++)
1055			nsh->dev[i].page = NULL;
1056		kmem_cache_free(conf->slab_cache, osh);
1057	}
1058	kmem_cache_destroy(conf->slab_cache);
1059
1060	/* Step 3.
1061	 * At this point, we are holding all the stripes so the array
1062	 * is completely stalled, so now is a good time to resize
1063	 * conf->disks.
1064	 */
1065	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1066	if (ndisks) {
1067		for (i=0; i<conf->raid_disks; i++)
1068			ndisks[i] = conf->disks[i];
1069		kfree(conf->disks);
1070		conf->disks = ndisks;
1071	} else
1072		err = -ENOMEM;
1073
1074	/* Step 4, return new stripes to service */
1075	while(!list_empty(&newstripes)) {
1076		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1077		list_del_init(&nsh->lru);
1078		for (i=conf->raid_disks; i < newsize; i++)
1079			if (nsh->dev[i].page == NULL) {
1080				struct page *p = alloc_page(GFP_NOIO);
1081				nsh->dev[i].page = p;
1082				if (!p)
1083					err = -ENOMEM;
1084			}
1085		release_stripe(nsh);
1086	}
1087	/* critical section passed, GFP_NOIO no longer needed */
1088
1089	conf->slab_cache = sc;
1090	conf->active_name = 1-conf->active_name;
1091	conf->pool_size = newsize;
1092	return err;
1093}
1094#endif
1095
1096static int drop_one_stripe(raid5_conf_t *conf)
1097{
1098	struct stripe_head *sh;
1099
1100	spin_lock_irq(&conf->device_lock);
1101	sh = get_free_stripe(conf);
1102	spin_unlock_irq(&conf->device_lock);
1103	if (!sh)
1104		return 0;
1105	BUG_ON(atomic_read(&sh->count));
1106	shrink_buffers(sh, conf->pool_size);
1107	kmem_cache_free(conf->slab_cache, sh);
1108	atomic_dec(&conf->active_stripes);
1109	return 1;
1110}
1111
1112static void shrink_stripes(raid5_conf_t *conf)
1113{
1114	while (drop_one_stripe(conf))
1115		;
1116
1117	if (conf->slab_cache)
1118		kmem_cache_destroy(conf->slab_cache);
1119	conf->slab_cache = NULL;
1120}
1121
1122static void raid5_end_read_request(struct bio * bi, int error)
1123{
1124	struct stripe_head *sh = bi->bi_private;
1125	raid5_conf_t *conf = sh->raid_conf;
1126	int disks = sh->disks, i;
1127	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1128	char b[BDEVNAME_SIZE];
1129	mdk_rdev_t *rdev;
1130
1131
1132	for (i=0 ; i<disks; i++)
1133		if (bi == &sh->dev[i].req)
1134			break;
1135
1136	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1137		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1138		uptodate);
1139	if (i == disks) {
1140		BUG();
1141		return;
1142	}
1143
1144	if (uptodate) {
1145		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1146		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1147			rdev = conf->disks[i].rdev;
1148			printk_rl(KERN_INFO "raid5:%s: read error corrected"
1149				  " (%lu sectors at %llu on %s)\n",
1150				  mdname(conf->mddev), STRIPE_SECTORS,
1151				  (unsigned long long)(sh->sector
1152						       + rdev->data_offset),
1153				  bdevname(rdev->bdev, b));
1154			clear_bit(R5_ReadError, &sh->dev[i].flags);
1155			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1156		}
1157		if (atomic_read(&conf->disks[i].rdev->read_errors))
1158			atomic_set(&conf->disks[i].rdev->read_errors, 0);
1159	} else {
1160		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1161		int retry = 0;
1162		rdev = conf->disks[i].rdev;
1163
1164		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1165		atomic_inc(&rdev->read_errors);
1166		if (conf->mddev->degraded)
1167			printk_rl(KERN_WARNING
1168				  "raid5:%s: read error not correctable "
1169				  "(sector %llu on %s).\n",
1170				  mdname(conf->mddev),
1171				  (unsigned long long)(sh->sector
1172						       + rdev->data_offset),
1173				  bdn);
1174		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1175			/* Oh, no!!! */
1176			printk_rl(KERN_WARNING
1177				  "raid5:%s: read error NOT corrected!! "
1178				  "(sector %llu on %s).\n",
1179				  mdname(conf->mddev),
1180				  (unsigned long long)(sh->sector
1181						       + rdev->data_offset),
1182				  bdn);
1183		else if (atomic_read(&rdev->read_errors)
1184			 > conf->max_nr_stripes)
1185			printk(KERN_WARNING
1186			       "raid5:%s: Too many read errors, failing device %s.\n",
1187			       mdname(conf->mddev), bdn);
1188		else
1189			retry = 1;
1190		if (retry)
1191			set_bit(R5_ReadError, &sh->dev[i].flags);
1192		else {
1193			clear_bit(R5_ReadError, &sh->dev[i].flags);
1194			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1195			md_error(conf->mddev, rdev);
1196		}
1197	}
1198	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1199	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1200	set_bit(STRIPE_HANDLE, &sh->state);
1201	release_stripe(sh);
1202}
1203
1204static void raid5_end_write_request (struct bio *bi, int error)
1205{
1206	struct stripe_head *sh = bi->bi_private;
1207	raid5_conf_t *conf = sh->raid_conf;
1208	int disks = sh->disks, i;
1209	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1210
1211	for (i=0 ; i<disks; i++)
1212		if (bi == &sh->dev[i].req)
1213			break;
1214
1215	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1216		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1217		uptodate);
1218	if (i == disks) {
1219		BUG();
1220		return;
1221	}
1222
1223	if (!uptodate)
1224		md_error(conf->mddev, conf->disks[i].rdev);
1225
1226	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1227
1228	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1229	set_bit(STRIPE_HANDLE, &sh->state);
1230	release_stripe(sh);
1231}
1232
1233
1234static sector_t compute_blocknr(struct stripe_head *sh, int i);
1235
1236static void raid5_build_block (struct stripe_head *sh, int i)
1237{
1238	struct r5dev *dev = &sh->dev[i];
1239
1240	bio_init(&dev->req);
1241	dev->req.bi_io_vec = &dev->vec;
1242	dev->req.bi_vcnt++;
1243	dev->req.bi_max_vecs++;
1244	dev->vec.bv_page = dev->page;
1245	dev->vec.bv_len = STRIPE_SIZE;
1246	dev->vec.bv_offset = 0;
1247
1248	dev->req.bi_sector = sh->sector;
1249	dev->req.bi_private = sh;
1250
1251	dev->flags = 0;
1252	dev->sector = compute_blocknr(sh, i);
1253}
1254
1255static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1256{
1257	char b[BDEVNAME_SIZE];
1258	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1259	pr_debug("raid5: error called\n");
1260
1261	if (!test_bit(Faulty, &rdev->flags)) {
1262		set_bit(MD_CHANGE_DEVS, &mddev->flags);
1263		if (test_and_clear_bit(In_sync, &rdev->flags)) {
1264			unsigned long flags;
1265			spin_lock_irqsave(&conf->device_lock, flags);
1266			mddev->degraded++;
1267			spin_unlock_irqrestore(&conf->device_lock, flags);
1268			/*
1269			 * if recovery was running, make sure it aborts.
1270			 */
1271			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1272		}
1273		set_bit(Faulty, &rdev->flags);
1274		printk (KERN_ALERT
1275			"raid5: Disk failure on %s, disabling device.\n"
1276			"raid5: Operation continuing on %d devices.\n",
1277			bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1278	}
1279}
1280
1281/*
1282 * Input: a 'big' sector number,
1283 * Output: index of the data and parity disk, and the sector # in them.
1284 */
1285static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1286			unsigned int data_disks, unsigned int * dd_idx,
1287			unsigned int * pd_idx, raid5_conf_t *conf)
1288{
1289	long stripe;
1290	unsigned long chunk_number;
1291	unsigned int chunk_offset;
1292	sector_t new_sector;
1293	int sectors_per_chunk = conf->chunk_size >> 9;
1294
1295	/* First compute the information on this sector */
1296
1297	/*
1298	 * Compute the chunk number and the sector offset inside the chunk
1299	 */
1300	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1301	chunk_number = r_sector;
1302	BUG_ON(r_sector != chunk_number);
1303
1304	/*
1305	 * Compute the stripe number
1306	 */
1307	stripe = chunk_number / data_disks;
1308
1309	/*
1310	 * Compute the data disk and parity disk indexes inside the stripe
1311	 */
1312	*dd_idx = chunk_number % data_disks;
1313
1314	/*
1315	 * Select the parity disk based on the user selected algorithm.
1316	 */
1317	switch(conf->level) {
1318	case 4:
1319		*pd_idx = data_disks;
1320		break;
1321	case 5:
1322		switch (conf->algorithm) {
1323		case ALGORITHM_LEFT_ASYMMETRIC:
1324			*pd_idx = data_disks - stripe % raid_disks;
1325			if (*dd_idx >= *pd_idx)
1326				(*dd_idx)++;
1327			break;
1328		case ALGORITHM_RIGHT_ASYMMETRIC:
1329			*pd_idx = stripe % raid_disks;
1330			if (*dd_idx >= *pd_idx)
1331				(*dd_idx)++;
1332			break;
1333		case ALGORITHM_LEFT_SYMMETRIC:
1334			*pd_idx = data_disks - stripe % raid_disks;
1335			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1336			break;
1337		case ALGORITHM_RIGHT_SYMMETRIC:
1338			*pd_idx = stripe % raid_disks;
1339			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1340			break;
1341		default:
1342			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1343				conf->algorithm);
1344		}
1345		break;
1346	case 6:
1347
1348		/**** FIX THIS ****/
1349		switch (conf->algorithm) {
1350		case ALGORITHM_LEFT_ASYMMETRIC:
1351			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
1352			if (*pd_idx == raid_disks-1)
1353				(*dd_idx)++; 	/* Q D D D P */
1354			else if (*dd_idx >= *pd_idx)
1355				(*dd_idx) += 2; /* D D P Q D */
1356			break;
1357		case ALGORITHM_RIGHT_ASYMMETRIC:
1358			*pd_idx = stripe % raid_disks;
1359			if (*pd_idx == raid_disks-1)
1360				(*dd_idx)++; 	/* Q D D D P */
1361			else if (*dd_idx >= *pd_idx)
1362				(*dd_idx) += 2; /* D D P Q D */
1363			break;
1364		case ALGORITHM_LEFT_SYMMETRIC:
1365			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
1366			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1367			break;
1368		case ALGORITHM_RIGHT_SYMMETRIC:
1369			*pd_idx = stripe % raid_disks;
1370			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1371			break;
1372		default:
1373			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
1374				conf->algorithm);
1375		}
1376		break;
1377	}
1378
1379	/*
1380	 * Finally, compute the new sector number
1381	 */
1382	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1383	return new_sector;
1384}
1385
1386
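/*
 * The inverse of raid5_compute_sector(): map a (stripe, device index)
 * pair back to the array sector it holds.  Returns 0 for the parity
 * (and RAID-6 Q) device, and cross-checks the result against the
 * forward mapping.
 */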
1387static sector_t compute_blocknr(struct stripe_head *sh, int i)
1388{
1389	raid5_conf_t *conf = sh->raid_conf;
1390	int raid_disks = sh->disks;
1391	int data_disks = raid_disks - conf->max_degraded;
1392	sector_t new_sector = sh->sector, check;
1393	int sectors_per_chunk = conf->chunk_size >> 9;
1394	sector_t stripe;
1395	int chunk_offset;
1396	int chunk_number, dummy1, dummy2, dd_idx = i;
1397	sector_t r_sector;
1398
1399
1400	chunk_offset = sector_div(new_sector, sectors_per_chunk);
1401	stripe = new_sector;
1402	BUG_ON(new_sector != stripe);
1403
1404	if (i == sh->pd_idx)
1405		return 0;
1406	switch(conf->level) {
1407	case 4: break;
1408	case 5:
1409		switch (conf->algorithm) {
1410		case ALGORITHM_LEFT_ASYMMETRIC:
1411		case ALGORITHM_RIGHT_ASYMMETRIC:
1412			if (i > sh->pd_idx)
1413				i--;
1414			break;
1415		case ALGORITHM_LEFT_SYMMETRIC:
1416		case ALGORITHM_RIGHT_SYMMETRIC:
1417			if (i < sh->pd_idx)
1418				i += raid_disks;
1419			i -= (sh->pd_idx + 1);
1420			break;
1421		default:
1422			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1423			       conf->algorithm);
1424		}
1425		break;
1426	case 6:
1427		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
1428			return 0; /* It is the Q disk */
1429		switch (conf->algorithm) {
1430		case ALGORITHM_LEFT_ASYMMETRIC:
1431		case ALGORITHM_RIGHT_ASYMMETRIC:
1432			if (sh->pd_idx == raid_disks-1)
1433				i--; 	/* Q D D D P */
1434			else if (i > sh->pd_idx)
1435				i -= 2; /* D D P Q D */
1436			break;
1437		case ALGORITHM_LEFT_SYMMETRIC:
1438		case ALGORITHM_RIGHT_SYMMETRIC:
1439			if (sh->pd_idx == raid_disks-1)
1440				i--; /* Q D D D P */
1441			else {
1442				/* D D P Q D */
1443				if (i < sh->pd_idx)
1444					i += raid_disks;
1445				i -= (sh->pd_idx + 2);
1446			}
1447			break;
1448		default:
1449			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
1450				conf->algorithm);
1451		}
1452		break;
1453	}
1454
1455	chunk_number = stripe * data_disks + i;
1456	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1457
1458	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
1459	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
1460		printk(KERN_ERR "compute_blocknr: map not correct\n");
1461		return 0;
1462	}
1463	return r_sector;
1464}
1465
1466
1467
1468/*
1469 * Copy data between a page in the stripe cache, and one or more bion
1470 * The page could align with the middle of the bio, or there could be
1471 * several bion, each with several bio_vecs, which cover part of the page
1472 * Multiple bion are linked together on bi_next.  There may be extras
1473 * at the end of this list.  We ignore them.
1474 */
1475static void copy_data(int frombio, struct bio *bio,
1476		     struct page *page,
1477		     sector_t sector)
1478{
1479	char *pa = page_address(page);
1480	struct bio_vec *bvl;
1481	int i;
1482	int page_offset;
1483
1484	if (bio->bi_sector >= sector)
1485		page_offset = (signed)(bio->bi_sector - sector) * 512;
1486	else
1487		page_offset = (signed)(sector - bio->bi_sector) * -512;
1488	bio_for_each_segment(bvl, bio, i) {
1489		int len = bio_iovec_idx(bio,i)->bv_len;
1490		int clen;
1491		int b_offset = 0;
1492
1493		if (page_offset < 0) {
1494			b_offset = -page_offset;
1495			page_offset += b_offset;
1496			len -= b_offset;
1497		}
1498
1499		if (len > 0 && page_offset + len > STRIPE_SIZE)
1500			clen = STRIPE_SIZE - page_offset;
1501		else clen = len;
1502
1503		if (clen > 0) {
1504			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1505			if (frombio)
1506				memcpy(pa+page_offset, ba+b_offset, clen);
1507			else
1508				memcpy(ba+b_offset, pa+page_offset, clen);
1509			__bio_kunmap_atomic(ba, KM_USER0);
1510		}
1511		if (clen < len) /* hit end of page */
1512			break;
1513		page_offset +=  len;
1514	}
1515}
1516
1517#define check_xor()	do {						  \
1518				if (count == MAX_XOR_BLOCKS) {		  \
1519				xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1520				count = 0;				  \
1521			   }						  \
1522			} while(0)
1523
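/*
 * Synchronously (re)compute the P and Q blocks of a RAID-6 stripe with
 * raid6_call.gen_syndrome().  For RECONSTRUCT_WRITE any queued write
 * bios are first drained into the stripe cache pages.
 */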
1524static void compute_parity6(struct stripe_head *sh, int method)
1525{
1526	raid6_conf_t *conf = sh->raid_conf;
1527	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1528	struct bio *chosen;
1529	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
1530	void *ptrs[disks];
1531
1532	qd_idx = raid6_next_disk(pd_idx, disks);
1533	d0_idx = raid6_next_disk(qd_idx, disks);
1534
1535	pr_debug("compute_parity, stripe %llu, method %d\n",
1536		(unsigned long long)sh->sector, method);
1537
1538	switch(method) {
1539	case READ_MODIFY_WRITE:
1540		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */
1541	case RECONSTRUCT_WRITE:
1542		for (i= disks; i-- ;)
1543			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1544				chosen = sh->dev[i].towrite;
1545				sh->dev[i].towrite = NULL;
1546
1547				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1548					wake_up(&conf->wait_for_overlap);
1549
1550				BUG_ON(sh->dev[i].written);
1551				sh->dev[i].written = chosen;
1552			}
1553		break;
1554	case CHECK_PARITY:
1555		BUG();		/* Not implemented yet */
1556	}
1557
1558	for (i = disks; i--;)
1559		if (sh->dev[i].written) {
1560			sector_t sector = sh->dev[i].sector;
1561			struct bio *wbi = sh->dev[i].written;
1562			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1563				copy_data(1, wbi, sh->dev[i].page, sector);
1564				wbi = r5_next_bio(wbi, sector);
1565			}
1566
1567			set_bit(R5_LOCKED, &sh->dev[i].flags);
1568			set_bit(R5_UPTODATE, &sh->dev[i].flags);
1569		}
1570
1571//	switch(method) {
1572//	case RECONSTRUCT_WRITE:
1573//	case CHECK_PARITY:
1574//	case UPDATE_PARITY:
1575		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1576		/* FIX: Is this ordering of drives even remotely optimal? */
1577		count = 0;
1578		i = d0_idx;
1579		do {
1580			ptrs[count++] = page_address(sh->dev[i].page);
1581			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1582				printk("block %d/%d not uptodate on parity calc\n", i,count);
1583			i = raid6_next_disk(i, disks);
1584		} while ( i != d0_idx );
1585//		break;
1586//	}
1587
1588	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1589
1590	switch(method) {
1591	case RECONSTRUCT_WRITE:
1592		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1593		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1594		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
1595		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
1596		break;
1597	case UPDATE_PARITY:
1598		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1599		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1600		break;
1601	}
1602}
1603
1604
1605/* Compute one missing block */
1606static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1607{
1608	int i, count, disks = sh->disks;
1609	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1610	int pd_idx = sh->pd_idx;
1611	int qd_idx = raid6_next_disk(pd_idx, disks);
1612
1613	pr_debug("compute_block_1, stripe %llu, idx %d\n",
1614		(unsigned long long)sh->sector, dd_idx);
1615
1616	if ( dd_idx == qd_idx ) {
1617		/* We're actually computing the Q drive */
1618		compute_parity6(sh, UPDATE_PARITY);
1619	} else {
1620		dest = page_address(sh->dev[dd_idx].page);
1621		if (!nozero) memset(dest, 0, STRIPE_SIZE);
1622		count = 0;
1623		for (i = disks ; i--; ) {
1624			if (i == dd_idx || i == qd_idx)
1625				continue;
1626			p = page_address(sh->dev[i].page);
1627			if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1628				ptr[count++] = p;
1629			else
1630				printk("compute_block() %d, stripe %llu, %d"
1631				       " not present\n", dd_idx,
1632				       (unsigned long long)sh->sector, i);
1633
1634			check_xor();
1635		}
1636		if (count)
1637			xor_blocks(count, STRIPE_SIZE, dest, ptr);
1638		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1639		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1640	}
1641}
1642
1643/* Compute two missing blocks */
1644static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1645{
1646	int i, count, disks = sh->disks;
1647	int pd_idx = sh->pd_idx;
1648	int qd_idx = raid6_next_disk(pd_idx, disks);
1649	int d0_idx = raid6_next_disk(qd_idx, disks);
1650	int faila, failb;
1651
1652	/* faila and failb are disk numbers relative to d0_idx */
1653	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
1654	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1655	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1656
1657	BUG_ON(faila == failb);
1658	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1659
1660	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1661	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1662
1663	if ( failb == disks-1 ) {
1664		/* Q disk is one of the missing disks */
1665		if ( faila == disks-2 ) {
1666			/* Missing P+Q, just recompute */
1667			compute_parity6(sh, UPDATE_PARITY);
1668			return;
1669		} else {
1670			/* We're missing D+Q; recompute D from P */
1671			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1672			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1673			return;
1674		}
1675	}
1676
1677	/* We're missing D+P or D+D; build pointer table */
1678	{
1679		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
1680		void *ptrs[disks];
1681
1682		count = 0;
1683		i = d0_idx;
1684		do {
1685			ptrs[count++] = page_address(sh->dev[i].page);
1686			i = raid6_next_disk(i, disks);
1687			if (i != dd_idx1 && i != dd_idx2 &&
1688			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1689				printk("compute_2 with missing block %d/%d\n", count, i);
1690		} while ( i != d0_idx );
1691
1692		if ( failb == disks-2 ) {
1693			/* We're missing D+P. */
1694			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1695		} else {
1696			/* We're missing D+D. */
1697			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1698		}
1699
1700		/* Both the above update both missing blocks */
1701		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1702		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1703	}
1704}
1705
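/*
 * Schedule the chain of stripe operations needed to commit queued
 * writes: postxor (plus biodrain unless expanding) for a
 * reconstruct-write (rcw), or prexor+biodrain+postxor for a
 * read-modify-write.  Locks the data blocks involved plus the parity
 * block and returns how many blocks were locked.
 */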
1706static int
1707handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1708{
1709	int i, pd_idx = sh->pd_idx, disks = sh->disks;
1710	int locked = 0;
1711
1712	if (rcw) {
1713		/* if we are not expanding this is a proper write request, and
1714		 * there will be bios with new data to be drained into the
1715		 * stripe cache
1716		 */
1717		if (!expand) {
1718			set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1719			sh->ops.count++;
1720		}
1721
1722		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1723		sh->ops.count++;
1724
1725		for (i = disks; i--; ) {
1726			struct r5dev *dev = &sh->dev[i];
1727
1728			if (dev->towrite) {
1729				set_bit(R5_LOCKED, &dev->flags);
1730				if (!expand)
1731					clear_bit(R5_UPTODATE, &dev->flags);
1732				locked++;
1733			}
1734		}
1735		if (locked + 1 == disks)
1736			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1737				atomic_inc(&sh->raid_conf->pending_full_writes);
1738	} else {
1739		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1740			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1741
1742		set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1743		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1744		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1745
1746		sh->ops.count += 3;
1747
1748		for (i = disks; i--; ) {
1749			struct r5dev *dev = &sh->dev[i];
1750			if (i == pd_idx)
1751				continue;
1752
1753			/* For a read-modify-write there may be blocks that are
1754			 * locked for reading while others are ready to be
1755			 * written so we distinguish these blocks by the
1756			 * R5_Wantprexor bit
1757			 */
1758			if (dev->towrite &&
1759			    (test_bit(R5_UPTODATE, &dev->flags) ||
1760			    test_bit(R5_Wantcompute, &dev->flags))) {
1761				set_bit(R5_Wantprexor, &dev->flags);
1762				set_bit(R5_LOCKED, &dev->flags);
1763				clear_bit(R5_UPTODATE, &dev->flags);
1764				locked++;
1765			}
1766		}
1767	}
1768
1769	/* keep the parity disk locked while asynchronous operations
1770	 * are in flight
1771	 */
1772	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1773	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1774	locked++;
1775
1776	pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
1777		__func__, (unsigned long long)sh->sector,
1778		locked, sh->ops.pending);
1779
1780	return locked;
1781}
1782
1783/*
1784 * Each stripe/dev can have one or more bion attached.
1785 * toread/towrite point to the first in a chain.
1786 * The bi_next chain must be in order.
1787 */
1788static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
1789{
1790	struct bio **bip;
1791	raid5_conf_t *conf = sh->raid_conf;
1792	int firstwrite=0;
1793
1794	pr_debug("adding bh b#%llu to stripe s#%llu\n",
1795		(unsigned long long)bi->bi_sector,
1796		(unsigned long long)sh->sector);
1797
1798
1799	spin_lock(&sh->lock);
1800	spin_lock_irq(&conf->device_lock);
1801	if (forwrite) {
1802		bip = &sh->dev[dd_idx].towrite;
1803		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
1804			firstwrite = 1;
1805	} else
1806		bip = &sh->dev[dd_idx].toread;
1807	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
1808		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
1809			goto overlap;
1810		bip = & (*bip)->bi_next;
1811	}
1812	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
1813		goto overlap;
1814
1815	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
1816	if (*bip)
1817		bi->bi_next = *bip;
1818	*bip = bi;
1819	bi->bi_phys_segments ++;
1820	spin_unlock_irq(&conf->device_lock);
1821	spin_unlock(&sh->lock);
1822
1823	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
1824		(unsigned long long)bi->bi_sector,
1825		(unsigned long long)sh->sector, dd_idx);
1826
1827	if (conf->mddev->bitmap && firstwrite) {
1828		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1829				  STRIPE_SECTORS, 0);
1830		sh->bm_seq = conf->seq_flush+1;
1831		set_bit(STRIPE_BIT_DELAY, &sh->state);
1832	}
1833
1834	if (forwrite) {
1835		/* check if page is covered */
1836		sector_t sector = sh->dev[dd_idx].sector;
1837		for (bi=sh->dev[dd_idx].towrite;
1838		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
1839			     bi && bi->bi_sector <= sector;
1840		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
1841			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
1842				sector = bi->bi_sector + (bi->bi_size>>9);
1843		}
1844		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
1845			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
1846	}
1847	return 1;
1848
1849 overlap:
1850	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1851	spin_unlock_irq(&conf->device_lock);
1852	spin_unlock(&sh->lock);
1853	return 0;
1854}
1855
1856static void end_reshape(raid5_conf_t *conf);
1857
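/* page_is_zero - cheap test for an all-zero STRIPE_SIZE page.
 * The first 32-bit word is compared with 0, and the overlapping
 * memcmp(a, a + 4, STRIPE_SIZE - 4) then only succeeds if every byte
 * equals the byte four positions before it.  Conceptually:
 *
 *	if (*(u32 *)a != 0)
 *		return 0;
 *	for (i = 4; i < STRIPE_SIZE; i++)
 *		if (a[i] != a[i - 4])
 *			return 0;
 *	return 1;
 *
 * so no second buffer of zeroes is needed.
 */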
1858static int page_is_zero(struct page *p)
1859{
1860	char *a = page_address(p);
1861	return ((*(u32*)a) == 0 &&
1862		memcmp(a, a+4, STRIPE_SIZE-4)==0);
1863}
1864
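/* stripe_to_pdidx - recompute the parity-disk index for the stripe at
 * sector 'stripe' on an array of 'disks' devices, by mapping the stripe
 * back to a logical sector and feeding that through
 * raid5_compute_sector().  Used below when a stripe switches to the
 * post-expansion geometry.
 */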
1865static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1866{
1867	int sectors_per_chunk = conf->chunk_size >> 9;
1868	int pd_idx, dd_idx;
1869	int chunk_offset = sector_div(stripe, sectors_per_chunk);
1870
1871	raid5_compute_sector(stripe * (disks - conf->max_degraded)
1872			     * sectors_per_chunk + chunk_offset,
1873			     disks, disks - conf->max_degraded,
1874			     &dd_idx, &pd_idx, conf);
1875	return pd_idx;
1876}
1877
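/* handle_requests_to_failed_array - too many devices have failed for
 * this stripe to be recoverable, so fail every bio still attached to
 * it: pending writes, blocks already 'written', and any reads that can
 * no longer be satisfied are returned through *return_bi with
 * BIO_UPTODATE cleared.
 */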
1878static void
1879handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1880				struct stripe_head_state *s, int disks,
1881				struct bio **return_bi)
1882{
1883	int i;
1884	for (i = disks; i--; ) {
1885		struct bio *bi;
1886		int bitmap_end = 0;
1887
1888		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1889			mdk_rdev_t *rdev;
1890			rcu_read_lock();
1891			rdev = rcu_dereference(conf->disks[i].rdev);
1892			if (rdev && test_bit(In_sync, &rdev->flags))
1893				/* multiple read failures in one stripe */
1894				md_error(conf->mddev, rdev);
1895			rcu_read_unlock();
1896		}
1897		spin_lock_irq(&conf->device_lock);
1898		/* fail all writes first */
1899		bi = sh->dev[i].towrite;
1900		sh->dev[i].towrite = NULL;
1901		if (bi) {
1902			s->to_write--;
1903			bitmap_end = 1;
1904		}
1905
1906		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1907			wake_up(&conf->wait_for_overlap);
1908
1909		while (bi && bi->bi_sector <
1910			sh->dev[i].sector + STRIPE_SECTORS) {
1911			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1912			clear_bit(BIO_UPTODATE, &bi->bi_flags);
1913			if (--bi->bi_phys_segments == 0) {
1914				md_write_end(conf->mddev);
1915				bi->bi_next = *return_bi;
1916				*return_bi = bi;
1917			}
1918			bi = nextbi;
1919		}
1920		/* and fail all 'written' */
1921		bi = sh->dev[i].written;
1922		sh->dev[i].written = NULL;
1923		if (bi) bitmap_end = 1;
1924		while (bi && bi->bi_sector <
1925		       sh->dev[i].sector + STRIPE_SECTORS) {
1926			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1927			clear_bit(BIO_UPTODATE, &bi->bi_flags);
1928			if (--bi->bi_phys_segments == 0) {
1929				md_write_end(conf->mddev);
1930				bi->bi_next = *return_bi;
1931				*return_bi = bi;
1932			}
1933			bi = bi2;
1934		}
1935
1936		/* fail any reads if this device is non-operational and
1937		 * the data has not reached the cache yet.
1938		 */
1939		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
1940		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1941		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
1942			bi = sh->dev[i].toread;
1943			sh->dev[i].toread = NULL;
1944			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1945				wake_up(&conf->wait_for_overlap);
1946			if (bi) s->to_read--;
1947			while (bi && bi->bi_sector <
1948			       sh->dev[i].sector + STRIPE_SECTORS) {
1949				struct bio *nextbi =
1950					r5_next_bio(bi, sh->dev[i].sector);
1951				clear_bit(BIO_UPTODATE, &bi->bi_flags);
1952				if (--bi->bi_phys_segments == 0) {
1953					bi->bi_next = *return_bi;
1954					*return_bi = bi;
1955				}
1956				bi = nextbi;
1957			}
1958		}
1959		spin_unlock_irq(&conf->device_lock);
1960		if (bitmap_end)
1961			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1962					STRIPE_SECTORS, 0, 0);
1963	}
1964
1965	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1966		if (atomic_dec_and_test(&conf->pending_full_writes))
1967			md_wakeup_thread(conf->mddev->thread);
1968}
1969
1970/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
1971 * to process
1972 */
1973static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1974			struct stripe_head_state *s, int disk_idx, int disks)
1975{
1976	struct r5dev *dev = &sh->dev[disk_idx];
1977	struct r5dev *failed_dev = &sh->dev[s->failed_num];
1978
1979	/* don't schedule compute operations or reads on the parity block while
1980	 * a check is in flight
1981	 */
1982	if ((disk_idx == sh->pd_idx) &&
1983	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1984		return ~0;
1985
1986	/* is the data in this block needed, and can we get it? */
1987	if (!test_bit(R5_LOCKED, &dev->flags) &&
1988	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
1989	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1990	     s->syncing || s->expanding || (s->failed &&
1991	     (failed_dev->toread || (failed_dev->towrite &&
1992	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
1993	     ))))) {
1994		/* 1/ We would like to get this block, possibly by computing it,
1995		 * but we might not be able to.
1996		 *
1997		 * 2/ Since parity check operations potentially make the parity
1998		 * block !uptodate it will need to be refreshed before any
1999		 * compute operations on data disks are scheduled.
2000		 *
2001		 * 3/ We hold off parity block re-reads until check operations
2002		 * have quiesced.
2003		 */
2004		if ((s->uptodate == disks - 1) &&
2005		    (s->failed && disk_idx == s->failed_num) &&
2006		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2007			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2008			set_bit(R5_Wantcompute, &dev->flags);
2009			sh->ops.target = disk_idx;
2010			s->req_compute = 1;
2011			sh->ops.count++;
2012			/* Careful: from this point on 'uptodate' is in the eye
2013			 * of raid5_run_ops which services 'compute' operations
2014			 * before writes. R5_Wantcompute flags a block that will
2015			 * be R5_UPTODATE by the time it is needed for a
2016			 * subsequent operation.
2017			 */
2018			s->uptodate++;
2019			return 0; /* uptodate + compute == disks */
2020		} else if ((s->uptodate < disks - 1) &&
2021			test_bit(R5_Insync, &dev->flags)) {
2022			/* Note: we hold off compute operations while checks are
2023			 * in flight, but we still prefer 'compute' over 'read'
2024			 * hence we only read if (uptodate < disks-1)
2025			 */
2026			set_bit(R5_LOCKED, &dev->flags);
2027			set_bit(R5_Wantread, &dev->flags);
2028			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2029				sh->ops.count++;
2030			s->locked++;
2031			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2032				s->syncing);
2033		}
2034	}
2035
2036	return ~0;
2037}
2038
2039static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2040			struct stripe_head_state *s, int disks)
2041{
2042	int i;
2043
2044	/* Clear completed compute operations.  Parity recovery
2045	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2046	 * later on in this routine
2047	 */
2048	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2049		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2050		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2051		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2052		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2053	}
2054
2055	/* look for blocks to read/compute, skip this if a compute
2056	 * is already in flight, or if the stripe contents are in the
2057	 * midst of changing due to a write
2058	 */
2059	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2060		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
2061		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2062		for (i = disks; i--; )
2063			if (__handle_issuing_new_read_requests5(
2064				sh, s, i, disks) == 0)
2065				break;
2066	}
2067	set_bit(STRIPE_HANDLE, &sh->state);
2068}
2069
2070static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2071			struct stripe_head_state *s, struct r6_state *r6s,
2072			int disks)
2073{
2074	int i;
2075	for (i = disks; i--; ) {
2076		struct r5dev *dev = &sh->dev[i];
2077		if (!test_bit(R5_LOCKED, &dev->flags) &&
2078		    !test_bit(R5_UPTODATE, &dev->flags) &&
2079		    (dev->toread || (dev->towrite &&
2080		     !test_bit(R5_OVERWRITE, &dev->flags)) ||
2081		     s->syncing || s->expanding ||
2082		     (s->failed >= 1 &&
2083		      (sh->dev[r6s->failed_num[0]].toread ||
2084		       s->to_write)) ||
2085		     (s->failed >= 2 &&
2086		      (sh->dev[r6s->failed_num[1]].toread ||
2087		       s->to_write)))) {
2088			/* we would like to get this block, possibly
2089			 * by computing it, but we might not be able to
2090			 */
2091			if ((s->uptodate == disks - 1) &&
2092			    (s->failed && (i == r6s->failed_num[0] ||
2093					   i == r6s->failed_num[1]))) {
2094				pr_debug("Computing stripe %llu block %d\n",
2095				       (unsigned long long)sh->sector, i);
2096				compute_block_1(sh, i, 0);
2097				s->uptodate++;
2098			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2099				/* Computing 2-failure is *very* expensive; only
2100				 * do it if failed >= 2
2101				 */
2102				int other;
2103				for (other = disks; other--; ) {
2104					if (other == i)
2105						continue;
2106					if (!test_bit(R5_UPTODATE,
2107					      &sh->dev[other].flags))
2108						break;
2109				}
2110				BUG_ON(other < 0);
2111				pr_debug("Computing stripe %llu blocks %d,%d\n",
2112				       (unsigned long long)sh->sector,
2113				       i, other);
2114				compute_block_2(sh, i, other);
2115				s->uptodate += 2;
2116			} else if (test_bit(R5_Insync, &dev->flags)) {
2117				set_bit(R5_LOCKED, &dev->flags);
2118				set_bit(R5_Wantread, &dev->flags);
2119				s->locked++;
2120				pr_debug("Reading block %d (sync=%d)\n",
2121					i, s->syncing);
2122			}
2123		}
2124	}
2125	set_bit(STRIPE_HANDLE, &sh->state);
2126}
2127
2128
2129/* handle_completed_write_requests
2130 * any written block on an uptodate or failed drive can be returned.
2131 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2132 * never LOCKED, so we don't need to test 'failed' directly.
2133 */
2134static void handle_completed_write_requests(raid5_conf_t *conf,
2135	struct stripe_head *sh, int disks, struct bio **return_bi)
2136{
2137	int i;
2138	struct r5dev *dev;
2139
2140	for (i = disks; i--; )
2141		if (sh->dev[i].written) {
2142			dev = &sh->dev[i];
2143			if (!test_bit(R5_LOCKED, &dev->flags) &&
2144				test_bit(R5_UPTODATE, &dev->flags)) {
2145				/* We can return any write requests */
2146				struct bio *wbi, *wbi2;
2147				int bitmap_end = 0;
2148				pr_debug("Return write for disc %d\n", i);
2149				spin_lock_irq(&conf->device_lock);
2150				wbi = dev->written;
2151				dev->written = NULL;
2152				while (wbi && wbi->bi_sector <
2153					dev->sector + STRIPE_SECTORS) {
2154					wbi2 = r5_next_bio(wbi, dev->sector);
2155					if (--wbi->bi_phys_segments == 0) {
2156						md_write_end(conf->mddev);
2157						wbi->bi_next = *return_bi;
2158						*return_bi = wbi;
2159					}
2160					wbi = wbi2;
2161				}
2162				if (dev->towrite == NULL)
2163					bitmap_end = 1;
2164				spin_unlock_irq(&conf->device_lock);
2165				if (bitmap_end)
2166					bitmap_endwrite(conf->mddev->bitmap,
2167							sh->sector,
2168							STRIPE_SECTORS,
2169					 !test_bit(STRIPE_DEGRADED, &sh->state),
2170							0);
2171			}
2172		}
2173
2174	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2175		if (atomic_dec_and_test(&conf->pending_full_writes))
2176			md_wakeup_thread(conf->mddev->thread);
2177}
2178
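/* handle_issuing_new_write_requests5 - choose between read-modify-write
 * (rmw: pre-read the old data and parity, xor the old data out and the
 * new data in) and reconstruct-write (rcw: pre-read every block that is
 * not being overwritten and recompute parity from scratch), then issue
 * whichever pre-reads are still missing.
 *
 * A rough, illustrative example: on a 5-drive array with one block
 * being rewritten, rmw needs 2 pre-reads (old data + parity) while rcw
 * needs 3 (the other data blocks), so rmw is chosen; a full-stripe
 * write leaves rcw == 0 and needs no pre-reads at all.  A block that
 * cannot be read is penalised by 2*disks so the other strategy wins.
 */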
2179static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2180		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
2181{
2182	int rmw = 0, rcw = 0, i;
2183	for (i = disks; i--; ) {
2184		/* would I have to read this buffer for read_modify_write */
2185		struct r5dev *dev = &sh->dev[i];
2186		if ((dev->towrite || i == sh->pd_idx) &&
2187		    !test_bit(R5_LOCKED, &dev->flags) &&
2188		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2189		      test_bit(R5_Wantcompute, &dev->flags))) {
2190			if (test_bit(R5_Insync, &dev->flags))
2191				rmw++;
2192			else
2193				rmw += 2*disks;  /* cannot read it */
2194		}
2195		/* Would I have to read this buffer for reconstruct_write */
2196		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2197		    !test_bit(R5_LOCKED, &dev->flags) &&
2198		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2199		    test_bit(R5_Wantcompute, &dev->flags))) {
2200			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2201			else
2202				rcw += 2*disks;
2203		}
2204	}
2205	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2206		(unsigned long long)sh->sector, rmw, rcw);
2207	set_bit(STRIPE_HANDLE, &sh->state);
2208	if (rmw < rcw && rmw > 0)
2209		/* prefer read-modify-write, but need to get some data */
2210		for (i = disks; i--; ) {
2211			struct r5dev *dev = &sh->dev[i];
2212			if ((dev->towrite || i == sh->pd_idx) &&
2213			    !test_bit(R5_LOCKED, &dev->flags) &&
2214			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2215			    test_bit(R5_Wantcompute, &dev->flags)) &&
2216			    test_bit(R5_Insync, &dev->flags)) {
2217				if (
2218				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2219					pr_debug("Read_old block "
2220						"%d for r-m-w\n", i);
2221					set_bit(R5_LOCKED, &dev->flags);
2222					set_bit(R5_Wantread, &dev->flags);
2223					if (!test_and_set_bit(
2224						STRIPE_OP_IO, &sh->ops.pending))
2225						sh->ops.count++;
2226					s->locked++;
2227				} else {
2228					set_bit(STRIPE_DELAYED, &sh->state);
2229					set_bit(STRIPE_HANDLE, &sh->state);
2230				}
2231			}
2232		}
2233	if (rcw <= rmw && rcw > 0)
2234		/* want reconstruct write, but need to get some data */
2235		for (i = disks; i--; ) {
2236			struct r5dev *dev = &sh->dev[i];
2237			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2238			    i != sh->pd_idx &&
2239			    !test_bit(R5_LOCKED, &dev->flags) &&
2240			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2241			    test_bit(R5_Wantcompute, &dev->flags)) &&
2242			    test_bit(R5_Insync, &dev->flags)) {
2243				if (
2244				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2245					pr_debug("Read_old block "
2246						"%d for Reconstruct\n", i);
2247					set_bit(R5_LOCKED, &dev->flags);
2248					set_bit(R5_Wantread, &dev->flags);
2249					if (!test_and_set_bit(
2250						STRIPE_OP_IO, &sh->ops.pending))
2251						sh->ops.count++;
2252					s->locked++;
2253				} else {
2254					set_bit(STRIPE_DELAYED, &sh->state);
2255					set_bit(STRIPE_HANDLE, &sh->state);
2256				}
2257			}
2258		}
2259	/* now if nothing is locked, and if we have enough data,
2260	 * we can start a write request
2261	 */
2262	/* since handle_stripe can be called at any time we need to handle the
2263	 * case where a compute block operation has been submitted and then a
2264	 * subsequent call wants to start a write request.  raid5_run_ops only
2265	 * handles the case where compute block and postxor are requested
2266	 * simultaneously.  If this is not the case then new writes need to be
2267	 * held off until the compute completes.
2268	 */
2269	if ((s->req_compute ||
2270	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
2271		(s->locked == 0 && (rcw == 0 || rmw == 0) &&
2272		!test_bit(STRIPE_BIT_DELAY, &sh->state)))
2273		s->locked += handle_write_operations5(sh, rcw == 0, 0);
2274}
2275
2276static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2277		struct stripe_head *sh,	struct stripe_head_state *s,
2278		struct r6_state *r6s, int disks)
2279{
2280	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2281	int qd_idx = r6s->qd_idx;
2282	for (i = disks; i--; ) {
2283		struct r5dev *dev = &sh->dev[i];
2284		/* Would I have to read this buffer for reconstruct_write */
2285		if (!test_bit(R5_OVERWRITE, &dev->flags)
2286		    && i != pd_idx && i != qd_idx
2287		    && (!test_bit(R5_LOCKED, &dev->flags)
2288			    ) &&
2289		    !test_bit(R5_UPTODATE, &dev->flags)) {
2290			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2291			else {
2292				pr_debug("raid6: must_compute: "
2293					"disk %d flags=%#lx\n", i, dev->flags);
2294				must_compute++;
2295			}
2296		}
2297	}
2298	pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2299	       (unsigned long long)sh->sector, rcw, must_compute);
2300	set_bit(STRIPE_HANDLE, &sh->state);
2301
2302	if (rcw > 0)
2303		/* want reconstruct write, but need to get some data */
2304		for (i = disks; i--; ) {
2305			struct r5dev *dev = &sh->dev[i];
2306			if (!test_bit(R5_OVERWRITE, &dev->flags)
2307			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2308			    && !test_bit(R5_LOCKED, &dev->flags) &&
2309			    !test_bit(R5_UPTODATE, &dev->flags) &&
2310			    test_bit(R5_Insync, &dev->flags)) {
2311				if (
2312				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2313					pr_debug("Read_old stripe %llu "
2314						"block %d for Reconstruct\n",
2315					     (unsigned long long)sh->sector, i);
2316					set_bit(R5_LOCKED, &dev->flags);
2317					set_bit(R5_Wantread, &dev->flags);
2318					s->locked++;
2319				} else {
2320					pr_debug("Request delayed stripe %llu "
2321						"block %d for Reconstruct\n",
2322					     (unsigned long long)sh->sector, i);
2323					set_bit(STRIPE_DELAYED, &sh->state);
2324					set_bit(STRIPE_HANDLE, &sh->state);
2325				}
2326			}
2327		}
2328	/* now if nothing is locked, and if we have enough data, we can start a
2329	 * write request
2330	 */
2331	if (s->locked == 0 && rcw == 0 &&
2332	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2333		if (must_compute > 0) {
2334			/* We have failed blocks and need to compute them */
2335			switch (s->failed) {
2336			case 0:
2337				BUG();
2338			case 1:
2339				compute_block_1(sh, r6s->failed_num[0], 0);
2340				break;
2341			case 2:
2342				compute_block_2(sh, r6s->failed_num[0],
2343						r6s->failed_num[1]);
2344				break;
2345			default: /* This request should have been failed? */
2346				BUG();
2347			}
2348		}
2349
2350		pr_debug("Computing parity for stripe %llu\n",
2351			(unsigned long long)sh->sector);
2352		compute_parity6(sh, RECONSTRUCT_WRITE);
2353		/* now every locked buffer is ready to be written */
2354		for (i = disks; i--; )
2355			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2356				pr_debug("Writing stripe %llu block %d\n",
2357				       (unsigned long long)sh->sector, i);
2358				s->locked++;
2359				set_bit(R5_Wantwrite, &sh->dev[i].flags);
2360			}
2361		if (s->locked == disks)
2362			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2363				atomic_inc(&conf->pending_full_writes);
2364		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2365		set_bit(STRIPE_INSYNC, &sh->state);
2366
2367		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2368			atomic_dec(&conf->preread_active_stripes);
2369			if (atomic_read(&conf->preread_active_stripes) <
2370			    IO_THRESHOLD)
2371				md_wakeup_thread(conf->mddev->thread);
2372		}
2373	}
2374}
2375
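/* handle_parity_checks5 - drive the raid5 check/repair state machine:
 * complete a finished zero-sum check, start a new check once the stripe
 * has no failures, is not already in sync and no repair is in flight,
 * turn a detected mismatch into a parity recompute (unless
 * MD_RECOVERY_CHECK asks us not to repair), and finally write the
 * repaired block back out.
 */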
2376static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2377				struct stripe_head_state *s, int disks)
2378{
2379	int canceled_check = 0;
2380
2381	set_bit(STRIPE_HANDLE, &sh->state);
2382
2383	/* complete a check operation */
2384	if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
2385		clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
2386		clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2387		if (s->failed == 0) {
2388			if (sh->ops.zero_sum_result == 0)
2389				/* parity is correct (on disc,
2390				 * not in buffer any more)
2391				 */
2392				set_bit(STRIPE_INSYNC, &sh->state);
2393			else {
2394				conf->mddev->resync_mismatches +=
2395					STRIPE_SECTORS;
2396				if (test_bit(
2397				     MD_RECOVERY_CHECK, &conf->mddev->recovery))
2398					/* don't try to repair!! */
2399					set_bit(STRIPE_INSYNC, &sh->state);
2400				else {
2401					set_bit(STRIPE_OP_COMPUTE_BLK,
2402						&sh->ops.pending);
2403					set_bit(STRIPE_OP_MOD_REPAIR_PD,
2404						&sh->ops.pending);
2405					set_bit(R5_Wantcompute,
2406						&sh->dev[sh->pd_idx].flags);
2407					sh->ops.target = sh->pd_idx;
2408					sh->ops.count++;
2409					s->uptodate++;
2410				}
2411			}
2412		} else
2413			canceled_check = 1; /* STRIPE_INSYNC is not set */
2414	}
2415
2416	/* start a new check operation if there are no failures, the stripe is
2417	 * not insync, and a repair is not in flight
2418	 */
2419	if (s->failed == 0 &&
2420	    !test_bit(STRIPE_INSYNC, &sh->state) &&
2421	    !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2422		if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2423			BUG_ON(s->uptodate != disks);
2424			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2425			sh->ops.count++;
2426			s->uptodate--;
2427		}
2428	}
2429
2430	/* check if we can clear a parity disk reconstruct */
2431	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2432	    test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2433
2434		clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2435		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2436		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2437		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2438	}
2439
2440
2441	/* Wait for check parity and compute block operations to complete
2442	 * before write-back.  If a failure occurred while the check operation
2443	 * was in flight we need to cycle this stripe through handle_stripe
2444	 * since the parity block may not be uptodate
2445	 */
2446	if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2447	    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2448	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2449		struct r5dev *dev;
2450		/* either failed parity check, or recovery is happening */
2451		if (s->failed == 0)
2452			s->failed_num = sh->pd_idx;
2453		dev = &sh->dev[s->failed_num];
2454		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2455		BUG_ON(s->uptodate != disks);
2456
2457		set_bit(R5_LOCKED, &dev->flags);
2458		set_bit(R5_Wantwrite, &dev->flags);
2459		if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2460			sh->ops.count++;
2461
2462		clear_bit(STRIPE_DEGRADED, &sh->state);
2463		s->locked++;
2464		set_bit(STRIPE_INSYNC, &sh->state);
2465	}
2466}
2467
2468
2469static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2470				struct stripe_head_state *s,
2471				struct r6_state *r6s, struct page *tmp_page,
2472				int disks)
2473{
2474	int update_p = 0, update_q = 0;
2475	struct r5dev *dev;
2476	int pd_idx = sh->pd_idx;
2477	int qd_idx = r6s->qd_idx;
2478
2479	set_bit(STRIPE_HANDLE, &sh->state);
2480
2481	BUG_ON(s->failed > 2);
2482	BUG_ON(s->uptodate < disks);
2483	/* Want to check and possibly repair P and Q.
2484	 * However there could be one 'failed' device, in which
2485	 * case we can only check one of them, possibly using the
2486	 * other to generate missing data
2487	 */
2488
2489	/* If !tmp_page, we cannot do the calculations,
2490	 * but as we have set STRIPE_HANDLE, we will soon be called
2491	 * by stripe_handle with a tmp_page - just wait until then.
2492	 */
2493	if (tmp_page) {
2494		if (s->failed == r6s->q_failed) {
2495			/* The only possible failed device holds 'Q', so it
2496			 * makes sense to check P (If anything else were failed,
2497			 * we would have used P to recreate it).
2498			 */
2499			compute_block_1(sh, pd_idx, 1);
2500			if (!page_is_zero(sh->dev[pd_idx].page)) {
2501				compute_block_1(sh, pd_idx, 0);
2502				update_p = 1;
2503			}
2504		}
2505		if (!r6s->q_failed && s->failed < 2) {
2506			/* q is not failed, and we didn't use it to generate
2507			 * anything, so it makes sense to check it
2508			 */
2509			memcpy(page_address(tmp_page),
2510			       page_address(sh->dev[qd_idx].page),
2511			       STRIPE_SIZE);
2512			compute_parity6(sh, UPDATE_PARITY);
2513			if (memcmp(page_address(tmp_page),
2514				   page_address(sh->dev[qd_idx].page),
2515				   STRIPE_SIZE) != 0) {
2516				clear_bit(STRIPE_INSYNC, &sh->state);
2517				update_q = 1;
2518			}
2519		}
2520		if (update_p || update_q) {
2521			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2522			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2523				/* don't try to repair!! */
2524				update_p = update_q = 0;
2525		}
2526
2527		/* now write out any block on a failed drive,
2528		 * or P or Q if they need it
2529		 */
2530
2531		if (s->failed == 2) {
2532			dev = &sh->dev[r6s->failed_num[1]];
2533			s->locked++;
2534			set_bit(R5_LOCKED, &dev->flags);
2535			set_bit(R5_Wantwrite, &dev->flags);
2536		}
2537		if (s->failed >= 1) {
2538			dev = &sh->dev[r6s->failed_num[0]];
2539			s->locked++;
2540			set_bit(R5_LOCKED, &dev->flags);
2541			set_bit(R5_Wantwrite, &dev->flags);
2542		}
2543
2544		if (update_p) {
2545			dev = &sh->dev[pd_idx];
2546			s->locked++;
2547			set_bit(R5_LOCKED, &dev->flags);
2548			set_bit(R5_Wantwrite, &dev->flags);
2549		}
2550		if (update_q) {
2551			dev = &sh->dev[qd_idx];
2552			s->locked++;
2553			set_bit(R5_LOCKED, &dev->flags);
2554			set_bit(R5_Wantwrite, &dev->flags);
2555		}
2556		clear_bit(STRIPE_DEGRADED, &sh->state);
2557
2558		set_bit(STRIPE_INSYNC, &sh->state);
2559	}
2560}
2561
2562static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2563				struct r6_state *r6s)
2564{
2565	int i;
2566
2567	/* We have read all the blocks in this stripe and now we need to
2568	 * copy some of them into a target stripe for expand.
2569	 */
2570	struct dma_async_tx_descriptor *tx = NULL;
2571	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2572	for (i = 0; i < sh->disks; i++)
2573		if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
2574			int dd_idx, pd_idx, j;
2575			struct stripe_head *sh2;
2576
2577			sector_t bn = compute_blocknr(sh, i);
2578			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
2579						conf->raid_disks -
2580						conf->max_degraded, &dd_idx,
2581						&pd_idx, conf);
2582			sh2 = get_active_stripe(conf, s, conf->raid_disks,
2583						pd_idx, 1);
2584			if (sh2 == NULL)
2585				/* so far only the early blocks of this stripe
2586				 * have been requested.  When later blocks
2587				 * get requested, we will try again
2588				 */
2589				continue;
2590			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2591			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2592				/* must have already done this block */
2593				release_stripe(sh2);
2594				continue;
2595			}
2596
2597			/* place all the copies on one channel */
2598			tx = async_memcpy(sh2->dev[dd_idx].page,
2599				sh->dev[i].page, 0, 0, STRIPE_SIZE,
2600				ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2601
2602			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2603			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2604			for (j = 0; j < conf->raid_disks; j++)
2605				if (j != sh2->pd_idx &&
2606				    (!r6s || j != raid6_next_disk(sh2->pd_idx,
2607								 sh2->disks)) &&
2608				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
2609					break;
2610			if (j == conf->raid_disks) {
2611				set_bit(STRIPE_EXPAND_READY, &sh2->state);
2612				set_bit(STRIPE_HANDLE, &sh2->state);
2613			}
2614			release_stripe(sh2);
2615
2616		}
2617	/* done submitting copies, wait for them to complete */
2618	if (tx) {
2619		async_tx_ack(tx);
2620		dma_wait_for_async_tx(tx);
2621	}
2622}
2623
2624
2625/*
2626 * handle_stripe - do things to a stripe.
2627 *
2628 * We lock the stripe and then examine the state of various bits
2629 * to see what needs to be done.
2630 * Possible results:
2631 *    return some read request which now have data
2632 *    return some write requests which are safely on disc
2633 *    schedule a read on some buffers
2634 *    schedule a write of some buffers
2635 *    return confirmation of parity correctness
2636 *
2637 * bios are taken off the toread/towrite lists, and the buffers being
2638 * acted on get R5_LOCKED set before the stripe lock is released.
2639 *
2640 */
2641
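/* Roughly, the sh->ops bookkeeping used throughout handle_stripe5()
 * works like this: an operation is requested by setting its bit in
 * ops.pending, get_stripe_work() moves it to ops.ack when the work is
 * handed to raid5_run_ops(), the async completion path marks it in
 * ops.complete, and handle_stripe5() clears all three bits once it has
 * acted on the result.
 */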
2642static void handle_stripe5(struct stripe_head *sh)
2643{
2644	raid5_conf_t *conf = sh->raid_conf;
2645	int disks = sh->disks, i;
2646	struct bio *return_bi = NULL;
2647	struct stripe_head_state s;
2648	struct r5dev *dev;
2649	unsigned long pending = 0;
2650	mdk_rdev_t *blocked_rdev = NULL;
2651	int prexor;
2652
2653	memset(&s, 0, sizeof(s));
2654	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
2655		"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
2656		atomic_read(&sh->count), sh->pd_idx,
2657		sh->ops.pending, sh->ops.ack, sh->ops.complete);
2658
2659	spin_lock(&sh->lock);
2660	clear_bit(STRIPE_HANDLE, &sh->state);
2661	clear_bit(STRIPE_DELAYED, &sh->state);
2662
2663	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2664	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2665	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2666	/* Now to look around and see what can be done */
2667
2668	/* clean-up completed biofill operations */
2669	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2670		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2671		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2672		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2673	}
2674
2675	rcu_read_lock();
2676	for (i=disks; i--; ) {
2677		mdk_rdev_t *rdev;
2678		struct r5dev *dev = &sh->dev[i];
2679		clear_bit(R5_Insync, &dev->flags);
2680
2681		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2682			"written %p\n",	i, dev->flags, dev->toread, dev->read,
2683			dev->towrite, dev->written);
2684
2685		/* maybe we can request a biofill operation
2686		 *
2687		 * new wantfill requests are only permitted while
2688		 * STRIPE_OP_BIOFILL is clear
2689		 */
2690		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2691			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2692			set_bit(R5_Wantfill, &dev->flags);
2693
2694		/* now count some things */
2695		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2696		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2697		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2698
2699		if (test_bit(R5_Wantfill, &dev->flags))
2700			s.to_fill++;
2701		else if (dev->toread)
2702			s.to_read++;
2703		if (dev->towrite) {
2704			s.to_write++;
2705			if (!test_bit(R5_OVERWRITE, &dev->flags))
2706				s.non_overwrite++;
2707		}
2708		if (dev->written)
2709			s.written++;
2710		rdev = rcu_dereference(conf->disks[i].rdev);
2711		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2712			blocked_rdev = rdev;
2713			atomic_inc(&rdev->nr_pending);
2714			break;
2715		}
2716		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2717			/* The ReadError flag will just be confusing now */
2718			clear_bit(R5_ReadError, &dev->flags);
2719			clear_bit(R5_ReWrite, &dev->flags);
2720		}
2721		if (!rdev || !test_bit(In_sync, &rdev->flags)
2722		    || test_bit(R5_ReadError, &dev->flags)) {
2723			s.failed++;
2724			s.failed_num = i;
2725		} else
2726			set_bit(R5_Insync, &dev->flags);
2727	}
2728	rcu_read_unlock();
2729
2730	if (unlikely(blocked_rdev)) {
2731		set_bit(STRIPE_HANDLE, &sh->state);
2732		goto unlock;
2733	}
2734
2735	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2736		sh->ops.count++;
2737
2738	pr_debug("locked=%d uptodate=%d to_read=%d"
2739		" to_write=%d failed=%d failed_num=%d\n",
2740		s.locked, s.uptodate, s.to_read, s.to_write,
2741		s.failed, s.failed_num);
2742	/* check if the array has lost two devices and, if so, some requests might
2743	 * need to be failed
2744	 */
2745	if (s.failed > 1 && s.to_read+s.to_write+s.written)
2746		handle_requests_to_failed_array(conf, sh, &s, disks,
2747						&return_bi);
2748	if (s.failed > 1 && s.syncing) {
2749		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2750		clear_bit(STRIPE_SYNCING, &sh->state);
2751		s.syncing = 0;
2752	}
2753
2754	/* might be able to return some write requests if the parity block
2755	 * is safe, or on a failed drive
2756	 */
2757	dev = &sh->dev[sh->pd_idx];
2758	if ( s.written &&
2759	     ((test_bit(R5_Insync, &dev->flags) &&
2760	       !test_bit(R5_LOCKED, &dev->flags) &&
2761	       test_bit(R5_UPTODATE, &dev->flags)) ||
2762	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
2763		handle_completed_write_requests(conf, sh, disks, &return_bi);
2764
2765	/* Now we might consider reading some blocks, either to check/generate
2766	 * parity, or to satisfy requests
2767	 * or to load a block that is being partially written.
2768	 */
2769	if (s.to_read || s.non_overwrite ||
2770	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
2771	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
2772		handle_issuing_new_read_requests5(sh, &s, disks);
2773
2774	/* Now we check to see if any write operations have recently
2775	 * completed
2776	 */
2777
2778	/* leave prexor set until postxor is done, allows us to distinguish
2779	 * a rmw from a rcw during biodrain
2780	 */
2781	prexor = 0;
2782	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
2783		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2784
2785		prexor = 1;
2786		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
2787		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
2788		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
2789
2790		for (i = disks; i--; )
2791			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2792	}
2793
2794	/* if only POSTXOR is set then this is an 'expand' postxor */
2795	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2796		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2797
2798		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2799		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2800		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2801
2802		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2803		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2804		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2805
2806		/* All the 'written' buffers and the parity block are ready to
2807		 * be written back to disk
2808		 */
2809		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
2810		for (i = disks; i--; ) {
2811			dev = &sh->dev[i];
2812			if (test_bit(R5_LOCKED, &dev->flags) &&
2813				(i == sh->pd_idx || dev->written)) {
2814				pr_debug("Writing block %d\n", i);
2815				set_bit(R5_Wantwrite, &dev->flags);
2816				if (!test_and_set_bit(
2817				    STRIPE_OP_IO, &sh->ops.pending))
2818					sh->ops.count++;
2819				if (prexor)
2820					continue;
2821				if (!test_bit(R5_Insync, &dev->flags) ||
2822				    (i == sh->pd_idx && s.failed == 0))
2823					set_bit(STRIPE_INSYNC, &sh->state);
2824			}
2825		}
2826		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2827			atomic_dec(&conf->preread_active_stripes);
2828			if (atomic_read(&conf->preread_active_stripes) <
2829				IO_THRESHOLD)
2830				md_wakeup_thread(conf->mddev->thread);
2831		}
2832	}
2833
2834	/* Now to consider new write requests and what else, if anything
2835	 * should be read.  We do not handle new writes when:
2836	 * 1/ A 'write' operation (copy+xor) is already in flight.
2837	 * 2/ A 'check' operation is in flight, as it may clobber the parity
2838	 *    block.
2839	 */
2840	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
2841			  !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
2842		handle_issuing_new_write_requests5(conf, sh, &s, disks);
2843
2844	/* maybe we need to check and possibly fix the parity for this stripe
2845	 * Any reads will already have been scheduled, so we just see if enough
2846	 * data is available.  The parity check is held off while parity
2847	 * dependent operations are in flight.
2848	 */
2849	if ((s.syncing && s.locked == 0 &&
2850	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2851	     !test_bit(STRIPE_INSYNC, &sh->state)) ||
2852	      test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
2853	      test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2854		handle_parity_checks5(conf, sh, &s, disks);
2855
2856	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2857		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2858		clear_bit(STRIPE_SYNCING, &sh->state);
2859	}
2860
2861	/* If the failed drive is just a ReadError, then we might need to progress
2862	 * the repair/check process
2863	 */
2864	if (s.failed == 1 && !conf->mddev->ro &&
2865	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
2866	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
2867	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
2868		) {
2869		dev = &sh->dev[s.failed_num];
2870		if (!test_bit(R5_ReWrite, &dev->flags)) {
2871			set_bit(R5_Wantwrite, &dev->flags);
2872			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2873				sh->ops.count++;
2874			set_bit(R5_ReWrite, &dev->flags);
2875			set_bit(R5_LOCKED, &dev->flags);
2876			s.locked++;
2877		} else {
2878			/* let's read it back */
2879			set_bit(R5_Wantread, &dev->flags);
2880			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2881				sh->ops.count++;
2882			set_bit(R5_LOCKED, &dev->flags);
2883			s.locked++;
2884		}
2885	}
2886
2887	/* Finish postxor operations initiated by the expansion
2888	 * process
2889	 */
2890	if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2891		!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2892
2893		clear_bit(STRIPE_EXPANDING, &sh->state);
2894
2895		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2896		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2897		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2898
2899		for (i = conf->raid_disks; i--; ) {
2900			set_bit(R5_Wantwrite, &sh->dev[i].flags);
2901			set_bit(R5_LOCKED, &sh->dev[i].flags);
2902			s.locked++;
2903			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2904				sh->ops.count++;
2905		}
2906	}
2907
2908	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2909		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2910		/* Need to write out all blocks after computing parity */
2911		sh->disks = conf->raid_disks;
2912		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2913			conf->raid_disks);
2914		s.locked += handle_write_operations5(sh, 1, 1);
2915	} else if (s.expanded &&
2916		   s.locked == 0 &&
2917		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2918		clear_bit(STRIPE_EXPAND_READY, &sh->state);
2919		atomic_dec(&conf->reshape_stripes);
2920		wake_up(&conf->wait_for_overlap);
2921		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2922	}
2923
2924	if (s.expanding && s.locked == 0 &&
2925	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
2926		handle_stripe_expansion(conf, sh, NULL);
2927
2928	if (sh->ops.count)
2929		pending = get_stripe_work(sh);
2930
2931 unlock:
2932	spin_unlock(&sh->lock);
2933
2934	/* wait for this device to become unblocked */
2935	if (unlikely(blocked_rdev))
2936		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2937
2938	if (pending)
2939		raid5_run_ops(sh, pending);
2940
2941	return_io(return_bi);
2942
2943}
2944
2945static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2946{
2947	raid6_conf_t *conf = sh->raid_conf;
2948	int disks = sh->disks;
2949	struct bio *return_bi = NULL;
2950	int i, pd_idx = sh->pd_idx;
2951	struct stripe_head_state s;
2952	struct r6_state r6s;
2953	struct r5dev *dev, *pdev, *qdev;
2954	mdk_rdev_t *blocked_rdev = NULL;
2955
2956	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2957	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
2958		"pd_idx=%d, qd_idx=%d\n",
2959	       (unsigned long long)sh->sector, sh->state,
2960	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2961	memset(&s, 0, sizeof(s));
2962
2963	spin_lock(&sh->lock);
2964	clear_bit(STRIPE_HANDLE, &sh->state);
2965	clear_bit(STRIPE_DELAYED, &sh->state);
2966
2967	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2968	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2969	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2970	/* Now to look around and see what can be done */
2971
2972	rcu_read_lock();
2973	for (i=disks; i--; ) {
2974		mdk_rdev_t *rdev;
2975		dev = &sh->dev[i];
2976		clear_bit(R5_Insync, &dev->flags);
2977
2978		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
2979			i, dev->flags, dev->toread, dev->towrite, dev->written);
2980		/* maybe we can reply to a read */
2981		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
2982			struct bio *rbi, *rbi2;
2983			pr_debug("Return read for disc %d\n", i);
2984			spin_lock_irq(&conf->device_lock);
2985			rbi = dev->toread;
2986			dev->toread = NULL;
2987			if (test_and_clear_bit(R5_Overlap, &dev->flags))
2988				wake_up(&conf->wait_for_overlap);
2989			spin_unlock_irq(&conf->device_lock);
2990			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2991				copy_data(0, rbi, dev->page, dev->sector);
2992				rbi2 = r5_next_bio(rbi, dev->sector);
2993				spin_lock_irq(&conf->device_lock);
2994				if (--rbi->bi_phys_segments == 0) {
2995					rbi->bi_next = return_bi;
2996					return_bi = rbi;
2997				}
2998				spin_unlock_irq(&conf->device_lock);
2999				rbi = rbi2;
3000			}
3001		}
3002
3003		/* now count some things */
3004		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3005		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3006
3007
3008		if (dev->toread)
3009			s.to_read++;
3010		if (dev->towrite) {
3011			s.to_write++;
3012			if (!test_bit(R5_OVERWRITE, &dev->flags))
3013				s.non_overwrite++;
3014		}
3015		if (dev->written)
3016			s.written++;
3017		rdev = rcu_dereference(conf->disks[i].rdev);
3018		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3019			blocked_rdev = rdev;
3020			atomic_inc(&rdev->nr_pending);
3021			break;
3022		}
3023		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
3024			/* The ReadError flag will just be confusing now */
3025			clear_bit(R5_ReadError, &dev->flags);
3026			clear_bit(R5_ReWrite, &dev->flags);
3027		}
3028		if (!rdev || !test_bit(In_sync, &rdev->flags)
3029		    || test_bit(R5_ReadError, &dev->flags)) {
3030			if (s.failed < 2)
3031				r6s.failed_num[s.failed] = i;
3032			s.failed++;
3033		} else
3034			set_bit(R5_Insync, &dev->flags);
3035	}
3036	rcu_read_unlock();
3037
3038	if (unlikely(blocked_rdev)) {
3039		set_bit(STRIPE_HANDLE, &sh->state);
3040		goto unlock;
3041	}
3042	pr_debug("locked=%d uptodate=%d to_read=%d"
3043	       " to_write=%d failed=%d failed_num=%d,%d\n",
3044	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3045	       r6s.failed_num[0], r6s.failed_num[1]);
3046	/* check if the array has lost >2 devices and, if so, some requests
3047	 * might need to be failed
3048	 */
3049	if (s.failed > 2 && s.to_read+s.to_write+s.written)
3050		handle_requests_to_failed_array(conf, sh, &s, disks,
3051						&return_bi);
3052	if (s.failed > 2 && s.syncing) {
3053		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3054		clear_bit(STRIPE_SYNCING, &sh->state);
3055		s.syncing = 0;
3056	}
3057
3058	/*
3059	 * might be able to return some write requests if the parity blocks
3060	 * are safe, or on a failed drive
3061	 */
3062	pdev = &sh->dev[pd_idx];
3063	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
3064		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
3065	qdev = &sh->dev[r6s.qd_idx];
3066	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
3067		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
3068
3069	if ( s.written &&
3070	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3071			     && !test_bit(R5_LOCKED, &pdev->flags)
3072			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3073	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3074			     && !test_bit(R5_LOCKED, &qdev->flags)
3075			     && test_bit(R5_UPTODATE, &qdev->flags)))))
3076		handle_completed_write_requests(conf, sh, disks, &return_bi);
3077
3078	/* Now we might consider reading some blocks, either to check/generate
3079	 * parity, or to satisfy requests
3080	 * or to load a block that is being partially written.
3081	 */
3082	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3083	    (s.syncing && (s.uptodate < disks)) || s.expanding)
3084		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
3085
3086	/* now to consider writing and what else, if anything should be read */
3087	if (s.to_write)
3088		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
3089
3090	/* maybe we need to check and possibly fix the parity for this stripe
3091	 * Any reads will already have been scheduled, so we just see if enough
3092	 * data is available
3093	 */
3094	if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
3095		handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
3096
3097	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3098		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3099		clear_bit(STRIPE_SYNCING, &sh->state);
3100	}
3101
3102	/* If the failed drives are just a ReadError, then we might need
3103	 * to progress the repair/check process
3104	 */
3105	if (s.failed <= 2 && !conf->mddev->ro)
3106		for (i = 0; i < s.failed; i++) {
3107			dev = &sh->dev[r6s.failed_num[i]];
3108			if (test_bit(R5_ReadError, &dev->flags)
3109			    && !test_bit(R5_LOCKED, &dev->flags)
3110			    && test_bit(R5_UPTODATE, &dev->flags)
3111				) {
3112				if (!test_bit(R5_ReWrite, &dev->flags)) {
3113					set_bit(R5_Wantwrite, &dev->flags);
3114					set_bit(R5_ReWrite, &dev->flags);
3115					set_bit(R5_LOCKED, &dev->flags);
3116				} else {
3117					/* let's read it back */
3118					set_bit(R5_Wantread, &dev->flags);
3119					set_bit(R5_LOCKED, &dev->flags);
3120				}
3121			}
3122		}
3123
3124	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3125		/* Need to write out all blocks after computing P&Q */
3126		sh->disks = conf->raid_disks;
3127		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
3128					     conf->raid_disks);
3129		compute_parity6(sh, RECONSTRUCT_WRITE);
3130		for (i = conf->raid_disks ; i-- ;  ) {
3131			set_bit(R5_LOCKED, &sh->dev[i].flags);
3132			s.locked++;
3133			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3134		}
3135		clear_bit(STRIPE_EXPANDING, &sh->state);
3136	} else if (s.expanded) {
3137		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3138		atomic_dec(&conf->reshape_stripes);
3139		wake_up(&conf->wait_for_overlap);
3140		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3141	}
3142
3143	if (s.expanding && s.locked == 0 &&
3144	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
3145		handle_stripe_expansion(conf, sh, &r6s);
3146
3147 unlock:
3148	spin_unlock(&sh->lock);
3149
3150	/* wait for this device to become unblocked */
3151	if (unlikely(blocked_rdev))
3152		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3153
3154	return_io(return_bi);
3155
3156	for (i=disks; i-- ;) {
3157		int rw;
3158		struct bio *bi;
3159		mdk_rdev_t *rdev;
3160		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3161			rw = WRITE;
3162		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3163			rw = READ;
3164		else
3165			continue;
3166
3167		set_bit(STRIPE_IO_STARTED, &sh->state);
3168
3169		bi = &sh->dev[i].req;
3170
3171		bi->bi_rw = rw;
3172		if (rw == WRITE)
3173			bi->bi_end_io = raid5_end_write_request;
3174		else
3175			bi->bi_end_io = raid5_end_read_request;
3176
3177		rcu_read_lock();
3178		rdev = rcu_dereference(conf->disks[i].rdev);
3179		if (rdev && test_bit(Faulty, &rdev->flags))
3180			rdev = NULL;
3181		if (rdev)
3182			atomic_inc(&rdev->nr_pending);
3183		rcu_read_unlock();
3184
3185		if (rdev) {
3186			if (s.syncing || s.expanding || s.expanded)
3187				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3188
3189			bi->bi_bdev = rdev->bdev;
3190			pr_debug("for %llu schedule op %ld on disc %d\n",
3191				(unsigned long long)sh->sector, bi->bi_rw, i);
3192			atomic_inc(&sh->count);
3193			bi->bi_sector = sh->sector + rdev->data_offset;
3194			bi->bi_flags = 1 << BIO_UPTODATE;
3195			bi->bi_vcnt = 1;
3196			bi->bi_max_vecs = 1;
3197			bi->bi_idx = 0;
3198			bi->bi_io_vec = &sh->dev[i].vec;
3199			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3200			bi->bi_io_vec[0].bv_offset = 0;
3201			bi->bi_size = STRIPE_SIZE;
3202			bi->bi_next = NULL;
3203			if (rw == WRITE &&
3204			    test_bit(R5_ReWrite, &sh->dev[i].flags))
3205				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3206			generic_make_request(bi);
3207		} else {
3208			if (rw == WRITE)
3209				set_bit(STRIPE_DEGRADED, &sh->state);
3210			pr_debug("skip op %ld on disc %d for sector %llu\n",
3211				bi->bi_rw, i, (unsigned long long)sh->sector);
3212			clear_bit(R5_LOCKED, &sh->dev[i].flags);
3213			set_bit(STRIPE_HANDLE, &sh->state);
3214		}
3215	}
3216}
3217
3218static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
3219{
3220	if (sh->raid_conf->level == 6)
3221		handle_stripe6(sh, tmp_page);
3222	else
3223		handle_stripe5(sh);
3224}
3225
3226
3227
3228static void raid5_activate_delayed(raid5_conf_t *conf)
3229{
3230	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3231		while (!list_empty(&conf->delayed_list)) {
3232			struct list_head *l = conf->delayed_list.next;
3233			struct stripe_head *sh;
3234			sh = list_entry(l, struct stripe_head, lru);
3235			list_del_init(l);
3236			clear_bit(STRIPE_DELAYED, &sh->state);
3237			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3238				atomic_inc(&conf->preread_active_stripes);
3239			list_add_tail(&sh->lru, &conf->hold_list);
3240		}
3241	} else
3242		blk_plug_device(conf->mddev->queue);
3243}
3244
3245static void activate_bit_delay(raid5_conf_t *conf)
3246{
3247	/* device_lock is held */
3248	struct list_head head;
3249	list_add(&head, &conf->bitmap_list);
3250	list_del_init(&conf->bitmap_list);
3251	while (!list_empty(&head)) {
3252		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3253		list_del_init(&sh->lru);
3254		atomic_inc(&sh->count);
3255		__release_stripe(conf, sh);
3256	}
3257}
3258
3259static void unplug_slaves(mddev_t *mddev)
3260{
3261	raid5_conf_t *conf = mddev_to_conf(mddev);
3262	int i;
3263
3264	rcu_read_lock();
3265	for (i=0; i<mddev->raid_disks; i++) {
3266		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3267		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3268			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3269
3270			atomic_inc(&rdev->nr_pending);
3271			rcu_read_unlock();
3272
3273			blk_unplug(r_queue);
3274
3275			rdev_dec_pending(rdev, mddev);
3276			rcu_read_lock();
3277		}
3278	}
3279	rcu_read_unlock();
3280}
3281
3282static void raid5_unplug_device(struct request_queue *q)
3283{
3284	mddev_t *mddev = q->queuedata;
3285	raid5_conf_t *conf = mddev_to_conf(mddev);
3286	unsigned long flags;
3287
3288	spin_lock_irqsave(&conf->device_lock, flags);
3289
3290	if (blk_remove_plug(q)) {
3291		conf->seq_flush++;
3292		raid5_activate_delayed(conf);
3293	}
3294	md_wakeup_thread(mddev->thread);
3295
3296	spin_unlock_irqrestore(&conf->device_lock, flags);
3297
3298	unplug_slaves(mddev);
3299}
3300
3301static int raid5_congested(void *data, int bits)
3302{
3303	mddev_t *mddev = data;
3304	raid5_conf_t *conf = mddev_to_conf(mddev);
3305
3306	/* No difference between reads and writes.  Just check
3307	 * how busy the stripe_cache is
3308	 */
3309	if (conf->inactive_blocked)
3310		return 1;
3311	if (conf->quiesce)
3312		return 1;
3313	if (list_empty_careful(&conf->inactive_list))
3314		return 1;
3315
3316	return 0;
3317}
3318
3319/* We want read requests to align with chunks where possible,
3320 * but write requests don't need to.
3321 */
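/* Illustrative numbers for the arithmetic below: with a 64KiB chunk
 * (chunk_sectors = 128), a read bio that starts 100 sectors into a
 * chunk and already holds 20 sectors ends at offset 120, so at most
 * (128 - 120) << 9 = 4096 bytes may still be merged before the chunk
 * boundary.  An empty bio (bio_sectors == 0) is always granted one
 * bvec, even if that bvec crosses the boundary.
 */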
3322static int raid5_mergeable_bvec(struct request_queue *q,
3323				struct bvec_merge_data *bvm,
3324				struct bio_vec *biovec)
3325{
3326	mddev_t *mddev = q->queuedata;
3327	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3328	int max;
3329	unsigned int chunk_sectors = mddev->chunk_size >> 9;
3330	unsigned int bio_sectors = bvm->bi_size >> 9;
3331
3332	if ((bvm->bi_rw & 1) == WRITE)
3333		return biovec->bv_len; /* always allow writes to be mergeable */
3334
3335	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3336	if (max < 0) max = 0;
3337	if (max <= biovec->bv_len && bio_sectors == 0)
3338		return biovec->bv_len;
3339	else
3340		return max;
3341}
3342
3343
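/* in_chunk_boundary - true when the whole bio fits inside a single
 * chunk, i.e. the sectors left in the chunk at bio->bi_sector are at
 * least as many as the bio carries.  Only such reads are candidates
 * for the stripe-cache bypass in chunk_aligned_read().
 */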
3344static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3345{
3346	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3347	unsigned int chunk_sectors = mddev->chunk_size >> 9;
3348	unsigned int bio_sectors = bio->bi_size >> 9;
3349
3350	return  chunk_sectors >=
3351		((sector & (chunk_sectors - 1)) + bio_sectors);
3352}
3353
3354/*
3355 *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
3356 *  later sampled by raid5d.
3357 */
3358static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3359{
3360	unsigned long flags;
3361
3362	spin_lock_irqsave(&conf->device_lock, flags);
3363
3364	bi->bi_next = conf->retry_read_aligned_list;
3365	conf->retry_read_aligned_list = bi;
3366
3367	spin_unlock_irqrestore(&conf->device_lock, flags);
3368	md_wakeup_thread(conf->mddev->thread);
3369}
3370
3371
3372static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3373{
3374	struct bio *bi;
3375
3376	bi = conf->retry_read_aligned;
3377	if (bi) {
3378		conf->retry_read_aligned = NULL;
3379		return bi;
3380	}
3381	bi = conf->retry_read_aligned_list;
3382	if (bi) {
3383		conf->retry_read_aligned_list = bi->bi_next;
3384		bi->bi_next = NULL;
3385		bi->bi_phys_segments = 1; /* biased count of active stripes */
3386		bi->bi_hw_segments = 0; /* count of processed stripes */
3387	}
3388
3389	return bi;
3390}
3391
3392
3393/*
3394 *  The "raid5_align_endio" should check if the read succeeded and if it
3395 *  did, call bio_endio on the original bio (having bio_put the new bio
3396 *  first).
3397 *  If the read failed, the bio is handed to add_bio_to_retry() and retried later.
3398 */
3399static void raid5_align_endio(struct bio *bi, int error)
3400{
3401	struct bio* raid_bi  = bi->bi_private;
3402	mddev_t *mddev;
3403	raid5_conf_t *conf;
3404	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3405	mdk_rdev_t *rdev;
3406
3407	bio_put(bi);
3408
3409	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3410	conf = mddev_to_conf(mddev);
3411	rdev = (void*)raid_bi->bi_next;
3412	raid_bi->bi_next = NULL;
3413
3414	rdev_dec_pending(rdev, conf->mddev);
3415
3416	if (!error && uptodate) {
3417		bio_endio(raid_bi, 0);
3418		if (atomic_dec_and_test(&conf->active_aligned_reads))
3419			wake_up(&conf->wait_for_stripe);
3420		return;
3421	}
3422
3423
3424	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3425
3426	add_bio_to_retry(raid_bi, conf);
3427}
3428
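/* bio_fits_rdev - check that the cloned, retargeted bio respects the
 * member device's queue limits (max sectors, physical and hw segment
 * counts), and give up when a merge_bvec_fn would have to be consulted.
 */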
3429static int bio_fits_rdev(struct bio *bi)
3430{
3431	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3432
3433	if ((bi->bi_size>>9) > q->max_sectors)
3434		return 0;
3435	blk_recount_segments(q, bi);
3436	if (bi->bi_phys_segments > q->max_phys_segments ||
3437	    bi->bi_hw_segments > q->max_hw_segments)
3438		return 0;
3439
3440	if (q->merge_bvec_fn)
3441		/* it's too hard to apply the merge_bvec_fn at this stage,
3442		 * just give up
3443		 */
3444		return 0;
3445
3446	return 1;
3447}
3448
3449
3450static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3451{
3452	mddev_t *mddev = q->queuedata;
3453	raid5_conf_t *conf = mddev_to_conf(mddev);
3454	const unsigned int raid_disks = conf->raid_disks;
3455	const unsigned int data_disks = raid_disks - conf->max_degraded;
3456	unsigned int dd_idx, pd_idx;
3457	struct bio* align_bi;
3458	mdk_rdev_t *rdev;
3459
3460	if (!in_chunk_boundary(mddev, raid_bio)) {
3461		pr_debug("chunk_aligned_read : non aligned\n");
3462		return 0;
3463	}
3464	/*
3465 	 * use bio_clone to make a copy of the bio
3466	 */
3467	align_bi = bio_clone(raid_bio, GFP_NOIO);
3468	if (!align_bi)
3469		return 0;
3470	/*
3471	 *   set bi_end_io to a new function, and set bi_private to the
3472	 *     original bio.
3473	 */
3474	align_bi->bi_end_io  = raid5_align_endio;
3475	align_bi->bi_private = raid_bio;
3476	/*
3477	 *	compute position
3478	 */
3479	align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
3480					raid_disks,
3481					data_disks,
3482					&dd_idx,
3483					&pd_idx,
3484					conf);
3485
3486	rcu_read_lock();
3487	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3488	if (rdev && test_bit(In_sync, &rdev->flags)) {
3489		atomic_inc(&rdev->nr_pending);
3490		rcu_read_unlock();
3491		raid_bio->bi_next = (void*)rdev;
3492		align_bi->bi_bdev =  rdev->bdev;
3493		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3494		align_bi->bi_sector += rdev->data_offset;
3495
3496		if (!bio_fits_rdev(align_bi)) {
3497			/* too big in some way */
3498			bio_put(align_bi);
3499			rdev_dec_pending(rdev, mddev);
3500			return 0;
3501		}
3502
3503		spin_lock_irq(&conf->device_lock);
3504		wait_event_lock_irq(conf->wait_for_stripe,
3505				    conf->quiesce == 0,
3506				    conf->device_lock, /* nothing */);
3507		atomic_inc(&conf->active_aligned_reads);
3508		spin_unlock_irq(&conf->device_lock);
3509
3510		generic_make_request(align_bi);
3511		return 1;
3512	} else {
3513		rcu_read_unlock();
3514		bio_put(align_bi);
3515		return 0;
3516	}
3517}
3518
3519/* __get_priority_stripe - get the next stripe to process
3520 *
3521 * Full stripe writes are allowed to pass preread active stripes up until
3522 * the bypass_threshold is exceeded.  In general the bypass_count
3523 * increments when the handle_list is handled before the hold_list; however, it
3524 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
3525 * stripe with in flight i/o.  The bypass_count will be reset when the
3526 * head of the hold_list has changed, i.e. the head was promoted to the
3527 * handle_list.
3528 */
3529static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3530{
3531	struct stripe_head *sh;
3532
3533	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3534		  __func__,
3535		  list_empty(&conf->handle_list) ? "empty" : "busy",
3536		  list_empty(&conf->hold_list) ? "empty" : "busy",
3537		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
3538
3539	if (!list_empty(&conf->handle_list)) {
3540		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3541
3542		if (list_empty(&conf->hold_list))
3543			conf->bypass_count = 0;
3544		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3545			if (conf->hold_list.next == conf->last_hold)
3546				conf->bypass_count++;
3547			else {
3548				conf->last_hold = conf->hold_list.next;
3549				conf->bypass_count -= conf->bypass_threshold;
3550				if (conf->bypass_count < 0)
3551					conf->bypass_count = 0;
3552			}
3553		}
3554	} else if (!list_empty(&conf->hold_list) &&
3555		   ((conf->bypass_threshold &&
3556		     conf->bypass_count > conf->bypass_threshold) ||
3557		    atomic_read(&conf->pending_full_writes) == 0)) {
3558		sh = list_entry(conf->hold_list.next,
3559				typeof(*sh), lru);
3560		conf->bypass_count -= conf->bypass_threshold;
3561		if (conf->bypass_count < 0)
3562			conf->bypass_count = 0;
3563	} else
3564		return NULL;
3565
3566	list_del_init(&sh->lru);
3567	atomic_inc(&sh->count);
3568	BUG_ON(atomic_read(&sh->count) != 1);
3569	return sh;
3570}
3571
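/* make_request - main entry point for I/O submitted to the array.
 *  Reads that are aligned within one chunk may bypass the stripe cache;
 *  everything else is split into STRIPE_SECTORS sized pieces, each piece
 *  is attached to the stripe_head that covers it, and the stripe is then
 *  marked for handling by raid5d.
 */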
3572static int make_request(struct request_queue *q, struct bio * bi)
3573{
3574	mddev_t *mddev = q->queuedata;
3575	raid5_conf_t *conf = mddev_to_conf(mddev);
3576	unsigned int dd_idx, pd_idx;
3577	sector_t new_sector;
3578	sector_t logical_sector, last_sector;
3579	struct stripe_head *sh;
3580	const int rw = bio_data_dir(bi);
3581	int remaining;
3582
3583	if (unlikely(bio_barrier(bi))) {
3584		bio_endio(bi, -EOPNOTSUPP);
3585		return 0;
3586	}
3587
3588	md_write_start(mddev, bi);
3589
3590	disk_stat_inc(mddev->gendisk, ios[rw]);
3591	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
3592
3593	if (rw == READ &&
3594	    mddev->reshape_position == MaxSector &&
3595	    chunk_aligned_read(q, bi))
3596		return 0;
3597
3598	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3599	last_sector = bi->bi_sector + (bi->bi_size>>9);
3600	bi->bi_next = NULL;
3601	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
3602
3603	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3604		DEFINE_WAIT(w);
3605		int disks, data_disks;
3606
3607	retry:
3608		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3609		if (likely(conf->expand_progress == MaxSector))
3610			disks = conf->raid_disks;
3611		else {
3612			/* The spinlock is needed as expand_progress may be
3613			 * 64bit on a 32bit platform, and so it might be
3614			 * possible to see a half-updated value.
3615			 * Of course expand_progress could change after
3616			 * the lock is dropped, so once we get a reference
3617			 * to the stripe that we think we want, we will have
3618			 * to check again.
3619			 */
3620			spin_lock_irq(&conf->device_lock);
3621			disks = conf->raid_disks;
3622			if (logical_sector >= conf->expand_progress)
3623				disks = conf->previous_raid_disks;
3624			else {
3625				if (logical_sector >= conf->expand_lo) {
3626					spin_unlock_irq(&conf->device_lock);
3627					schedule();
3628					goto retry;
3629				}
3630			}
3631			spin_unlock_irq(&conf->device_lock);
3632		}
3633		data_disks = disks - conf->max_degraded;
3634
3635 		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
3636						  &dd_idx, &pd_idx, conf);
3637		pr_debug("raid5: make_request, sector %llu logical %llu\n",
3638			(unsigned long long)new_sector,
3639			(unsigned long long)logical_sector);
3640
3641		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
3642		if (sh) {
3643			if (unlikely(conf->expand_progress != MaxSector)) {
3644				/* expansion might have moved on while waiting for a
3645				 * stripe, so we must do the range check again.
3646				 * Expansion could still move past after this
3647				 * test, but as we are holding a reference to
3648				 * 'sh', we know that if that happens,
3649				 *  STRIPE_EXPANDING will get set and the expansion
3650				 * won't proceed until we finish with the stripe.
3651				 */
3652				int must_retry = 0;
3653				spin_lock_irq(&conf->device_lock);
3654				if (logical_sector <  conf->expand_progress &&
3655				    disks == conf->previous_raid_disks)
3656					/* mismatch, need to try again */
3657					must_retry = 1;
3658				spin_unlock_irq(&conf->device_lock);
3659				if (must_retry) {
3660					release_stripe(sh);
3661					goto retry;
3662				}
3663			}
3664			/* FIXME what if we get a false positive because these
3665			 * values are being updated?
3666			 */
3667			if (logical_sector >= mddev->suspend_lo &&
3668			    logical_sector < mddev->suspend_hi) {
3669				release_stripe(sh);
3670				schedule();
3671				goto retry;
3672			}
3673
3674			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3675			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
3676				/* Stripe is busy expanding or
3677				 * add failed due to overlap.  Flush everything
3678				 * and wait a while
3679				 */
3680				raid5_unplug_device(mddev->queue);
3681				release_stripe(sh);
3682				schedule();
3683				goto retry;
3684			}
3685			finish_wait(&conf->wait_for_overlap, &w);
3686			set_bit(STRIPE_HANDLE, &sh->state);
3687			clear_bit(STRIPE_DELAYED, &sh->state);
3688			release_stripe(sh);
3689		} else {
3690			/* cannot get a stripe for read-ahead, just give up */
3691			clear_bit(BIO_UPTODATE, &bi->bi_flags);
3692			finish_wait(&conf->wait_for_overlap, &w);
3693			break;
3694		}
3695
3696	}
3697	spin_lock_irq(&conf->device_lock);
3698	remaining = --bi->bi_phys_segments;
3699	spin_unlock_irq(&conf->device_lock);
3700	if (remaining == 0) {
3701
3702		if (rw == WRITE)
3703			md_write_end(mddev);
3704
3705		bi->bi_end_io(bi,
3706			      test_bit(BIO_UPTODATE, &bi->bi_flags)
3707			        ? 0 : -EIO);
3708	}
3709	return 0;
3710}
3711
3712static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
3713{
3714	/* Reshaping is quite different from recovery/resync, so it is
3715	 * handled separately ... here.
3716	 *
3717	 * On each call to sync_request, we gather one chunk worth of
3718	 * destination stripes and flag them as expanding.
3719	 * Then we find all the source stripes and request reads.
3720	 * As the reads complete, handle_stripe will copy the data
3721	 * into the destination stripe and release that stripe.
3722	 */
3723	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3724	struct stripe_head *sh;
3725	int pd_idx;
3726	sector_t first_sector, last_sector;
3727	int raid_disks = conf->previous_raid_disks;
3728	int data_disks = raid_disks - conf->max_degraded;
3729	int new_data_disks = conf->raid_disks - conf->max_degraded;
3730	int i;
3731	int dd_idx;
3732	sector_t writepos, safepos, gap;
3733
3734	if (sector_nr == 0 &&
3735	    conf->expand_progress != 0) {
3736		/* restarting in the middle, skip the initial sectors */
3737		sector_nr = conf->expand_progress;
3738		sector_div(sector_nr, new_data_disks);
3739		*skipped = 1;
3740		return sector_nr;
3741	}
3742
3743	/* We update the metadata when there is more than 3Meg
3744	 * in the block range (that is rather arbitrary; it should
3745	 * probably be time based), or when the data about to be
3746	 * copied would over-write the source of the data at
3747	 * the front of the range, i.e. when one new_stripe forward
3748	 * from expand_progress maps (in the new layout) to after
3749	 * where expand_lo maps in the old layout.
3750	 */
3751	writepos = conf->expand_progress +
3752		conf->chunk_size/512*(new_data_disks);
3753	sector_div(writepos, new_data_disks);
3754	safepos = conf->expand_lo;
3755	sector_div(safepos, data_disks);
3756	gap = conf->expand_progress - conf->expand_lo;
3757
3758	if (writepos >= safepos ||
3759	    gap > (new_data_disks)*3000*2 /*3Meg*/) {
3760		/* Cannot proceed until we've updated the superblock... */
3761		wait_event(conf->wait_for_overlap,
3762			   atomic_read(&conf->reshape_stripes)==0);
3763		mddev->reshape_position = conf->expand_progress;
3764		set_bit(MD_CHANGE_DEVS, &mddev->flags);
3765		md_wakeup_thread(mddev->thread);
3766		wait_event(mddev->sb_wait, mddev->flags == 0 ||
3767			   kthread_should_stop());
3768		spin_lock_irq(&conf->device_lock);
3769		conf->expand_lo = mddev->reshape_position;
3770		spin_unlock_irq(&conf->device_lock);
3771		wake_up(&conf->wait_for_overlap);
3772	}
3773
3774	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
3775		int j;
3776		int skipped = 0;
3777		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
3778		sh = get_active_stripe(conf, sector_nr+i,
3779				       conf->raid_disks, pd_idx, 0);
3780		set_bit(STRIPE_EXPANDING, &sh->state);
3781		atomic_inc(&conf->reshape_stripes);
3782		/* If any of this stripe is beyond the end of the old
3783		 * array, then we need to zero those blocks
3784		 */
3785		for (j=sh->disks; j--;) {
3786			sector_t s;
3787			if (j == sh->pd_idx)
3788				continue;
3789			if (conf->level == 6 &&
3790			    j == raid6_next_disk(sh->pd_idx, sh->disks))
3791				continue;
3792			s = compute_blocknr(sh, j);
3793			if (s < (mddev->array_size<<1)) {
3794				skipped = 1;
3795				continue;
3796			}
3797			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
3798			set_bit(R5_Expanded, &sh->dev[j].flags);
3799			set_bit(R5_UPTODATE, &sh->dev[j].flags);
3800		}
3801		if (!skipped) {
3802			set_bit(STRIPE_EXPAND_READY, &sh->state);
3803			set_bit(STRIPE_HANDLE, &sh->state);
3804		}
3805		release_stripe(sh);
3806	}
3807	spin_lock_irq(&conf->device_lock);
3808	conf->expand_progress = (sector_nr + i) * new_data_disks;
3809	spin_unlock_irq(&conf->device_lock);
3810	/* Ok, those stripes are ready. We can start scheduling
3811	 * reads on the source stripes.
3812	 * The source stripes are determined by mapping the first and last
3813	 * block on the destination stripes.
3814	 */
3815	first_sector =
3816		raid5_compute_sector(sector_nr*(new_data_disks),
3817				     raid_disks, data_disks,
3818				     &dd_idx, &pd_idx, conf);
3819	last_sector =
3820		raid5_compute_sector((sector_nr+conf->chunk_size/512)
3821				     *(new_data_disks) -1,
3822				     raid_disks, data_disks,
3823				     &dd_idx, &pd_idx, conf);
3824	if (last_sector >= (mddev->size<<1))
3825		last_sector = (mddev->size<<1)-1;
3826	while (first_sector <= last_sector) {
3827		pd_idx = stripe_to_pdidx(first_sector, conf,
3828					 conf->previous_raid_disks);
3829		sh = get_active_stripe(conf, first_sector,
3830				       conf->previous_raid_disks, pd_idx, 0);
3831		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3832		set_bit(STRIPE_HANDLE, &sh->state);
3833		release_stripe(sh);
3834		first_sector += STRIPE_SECTORS;
3835	}
3836	/* If this takes us to the resync_max point where we have to pause,
3837	 * then we need to write out the superblock.
3838	 */
3839	sector_nr += conf->chunk_size>>9;
3840	if (sector_nr >= mddev->resync_max) {
3841		/* Cannot proceed until we've updated the superblock... */
3842		wait_event(conf->wait_for_overlap,
3843			   atomic_read(&conf->reshape_stripes) == 0);
3844		mddev->reshape_position = conf->expand_progress;
3845		set_bit(MD_CHANGE_DEVS, &mddev->flags);
3846		md_wakeup_thread(mddev->thread);
3847		wait_event(mddev->sb_wait,
3848			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3849			   || kthread_should_stop());
3850		spin_lock_irq(&conf->device_lock);
3851		conf->expand_lo = mddev->reshape_position;
3852		spin_unlock_irq(&conf->device_lock);
3853		wake_up(&conf->wait_for_overlap);
3854	}
3855	return conf->chunk_size>>9;
3856}
3857
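/* sync_request - perform one unit of resync/recovery/check work, or hand
 *  off to reshape_request() when a reshape is in progress.  Returns the
 *  number of sectors dealt with; the normal path handles one stripe
 *  (STRIPE_SECTORS) per call.
 */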
3858/* FIXME go_faster isn't used */
3859static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
3860{
3861	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3862	struct stripe_head *sh;
3863	int pd_idx;
3864	int raid_disks = conf->raid_disks;
3865	sector_t max_sector = mddev->size << 1;
3866	int sync_blocks;
3867	int still_degraded = 0;
3868	int i;
3869
3870	if (sector_nr >= max_sector) {
3871		/* just being told to finish up .. nothing much to do */
3872		unplug_slaves(mddev);
3873		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3874			end_reshape(conf);
3875			return 0;
3876		}
3877
3878		if (mddev->curr_resync < max_sector) /* aborted */
3879			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3880					&sync_blocks, 1);
3881		else /* completed sync */
3882			conf->fullsync = 0;
3883		bitmap_close_sync(mddev->bitmap);
3884
3885		return 0;
3886	}
3887
3888	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3889		return reshape_request(mddev, sector_nr, skipped);
3890
3891	/* No need to check resync_max as we never do more than one
3892	 * stripe, and as resync_max will always be on a chunk boundary,
3893	 * if the check in md_do_sync didn't fire, there is no chance
3894	 * of overstepping resync_max here
3895	 */
3896
3897	/* if there are too many failed drives and we are trying
3898	 * to resync, then assert that we are finished, because there is
3899	 * nothing we can do.
3900	 */
3901	if (mddev->degraded >= conf->max_degraded &&
3902	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3903		sector_t rv = (mddev->size << 1) - sector_nr;
3904		*skipped = 1;
3905		return rv;
3906	}
3907	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
3908	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3909	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
3910		/* we can skip this block, and probably more */
3911		sync_blocks /= STRIPE_SECTORS;
3912		*skipped = 1;
3913		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
3914	}
3915
3916
3917	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3918
3919	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
3920	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3921	if (sh == NULL) {
3922		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
3923		/* make sure we don't swamp the stripe cache if someone else
3924		 * is trying to get access
3925		 */
3926		schedule_timeout_uninterruptible(1);
3927	}
3928	/* Need to check if the array will still be degraded after recovery/resync.
3929	 * We don't need to check the 'failed' flag as when that gets set,
3930	 * recovery aborts.
3931	 */
3932	for (i=0; i<mddev->raid_disks; i++)
3933		if (conf->disks[i].rdev == NULL)
3934			still_degraded = 1;
3935
3936	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
3937
3938	spin_lock(&sh->lock);
3939	set_bit(STRIPE_SYNCING, &sh->state);
3940	clear_bit(STRIPE_INSYNC, &sh->state);
3941	spin_unlock(&sh->lock);
3942
3943	handle_stripe(sh, NULL);
3944	release_stripe(sh);
3945
3946	return STRIPE_SECTORS;
3947}
3948
3949static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3950{
3951	/* We may not be able to submit a whole bio at once as there
3952	 * may not be enough stripe_heads available.
3953	 * We cannot pre-allocate enough stripe_heads as we may need
3954	 * more than exist in the cache (if we allow ever larger chunks).
3955	 * So we do one stripe head at a time and record in
3956	 * ->bi_hw_segments how many have been done.
3957	 *
3958	 * We *know* that this entire raid_bio is in one chunk, so
3959	 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
3960	 */
3961	struct stripe_head *sh;
3962	int dd_idx, pd_idx;
3963	sector_t sector, logical_sector, last_sector;
3964	int scnt = 0;
3965	int remaining;
3966	int handled = 0;
3967
3968	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3969	sector = raid5_compute_sector(	logical_sector,
3970					conf->raid_disks,
3971					conf->raid_disks - conf->max_degraded,
3972					&dd_idx,
3973					&pd_idx,
3974					conf);
3975	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3976
3977	for (; logical_sector < last_sector;
3978	     logical_sector += STRIPE_SECTORS,
3979		     sector += STRIPE_SECTORS,
3980		     scnt++) {
3981
3982		if (scnt < raid_bio->bi_hw_segments)
3983			/* already done this stripe */
3984			continue;
3985
3986		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
3987
3988		if (!sh) {
3989			/* failed to get a stripe - must wait */
3990			raid_bio->bi_hw_segments = scnt;
3991			conf->retry_read_aligned = raid_bio;
3992			return handled;
3993		}
3994
3995		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
3996		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
3997			release_stripe(sh);
3998			raid_bio->bi_hw_segments = scnt;
3999			conf->retry_read_aligned = raid_bio;
4000			return handled;
4001		}
4002
4003		handle_stripe(sh, NULL);
4004		release_stripe(sh);
4005		handled++;
4006	}
4007	spin_lock_irq(&conf->device_lock);
4008	remaining = --raid_bio->bi_phys_segments;
4009	spin_unlock_irq(&conf->device_lock);
4010	if (remaining == 0) {
4011
4012		raid_bio->bi_end_io(raid_bio,
4013			      test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
4014			        ? 0 : -EIO);
4015	}
4016	if (atomic_dec_and_test(&conf->active_aligned_reads))
4017		wake_up(&conf->wait_for_stripe);
4018	return handled;
4019}
4020
4021
4022
4023/*
4024 * This is our raid5 kernel thread.
4025 *
4026 * We scan the hash table for stripes which can be handled now.
4027 * During the scan, completed stripes are saved for us by the interrupt
4028 * handler, so that they will not have to wait for our next wakeup.
4029 */
4030static void raid5d(mddev_t *mddev)
4031{
4032	struct stripe_head *sh;
4033	raid5_conf_t *conf = mddev_to_conf(mddev);
4034	int handled;
4035
4036	pr_debug("+++ raid5d active\n");
4037
4038	md_check_recovery(mddev);
4039
4040	handled = 0;
4041	spin_lock_irq(&conf->device_lock);
4042	while (1) {
4043		struct bio *bio;
4044
4045		if (conf->seq_flush != conf->seq_write) {
4046			int seq = conf->seq_flush;
4047			spin_unlock_irq(&conf->device_lock);
4048			bitmap_unplug(mddev->bitmap);
4049			spin_lock_irq(&conf->device_lock);
4050			conf->seq_write = seq;
4051			activate_bit_delay(conf);
4052		}
4053
4054		while ((bio = remove_bio_from_retry(conf))) {
4055			int ok;
4056			spin_unlock_irq(&conf->device_lock);
4057			ok = retry_aligned_read(conf, bio);
4058			spin_lock_irq(&conf->device_lock);
4059			if (!ok)
4060				break;
4061			handled++;
4062		}
4063
4064		sh = __get_priority_stripe(conf);
4065
4066		if (!sh) {
4067			async_tx_issue_pending_all();
4068			break;
4069		}
4070		spin_unlock_irq(&conf->device_lock);
4071
4072		handled++;
4073		handle_stripe(sh, conf->spare_page);
4074		release_stripe(sh);
4075
4076		spin_lock_irq(&conf->device_lock);
4077	}
4078	pr_debug("%d stripes handled\n", handled);
4079
4080	spin_unlock_irq(&conf->device_lock);
4081
4082	unplug_slaves(mddev);
4083
4084	pr_debug("--- raid5d inactive\n");
4085}
4086
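/* sysfs attribute 'stripe_cache_size': show or resize the stripe cache.
 *  Shrinking releases stripe_heads one at a time via drop_one_stripe();
 *  growing allocates new ones with grow_one_stripe() until the requested
 *  count is reached or allocation fails.
 */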
4087static ssize_t
4088raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4089{
4090	raid5_conf_t *conf = mddev_to_conf(mddev);
4091	if (conf)
4092		return sprintf(page, "%d\n", conf->max_nr_stripes);
4093	else
4094		return 0;
4095}
4096
4097static ssize_t
4098raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4099{
4100	raid5_conf_t *conf = mddev_to_conf(mddev);
4101	unsigned long new;
4102	if (len >= PAGE_SIZE)
4103		return -EINVAL;
4104	if (!conf)
4105		return -ENODEV;
4106
4107	if (strict_strtoul(page, 10, &new))
4108		return -EINVAL;
4109	if (new <= 16 || new > 32768)
4110		return -EINVAL;
4111	while (new < conf->max_nr_stripes) {
4112		if (drop_one_stripe(conf))
4113			conf->max_nr_stripes--;
4114		else
4115			break;
4116	}
4117	md_allow_write(mddev);
4118	while (new > conf->max_nr_stripes) {
4119		if (grow_one_stripe(conf))
4120			conf->max_nr_stripes++;
4121		else break;
4122	}
4123	return len;
4124}
4125
4126static struct md_sysfs_entry
4127raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4128				raid5_show_stripe_cache_size,
4129				raid5_store_stripe_cache_size);
4130
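/* sysfs attribute 'preread_bypass_threshold': consulted by
 *  __get_priority_stripe().  Once bypass_count exceeds this value, a
 *  stripe from the hold_list is handled even though full stripe writes
 *  are still pending.
 */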
4131static ssize_t
4132raid5_show_preread_threshold(mddev_t *mddev, char *page)
4133{
4134	raid5_conf_t *conf = mddev_to_conf(mddev);
4135	if (conf)
4136		return sprintf(page, "%d\n", conf->bypass_threshold);
4137	else
4138		return 0;
4139}
4140
4141static ssize_t
4142raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4143{
4144	raid5_conf_t *conf = mddev_to_conf(mddev);
4145	unsigned long new;
4146	if (len >= PAGE_SIZE)
4147		return -EINVAL;
4148	if (!conf)
4149		return -ENODEV;
4150
4151	if (strict_strtoul(page, 10, &new))
4152		return -EINVAL;
4153	if (new > conf->max_nr_stripes)
4154		return -EINVAL;
4155	conf->bypass_threshold = new;
4156	return len;
4157}
4158
4159static struct md_sysfs_entry
4160raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4161					S_IRUGO | S_IWUSR,
4162					raid5_show_preread_threshold,
4163					raid5_store_preread_threshold);
4164
4165static ssize_t
4166stripe_cache_active_show(mddev_t *mddev, char *page)
4167{
4168	raid5_conf_t *conf = mddev_to_conf(mddev);
4169	if (conf)
4170		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4171	else
4172		return 0;
4173}
4174
4175static struct md_sysfs_entry
4176raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4177
4178static struct attribute *raid5_attrs[] =  {
4179	&raid5_stripecache_size.attr,
4180	&raid5_stripecache_active.attr,
4181	&raid5_preread_bypass_threshold.attr,
4182	NULL,
4183};
4184static struct attribute_group raid5_attrs_group = {
4185	.name = NULL,
4186	.attrs = raid5_attrs,
4187};
4188
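/* run - set up a raid4/5/6 array: validate level, chunk size and layout,
 *  allocate the per-array raid5_conf_t, stripe hash table and stripe
 *  cache, register the raid5d thread, and restart a pending reshape if
 *  the superblock says one was in progress.
 */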
4189static int run(mddev_t *mddev)
4190{
4191	raid5_conf_t *conf;
4192	int raid_disk, memory;
4193	mdk_rdev_t *rdev;
4194	struct disk_info *disk;
4195	struct list_head *tmp;
4196	int working_disks = 0;
4197
4198	if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
4199		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4200		       mdname(mddev), mddev->level);
4201		return -EIO;
4202	}
4203
4204	if (mddev->reshape_position != MaxSector) {
4205		/* Check that we can continue the reshape.
4205		/* Check that we can continue the reshape.
4206		 * Currently only the number of disks can change; it must
4207		 * increase, and we must be past the point where
4208		 * a stripe over-writes itself.
4209		 */
4210		sector_t here_new, here_old;
4211		int old_disks;
4212		int max_degraded = (mddev->level == 5 ? 1 : 2);
4213
4214		if (mddev->new_level != mddev->level ||
4215		    mddev->new_layout != mddev->layout ||
4216		    mddev->new_chunk != mddev->chunk_size) {
4217			printk(KERN_ERR "raid5: %s: unsupported reshape "
4218			       "required - aborting.\n",
4219			       mdname(mddev));
4220			return -EINVAL;
4221		}
4222		if (mddev->delta_disks <= 0) {
4223			printk(KERN_ERR "raid5: %s: unsupported reshape "
4224			       "(reduce disks) required - aborting.\n",
4225			       mdname(mddev));
4226			return -EINVAL;
4227		}
4228		old_disks = mddev->raid_disks - mddev->delta_disks;
4229		/* reshape_position must be on a new-stripe boundary, and one
4230		 * further up in new geometry must map after here in old
4231		 * geometry.
4232		 */
4233		here_new = mddev->reshape_position;
4234		if (sector_div(here_new, (mddev->chunk_size>>9)*
4235			       (mddev->raid_disks - max_degraded))) {
4236			printk(KERN_ERR "raid5: reshape_position not "
4237			       "on a stripe boundary\n");
4238			return -EINVAL;
4239		}
4240		/* here_new is the stripe we will write to */
4241		here_old = mddev->reshape_position;
4242		sector_div(here_old, (mddev->chunk_size>>9)*
4243			   (old_disks-max_degraded));
4244		/* here_old is the first stripe that we might need to read
4245		 * from */
4246		if (here_new >= here_old) {
4247			/* Reading from the same stripe as writing to - bad */
4248			printk(KERN_ERR "raid5: reshape_position too early for "
4249			       "auto-recovery - aborting.\n");
4250			return -EINVAL;
4251		}
4252		printk(KERN_INFO "raid5: reshape will continue\n");
4253		/* OK, we should be able to continue; */
4254	}
4255
4256
4257	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
4258	if ((conf = mddev->private) == NULL)
4259		goto abort;
4260	if (mddev->reshape_position == MaxSector) {
4261		conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
4262	} else {
4263		conf->raid_disks = mddev->raid_disks;
4264		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4265	}
4266
4267	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4268			      GFP_KERNEL);
4269	if (!conf->disks)
4270		goto abort;
4271
4272	conf->mddev = mddev;
4273
4274	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4275		goto abort;
4276
4277	if (mddev->level == 6) {
4278		conf->spare_page = alloc_page(GFP_KERNEL);
4279		if (!conf->spare_page)
4280			goto abort;
4281	}
4282	spin_lock_init(&conf->device_lock);
4283	mddev->queue->queue_lock = &conf->device_lock;
4284	init_waitqueue_head(&conf->wait_for_stripe);
4285	init_waitqueue_head(&conf->wait_for_overlap);
4286	INIT_LIST_HEAD(&conf->handle_list);
4287	INIT_LIST_HEAD(&conf->hold_list);
4288	INIT_LIST_HEAD(&conf->delayed_list);
4289	INIT_LIST_HEAD(&conf->bitmap_list);
4290	INIT_LIST_HEAD(&conf->inactive_list);
4291	atomic_set(&conf->active_stripes, 0);
4292	atomic_set(&conf->preread_active_stripes, 0);
4293	atomic_set(&conf->active_aligned_reads, 0);
4294	conf->bypass_threshold = BYPASS_THRESHOLD;
4295
4296	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4297
4298	rdev_for_each(rdev, tmp, mddev) {
4299		raid_disk = rdev->raid_disk;
4300		if (raid_disk >= conf->raid_disks
4301		    || raid_disk < 0)
4302			continue;
4303		disk = conf->disks + raid_disk;
4304
4305		disk->rdev = rdev;
4306
4307		if (test_bit(In_sync, &rdev->flags)) {
4308			char b[BDEVNAME_SIZE];
4309			printk(KERN_INFO "raid5: device %s operational as raid"
4310				" disk %d\n", bdevname(rdev->bdev,b),
4311				raid_disk);
4312			working_disks++;
4313		} else
4314			/* Cannot rely on bitmap to complete recovery */
4315			conf->fullsync = 1;
4316	}
4317
4318	/*
4319	 * 0 for a fully functional array, 1 or 2 for a degraded array.
4320	 */
4321	mddev->degraded = conf->raid_disks - working_disks;
4322	conf->mddev = mddev;
4323	conf->chunk_size = mddev->chunk_size;
4324	conf->level = mddev->level;
4325	if (conf->level == 6)
4326		conf->max_degraded = 2;
4327	else
4328		conf->max_degraded = 1;
4329	conf->algorithm = mddev->layout;
4330	conf->max_nr_stripes = NR_STRIPES;
4331	conf->expand_progress = mddev->reshape_position;
4332
4333	/* device size must be a multiple of chunk size */
4334	mddev->size &= ~(mddev->chunk_size/1024 -1);
4335	mddev->resync_max_sectors = mddev->size << 1;
4336
4337	if (conf->level == 6 && conf->raid_disks < 4) {
4338		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4339		       mdname(mddev), conf->raid_disks);
4340		goto abort;
4341	}
4342	if (!conf->chunk_size || conf->chunk_size % 4) {
4343		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4344			conf->chunk_size, mdname(mddev));
4345		goto abort;
4346	}
4347	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
4348		printk(KERN_ERR
4349			"raid5: unsupported parity algorithm %d for %s\n",
4350			conf->algorithm, mdname(mddev));
4351		goto abort;
4352	}
4353	if (mddev->degraded > conf->max_degraded) {
4354		printk(KERN_ERR "raid5: not enough operational devices for %s"
4355			" (%d/%d failed)\n",
4356			mdname(mddev), mddev->degraded, conf->raid_disks);
4357		goto abort;
4358	}
4359
4360	if (mddev->degraded > 0 &&
4361	    mddev->recovery_cp != MaxSector) {
4362		if (mddev->ok_start_degraded)
4363			printk(KERN_WARNING
4364			       "raid5: starting dirty degraded array: %s"
4365			       "- data corruption possible.\n",
4366			       mdname(mddev));
4367		else {
4368			printk(KERN_ERR
4369			       "raid5: cannot start dirty degraded array for %s\n",
4370			       mdname(mddev));
4371			goto abort;
4372		}
4373	}
4374
4375	{
4376		mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4377		if (!mddev->thread) {
4378			printk(KERN_ERR
4379				"raid5: couldn't allocate thread for %s\n",
4380				mdname(mddev));
4381			goto abort;
4382		}
4383	}
4384	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4385		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4386	if (grow_stripes(conf, conf->max_nr_stripes)) {
4387		printk(KERN_ERR
4388			"raid5: couldn't allocate %dkB for buffers\n", memory);
4389		shrink_stripes(conf);
4390		md_unregister_thread(mddev->thread);
4391		goto abort;
4392	} else
4393		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4394			memory, mdname(mddev));
4395
4396	if (mddev->degraded == 0)
4397		printk(KERN_INFO "raid5: raid level %d set %s active with %d out of %d"
4398			" devices, algorithm %d\n", conf->level, mdname(mddev),
4399			mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4400			conf->algorithm);
4401	else
4402		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4403			" out of %d devices, algorithm %d\n", conf->level,
4404			mdname(mddev), mddev->raid_disks - mddev->degraded,
4405			mddev->raid_disks, conf->algorithm);
4406
4407	print_raid5_conf(conf);
4408
4409	if (conf->expand_progress != MaxSector) {
4410		printk(KERN_INFO "raid5: ok, starting reshape thread\n");
4411		conf->expand_lo = conf->expand_progress;
4412		atomic_set(&conf->reshape_stripes, 0);
4413		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4414		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4415		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4416		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4417		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4418							"%s_reshape");
4419	}
4420
4421	/* read-ahead size must cover two whole stripes, which is
4422	 * 2 * (data disks) * chunksize, excluding the parity device(s)
4423	 */
4424	{
4425		int data_disks = conf->previous_raid_disks - conf->max_degraded;
4426		int stripe = data_disks *
4427			(mddev->chunk_size / PAGE_SIZE);
4428		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4429			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4430	}
4431
4432	/* Ok, everything is just fine now */
4433	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4434		printk(KERN_WARNING
4435		       "raid5: failed to create sysfs attributes for %s\n",
4436		       mdname(mddev));
4437
4438	mddev->queue->unplug_fn = raid5_unplug_device;
4439	mddev->queue->backing_dev_info.congested_data = mddev;
4440	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4441
4442	mddev->array_size =  mddev->size * (conf->previous_raid_disks -
4443					    conf->max_degraded);
4444
4445	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4446
4447	return 0;
4448abort:
4449	if (conf) {
4450		print_raid5_conf(conf);
4451		safe_put_page(conf->spare_page);
4452		kfree(conf->disks);
4453		kfree(conf->stripe_hashtbl);
4454		kfree(conf);
4455	}
4456	mddev->private = NULL;
4457	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
4458	return -EIO;
4459}
4460
4461
4462
4463static int stop(mddev_t *mddev)
4464{
4465	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4466
4467	md_unregister_thread(mddev->thread);
4468	mddev->thread = NULL;
4469	shrink_stripes(conf);
4470	kfree(conf->stripe_hashtbl);
4471	mddev->queue->backing_dev_info.congested_fn = NULL;
4472	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4473	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4474	kfree(conf->disks);
4475	kfree(conf);
4476	mddev->private = NULL;
4477	return 0;
4478}
4479
4480#ifdef DEBUG
4481static void print_sh (struct seq_file *seq, struct stripe_head *sh)
4482{
4483	int i;
4484
4485	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
4486		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
4487	seq_printf(seq, "sh %llu,  count %d.\n",
4488		   (unsigned long long)sh->sector, atomic_read(&sh->count));
4489	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
4490	for (i = 0; i < sh->disks; i++) {
4491		seq_printf(seq, "(cache%d: %p %ld) ",
4492			   i, sh->dev[i].page, sh->dev[i].flags);
4493	}
4494	seq_printf(seq, "\n");
4495}
4496
4497static void printall (struct seq_file *seq, raid5_conf_t *conf)
4498{
4499	struct stripe_head *sh;
4500	struct hlist_node *hn;
4501	int i;
4502
4503	spin_lock_irq(&conf->device_lock);
4504	for (i = 0; i < NR_HASH; i++) {
4505		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
4506			if (sh->raid_conf != conf)
4507				continue;
4508			print_sh(seq, sh);
4509		}
4510	}
4511	spin_unlock_irq(&conf->device_lock);
4512}
4513#endif
4514
4515static void status (struct seq_file *seq, mddev_t *mddev)
4516{
4517	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4518	int i;
4519
4520	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
4521	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
4522	for (i = 0; i < conf->raid_disks; i++)
4523		seq_printf (seq, "%s",
4524			       conf->disks[i].rdev &&
4525			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
4526	seq_printf (seq, "]");
4527#ifdef DEBUG
4528	seq_printf (seq, "\n");
4529	printall(seq, conf);
4530#endif
4531}
4532
4533static void print_raid5_conf (raid5_conf_t *conf)
4534{
4535	int i;
4536	struct disk_info *tmp;
4537
4538	printk("RAID5 conf printout:\n");
4539	if (!conf) {
4540		printk("(conf==NULL)\n");
4541		return;
4542	}
4543	printk(" --- rd:%d wd:%d\n", conf->raid_disks,
4544		 conf->raid_disks - conf->mddev->degraded);
4545
4546	for (i = 0; i < conf->raid_disks; i++) {
4547		char b[BDEVNAME_SIZE];
4548		tmp = conf->disks + i;
4549		if (tmp->rdev)
4550		printk(" disk %d, o:%d, dev:%s\n",
4551			i, !test_bit(Faulty, &tmp->rdev->flags),
4552			bdevname(tmp->rdev->bdev,b));
4553	}
4554}
4555
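/* raid5_spare_active - walk the array and, for every working device that
 *  is not yet In_sync, set In_sync and decrement mddev->degraded.
 */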
4556static int raid5_spare_active(mddev_t *mddev)
4557{
4558	int i;
4559	raid5_conf_t *conf = mddev->private;
4560	struct disk_info *tmp;
4561
4562	for (i = 0; i < conf->raid_disks; i++) {
4563		tmp = conf->disks + i;
4564		if (tmp->rdev
4565		    && !test_bit(Faulty, &tmp->rdev->flags)
4566		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
4567			unsigned long flags;
4568			spin_lock_irqsave(&conf->device_lock, flags);
4569			mddev->degraded--;
4570			spin_unlock_irqrestore(&conf->device_lock, flags);
4571		}
4572	}
4573	print_raid5_conf(conf);
4574	return 0;
4575}
4576
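/* raid5_remove_disk - detach the rdev in slot 'number' from the array.
 *  Refuses (-EBUSY) if the device is still In_sync, still has pending
 *  I/O, or is non-faulty while the array could still use it for recovery.
 */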
4577static int raid5_remove_disk(mddev_t *mddev, int number)
4578{
4579	raid5_conf_t *conf = mddev->private;
4580	int err = 0;
4581	mdk_rdev_t *rdev;
4582	struct disk_info *p = conf->disks + number;
4583
4584	print_raid5_conf(conf);
4585	rdev = p->rdev;
4586	if (rdev) {
4587		if (test_bit(In_sync, &rdev->flags) ||
4588		    atomic_read(&rdev->nr_pending)) {
4589			err = -EBUSY;
4590			goto abort;
4591		}
4592		/* Only remove non-faulty devices if recovery
4593		 * isn't possible.
4594		 */
4595		if (!test_bit(Faulty, &rdev->flags) &&
4596		    mddev->degraded <= conf->max_degraded) {
4597			err = -EBUSY;
4598			goto abort;
4599		}
4600		p->rdev = NULL;
4601		synchronize_rcu();
4602		if (atomic_read(&rdev->nr_pending)) {
4603			/* lost the race, try later */
4604			err = -EBUSY;
4605			p->rdev = rdev;
4606		}
4607	}
4608abort:
4609
4610	print_raid5_conf(conf);
4611	return err;
4612}
4613
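/* raid5_add_disk - place a spare rdev into an empty slot, preferring the
 *  slot it previously occupied (saved_raid_disk) so that a bitmap-based
 *  recovery remains possible.  Returns 1 if a slot was found.
 */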
4614static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4615{
4616	raid5_conf_t *conf = mddev->private;
4617	int found = 0;
4618	int disk;
4619	struct disk_info *p;
4620
4621	if (mddev->degraded > conf->max_degraded)
4622		/* no point adding a device */
4623		return 0;
4624
4625	/*
4626	 * find the disk ... but prefer rdev->saved_raid_disk
4627	 * if possible.
4628	 */
4629	if (rdev->saved_raid_disk >= 0 &&
4630	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
4631		disk = rdev->saved_raid_disk;
4632	else
4633		disk = 0;
4634	for ( ; disk < conf->raid_disks; disk++)
4635		if ((p=conf->disks + disk)->rdev == NULL) {
4636			clear_bit(In_sync, &rdev->flags);
4637			rdev->raid_disk = disk;
4638			found = 1;
4639			if (rdev->saved_raid_disk != disk)
4640				conf->fullsync = 1;
4641			rcu_assign_pointer(p->rdev, rdev);
4642			break;
4643		}
4644	print_raid5_conf(conf);
4645	return found;
4646}
4647
4648static int raid5_resize(mddev_t *mddev, sector_t sectors)
4649{
4650	/* no resync is happening, and there is enough space
4651	 * on all devices, so we can resize.
4652	 * We need to make sure resync covers any new space.
4653	 * If the array is shrinking we should possibly wait until
4654	 * any io in the removed space completes, but it hardly seems
4655	 * worth it.
4656	 */
4657	raid5_conf_t *conf = mddev_to_conf(mddev);
4658
4659	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4660	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
4661	set_capacity(mddev->gendisk, mddev->array_size << 1);
4662	mddev->changed = 1;
4663	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
4664		mddev->recovery_cp = mddev->size << 1;
4665		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4666	}
4667	mddev->size = sectors /2;
4668	mddev->resync_max_sectors = sectors;
4669	return 0;
4670}
4671
4672#ifdef CONFIG_MD_RAID5_RESHAPE
4673static int raid5_check_reshape(mddev_t *mddev)
4674{
4675	raid5_conf_t *conf = mddev_to_conf(mddev);
4676	int err;
4677
4678	if (mddev->delta_disks < 0 ||
4679	    mddev->new_level != mddev->level)
4680		return -EINVAL; /* Cannot shrink array or change level yet */
4681	if (mddev->delta_disks == 0)
4682		return 0; /* nothing to do */
4683
4684	/* Can only proceed if there are plenty of stripe_heads.
4685	 * We need a minimum of one full stripe, and for sensible progress
4686	 * it is best to have about 4 times that.
4687	 * If we require 4 times, then the default 256 4K stripe_heads will
4688	 * allow for chunk sizes up to 256K, which is probably OK.
4689	 * If the chunk size is greater, user-space should request more
4690	 * stripe_heads first.
4691	 */
4692	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
4693	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
4694		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
4695		       (mddev->chunk_size / STRIPE_SIZE)*4);
4696		return -ENOSPC;
4697	}
4698
4699	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
4700	if (err)
4701		return err;
4702
4703	if (mddev->degraded > conf->max_degraded)
4704		return -EINVAL;
4705	/* looks like we might be able to manage this */
4706	return 0;
4707}
4708
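/* raid5_start_reshape - begin growing the array by mddev->delta_disks
 *  devices: check that enough spares are available, add them, widen the
 *  geometry under device_lock, and start the "%s_reshape" sync thread
 *  which will drive reshape_request().
 */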
4709static int raid5_start_reshape(mddev_t *mddev)
4710{
4711	raid5_conf_t *conf = mddev_to_conf(mddev);
4712	mdk_rdev_t *rdev;
4713	struct list_head *rtmp;
4714	int spares = 0;
4715	int added_devices = 0;
4716	unsigned long flags;
4717
4718	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4719		return -EBUSY;
4720
4721	rdev_for_each(rdev, rtmp, mddev)
4722		if (rdev->raid_disk < 0 &&
4723		    !test_bit(Faulty, &rdev->flags))
4724			spares++;
4725
4726	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
4727		/* Not enough devices even to make a degraded array
4728		 * of that size
4729		 */
4730		return -EINVAL;
4731
4732	atomic_set(&conf->reshape_stripes, 0);
4733	spin_lock_irq(&conf->device_lock);
4734	conf->previous_raid_disks = conf->raid_disks;
4735	conf->raid_disks += mddev->delta_disks;
4736	conf->expand_progress = 0;
4737	conf->expand_lo = 0;
4738	spin_unlock_irq(&conf->device_lock);
4739
4740	/* Add some new drives, as many as will fit.
4741	 * We know there are enough to make the newly sized array work.
4742	 */
4743	rdev_for_each(rdev, rtmp, mddev)
4744		if (rdev->raid_disk < 0 &&
4745		    !test_bit(Faulty, &rdev->flags)) {
4746			if (raid5_add_disk(mddev, rdev)) {
4747				char nm[20];
4748				set_bit(In_sync, &rdev->flags);
4749				added_devices++;
4750				rdev->recovery_offset = 0;
4751				sprintf(nm, "rd%d", rdev->raid_disk);
4752				if (sysfs_create_link(&mddev->kobj,
4753						      &rdev->kobj, nm))
4754					printk(KERN_WARNING
4755					       "raid5: failed to create"
4756					       " link %s for %s\n",
4757					       nm, mdname(mddev));
4758			} else
4759				break;
4760		}
4761
4762	spin_lock_irqsave(&conf->device_lock, flags);
4763	mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
4764	spin_unlock_irqrestore(&conf->device_lock, flags);
4765	mddev->raid_disks = conf->raid_disks;
4766	mddev->reshape_position = 0;
4767	set_bit(MD_CHANGE_DEVS, &mddev->flags);
4768
4769	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4770	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4771	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4772	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4773	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4774						"%s_reshape");
4775	if (!mddev->sync_thread) {
4776		mddev->recovery = 0;
4777		spin_lock_irq(&conf->device_lock);
4778		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4779		conf->expand_progress = MaxSector;
4780		spin_unlock_irq(&conf->device_lock);
4781		return -EAGAIN;
4782	}
4783	md_wakeup_thread(mddev->sync_thread);
4784	md_new_event(mddev);
4785	return 0;
4786}
4787#endif
4788
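/* end_reshape - called when the reshape pass completes.  Unless the
 *  reshape was interrupted, publish the new array size, clear
 *  expand_progress/reshape_position and bump the read-ahead window to
 *  match the wider stripe.
 */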
4789static void end_reshape(raid5_conf_t *conf)
4790{
4791	struct block_device *bdev;
4792
4793	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4794		conf->mddev->array_size = conf->mddev->size *
4795			(conf->raid_disks - conf->max_degraded);
4796		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
4797		conf->mddev->changed = 1;
4798
4799		bdev = bdget_disk(conf->mddev->gendisk, 0);
4800		if (bdev) {
4801			mutex_lock(&bdev->bd_inode->i_mutex);
4802			i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
4803			mutex_unlock(&bdev->bd_inode->i_mutex);
4804			bdput(bdev);
4805		}
4806		spin_lock_irq(&conf->device_lock);
4807		conf->expand_progress = MaxSector;
4808		spin_unlock_irq(&conf->device_lock);
4809		conf->mddev->reshape_position = MaxSector;
4810
4811		/* read-ahead size must cover two whole stripes, which is
4812		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4813		 */
4814		{
4815			int data_disks = conf->previous_raid_disks - conf->max_degraded;
4816			int stripe = data_disks *
4817				(conf->mddev->chunk_size / PAGE_SIZE);
4818			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4819				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4820		}
4821	}
4822}
4823
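/* raid5_quiesce - suspend (state 1) or resume (state 0) normal I/O on the
 *  array; state 2 just wakes waiters blocked on wait_for_overlap.
 *  Quiescing waits until all active stripes and aligned reads drain.
 */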
4824static void raid5_quiesce(mddev_t *mddev, int state)
4825{
4826	raid5_conf_t *conf = mddev_to_conf(mddev);
4827
4828	switch(state) {
4829	case 2: /* resume for a suspend */
4830		wake_up(&conf->wait_for_overlap);
4831		break;
4832
4833	case 1: /* stop all writes */
4834		spin_lock_irq(&conf->device_lock);
4835		conf->quiesce = 1;
4836		wait_event_lock_irq(conf->wait_for_stripe,
4837				    atomic_read(&conf->active_stripes) == 0 &&
4838				    atomic_read(&conf->active_aligned_reads) == 0,
4839				    conf->device_lock, /* nothing */);
4840		spin_unlock_irq(&conf->device_lock);
4841		break;
4842
4843	case 0: /* re-enable writes */
4844		spin_lock_irq(&conf->device_lock);
4845		conf->quiesce = 0;
4846		wake_up(&conf->wait_for_stripe);
4847		wake_up(&conf->wait_for_overlap);
4848		spin_unlock_irq(&conf->device_lock);
4849		break;
4850	}
4851}
4852
4853static struct mdk_personality raid6_personality =
4854{
4855	.name		= "raid6",
4856	.level		= 6,
4857	.owner		= THIS_MODULE,
4858	.make_request	= make_request,
4859	.run		= run,
4860	.stop		= stop,
4861	.status		= status,
4862	.error_handler	= error,
4863	.hot_add_disk	= raid5_add_disk,
4864	.hot_remove_disk= raid5_remove_disk,
4865	.spare_active	= raid5_spare_active,
4866	.sync_request	= sync_request,
4867	.resize		= raid5_resize,
4868#ifdef CONFIG_MD_RAID5_RESHAPE
4869	.check_reshape	= raid5_check_reshape,
4870	.start_reshape  = raid5_start_reshape,
4871#endif
4872	.quiesce	= raid5_quiesce,
4873};
4874static struct mdk_personality raid5_personality =
4875{
4876	.name		= "raid5",
4877	.level		= 5,
4878	.owner		= THIS_MODULE,
4879	.make_request	= make_request,
4880	.run		= run,
4881	.stop		= stop,
4882	.status		= status,
4883	.error_handler	= error,
4884	.hot_add_disk	= raid5_add_disk,
4885	.hot_remove_disk= raid5_remove_disk,
4886	.spare_active	= raid5_spare_active,
4887	.sync_request	= sync_request,
4888	.resize		= raid5_resize,
4889#ifdef CONFIG_MD_RAID5_RESHAPE
4890	.check_reshape	= raid5_check_reshape,
4891	.start_reshape  = raid5_start_reshape,
4892#endif
4893	.quiesce	= raid5_quiesce,
4894};
4895
4896static struct mdk_personality raid4_personality =
4897{
4898	.name		= "raid4",
4899	.level		= 4,
4900	.owner		= THIS_MODULE,
4901	.make_request	= make_request,
4902	.run		= run,
4903	.stop		= stop,
4904	.status		= status,
4905	.error_handler	= error,
4906	.hot_add_disk	= raid5_add_disk,
4907	.hot_remove_disk= raid5_remove_disk,
4908	.spare_active	= raid5_spare_active,
4909	.sync_request	= sync_request,
4910	.resize		= raid5_resize,
4911#ifdef CONFIG_MD_RAID5_RESHAPE
4912	.check_reshape	= raid5_check_reshape,
4913	.start_reshape  = raid5_start_reshape,
4914#endif
4915	.quiesce	= raid5_quiesce,
4916};
4917
4918static int __init raid5_init(void)
4919{
4920	int e;
4921
4922	e = raid6_select_algo();
4923	if (e)
4924		return e;
4925	register_md_personality(&raid6_personality);
4926	register_md_personality(&raid5_personality);
4927	register_md_personality(&raid4_personality);
4928	return 0;
4929}
4930
4931static void raid5_exit(void)
4932{
4933	unregister_md_personality(&raid6_personality);
4934	unregister_md_personality(&raid5_personality);
4935	unregister_md_personality(&raid4_personality);
4936}
4937
4938module_init(raid5_init);
4939module_exit(raid5_exit);
4940MODULE_LICENSE("GPL");
4941MODULE_ALIAS("md-personality-4"); /* RAID5 */
4942MODULE_ALIAS("md-raid5");
4943MODULE_ALIAS("md-raid4");
4944MODULE_ALIAS("md-level-5");
4945MODULE_ALIAS("md-level-4");
4946MODULE_ALIAS("md-personality-8"); /* RAID6 */
4947MODULE_ALIAS("md-raid6");
4948MODULE_ALIAS("md-level-6");
4949
4950/* This used to be two separate modules, they were: */
4951MODULE_ALIAS("raid5");
4952MODULE_ALIAS("raid6");
4953