raid5.c revision d089c6af10c2be5988f03667d6d22fe6085fbe5e
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 *	   Copyright (C) 1999, 2000 Ingo Molnar
5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches.  Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
32 *    new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 *   we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment seq_flush, thus closing the current
39 *   batch.
40 * When we notice that seq_flush > seq_write, we write out all pending updates
41 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
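/*
 * A minimal sketch of that sequencing (illustrative only, kept out of the
 * build with #if 0; the struct and helpers below are local to this
 * example, the real state lives in raid5_conf_t (seq_write, seq_flush)
 * and stripe_head (bm_seq)):
 */
#if 0
struct batch_demo {
	int seq_write;		/* last batch fully written to the bitmap */
	int seq_flush;		/* last batch closed to new additions */
};

/* add_stripe_bio(): a stripe dirtied now joins batch seq_flush + 1 */
static int demo_assign_batch(struct batch_demo *b)
{
	return b->seq_flush + 1;
}

/* unplug: close the current batch so it may be written out */
static void demo_unplug(struct batch_demo *b)
{
	b->seq_flush++;
}

/* raid5d: a closed but unwritten batch forces a bitmap flush, then
 * seq_write catches up and the queued stripes can proceed
 */
static void demo_flush(struct batch_demo *b)
{
	if (b->seq_flush > b->seq_write) {
		/* write out all pending in-memory bitmap updates here */
		b->seq_write = b->seq_flush;
	}
}
#endif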
45
46#include <linux/module.h>
47#include <linux/slab.h>
48#include <linux/highmem.h>
49#include <linux/bitops.h>
50#include <linux/kthread.h>
51#include <asm/atomic.h>
52#include "raid6.h"
53
54#include <linux/raid/bitmap.h>
55#include <linux/async_tx.h>
56
57/*
58 * Stripe cache
59 */
60
61#define NR_STRIPES		256
62#define STRIPE_SIZE		PAGE_SIZE
63#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
64#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
65#define	IO_THRESHOLD		1
66#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
67#define HASH_MASK		(NR_HASH - 1)
68
69#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
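/*
 * Illustrative sketch of the bucket selection above (not compiled; the
 * figures assume a 4K page and a 64-bit build, so NR_HASH is 512,
 * HASH_MASK is 511 and STRIPE_SHIFT is 3, i.e. one bucket step for every
 * 8-sector stripe):
 */
#if 0
static unsigned long demo_hash_bucket(sector_t sector)
{
	return (sector >> STRIPE_SHIFT) & HASH_MASK;
}
/* sector 0 -> bucket 0, sector 8 -> bucket 1, sector 4096 -> bucket 0 */
#endif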
70
71/* bio's attached to a stripe+device for I/O are linked together in bi_sector
72 * order without overlap.  There may be several bio's per stripe+device, and
73 * a bio could span several devices.
74 * When walking this list for a particular stripe+device, we must never proceed
75 * beyond a bio that extends past this device, as the next bio might no longer
76 * be valid.
77 * This macro is used to determine the 'next' bio in the list, given the sector
78 * of the current stripe+device
79 */
80#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
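/*
 * Sketch of the walk this macro enables (illustrative only, mirroring how
 * the dev->toread/towrite chains are walked later in this file): advance
 * along bi_next only while the current bio still ends inside this
 * stripe+device.
 */
#if 0
static void demo_walk_dev_bios(struct r5dev *dev)
{
	struct bio *bi = dev->toread;

	while (bi && bi->bi_sector < dev->sector + STRIPE_SECTORS) {
		/* handle 'bi' for this stripe+device */
		bi = r5_next_bio(bi, dev->sector);
	}
}
#endif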
81/*
82 * The following can be used to debug the driver
83 */
84#define RAID5_PARANOIA	1
85#if RAID5_PARANOIA && defined(CONFIG_SMP)
86# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
87#else
88# define CHECK_DEVLOCK()
89#endif
90
91#ifdef DEBUG
92#define inline
93#define __inline__
94#endif
95
96#if !RAID6_USE_EMPTY_ZERO_PAGE
97/* In .bss so it's zeroed */
98const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
99#endif
100
101static inline int raid6_next_disk(int disk, int raid_disks)
102{
103	disk++;
104	return (disk < raid_disks) ? disk : 0;
105}
106
107static void return_io(struct bio *return_bi)
108{
109	struct bio *bi = return_bi;
110	while (bi) {
111
112		return_bi = bi->bi_next;
113		bi->bi_next = NULL;
114		bi->bi_size = 0;
115		bi->bi_end_io(bi,
116			      test_bit(BIO_UPTODATE, &bi->bi_flags)
117			        ? 0 : -EIO);
118		bi = return_bi;
119	}
120}
121
122static void print_raid5_conf (raid5_conf_t *conf);
123
124static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
125{
126	if (atomic_dec_and_test(&sh->count)) {
127		BUG_ON(!list_empty(&sh->lru));
128		BUG_ON(atomic_read(&conf->active_stripes)==0);
129		if (test_bit(STRIPE_HANDLE, &sh->state)) {
130			if (test_bit(STRIPE_DELAYED, &sh->state)) {
131				list_add_tail(&sh->lru, &conf->delayed_list);
132				blk_plug_device(conf->mddev->queue);
133			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
134				   sh->bm_seq - conf->seq_write > 0) {
135				list_add_tail(&sh->lru, &conf->bitmap_list);
136				blk_plug_device(conf->mddev->queue);
137			} else {
138				clear_bit(STRIPE_BIT_DELAY, &sh->state);
139				list_add_tail(&sh->lru, &conf->handle_list);
140			}
141			md_wakeup_thread(conf->mddev->thread);
142		} else {
143			BUG_ON(sh->ops.pending);
144			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
145				atomic_dec(&conf->preread_active_stripes);
146				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
147					md_wakeup_thread(conf->mddev->thread);
148			}
149			atomic_dec(&conf->active_stripes);
150			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
151				list_add_tail(&sh->lru, &conf->inactive_list);
152				wake_up(&conf->wait_for_stripe);
153				if (conf->retry_read_aligned)
154					md_wakeup_thread(conf->mddev->thread);
155			}
156		}
157	}
158}
159static void release_stripe(struct stripe_head *sh)
160{
161	raid5_conf_t *conf = sh->raid_conf;
162	unsigned long flags;
163
164	spin_lock_irqsave(&conf->device_lock, flags);
165	__release_stripe(conf, sh);
166	spin_unlock_irqrestore(&conf->device_lock, flags);
167}
168
169static inline void remove_hash(struct stripe_head *sh)
170{
171	pr_debug("remove_hash(), stripe %llu\n",
172		(unsigned long long)sh->sector);
173
174	hlist_del_init(&sh->hash);
175}
176
177static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
178{
179	struct hlist_head *hp = stripe_hash(conf, sh->sector);
180
181	pr_debug("insert_hash(), stripe %llu\n",
182		(unsigned long long)sh->sector);
183
184	CHECK_DEVLOCK();
185	hlist_add_head(&sh->hash, hp);
186}
187
188
189/* find an idle stripe, make sure it is unhashed, and return it. */
190static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
191{
192	struct stripe_head *sh = NULL;
193	struct list_head *first;
194
195	CHECK_DEVLOCK();
196	if (list_empty(&conf->inactive_list))
197		goto out;
198	first = conf->inactive_list.next;
199	sh = list_entry(first, struct stripe_head, lru);
200	list_del_init(first);
201	remove_hash(sh);
202	atomic_inc(&conf->active_stripes);
203out:
204	return sh;
205}
206
207static void shrink_buffers(struct stripe_head *sh, int num)
208{
209	struct page *p;
210	int i;
211
212	for (i=0; i<num ; i++) {
213		p = sh->dev[i].page;
214		if (!p)
215			continue;
216		sh->dev[i].page = NULL;
217		put_page(p);
218	}
219}
220
221static int grow_buffers(struct stripe_head *sh, int num)
222{
223	int i;
224
225	for (i=0; i<num; i++) {
226		struct page *page;
227
228		if (!(page = alloc_page(GFP_KERNEL))) {
229			return 1;
230		}
231		sh->dev[i].page = page;
232	}
233	return 0;
234}
235
236static void raid5_build_block (struct stripe_head *sh, int i);
237
238static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
239{
240	raid5_conf_t *conf = sh->raid_conf;
241	int i;
242
243	BUG_ON(atomic_read(&sh->count) != 0);
244	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
245	BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
246
247	CHECK_DEVLOCK();
248	pr_debug("init_stripe called, stripe %llu\n",
249		(unsigned long long)sh->sector);
250
251	remove_hash(sh);
252
253	sh->sector = sector;
254	sh->pd_idx = pd_idx;
255	sh->state = 0;
256
257	sh->disks = disks;
258
259	for (i = sh->disks; i--; ) {
260		struct r5dev *dev = &sh->dev[i];
261
262		if (dev->toread || dev->read || dev->towrite || dev->written ||
263		    test_bit(R5_LOCKED, &dev->flags)) {
264			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
265			       (unsigned long long)sh->sector, i, dev->toread,
266			       dev->read, dev->towrite, dev->written,
267			       test_bit(R5_LOCKED, &dev->flags));
268			BUG();
269		}
270		dev->flags = 0;
271		raid5_build_block(sh, i);
272	}
273	insert_hash(conf, sh);
274}
275
276static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
277{
278	struct stripe_head *sh;
279	struct hlist_node *hn;
280
281	CHECK_DEVLOCK();
282	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
283	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
284		if (sh->sector == sector && sh->disks == disks)
285			return sh;
286	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
287	return NULL;
288}
289
290static void unplug_slaves(mddev_t *mddev);
291static void raid5_unplug_device(struct request_queue *q);
292
293static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
294					     int pd_idx, int noblock)
295{
296	struct stripe_head *sh;
297
298	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
299
300	spin_lock_irq(&conf->device_lock);
301
302	do {
303		wait_event_lock_irq(conf->wait_for_stripe,
304				    conf->quiesce == 0,
305				    conf->device_lock, /* nothing */);
306		sh = __find_stripe(conf, sector, disks);
307		if (!sh) {
308			if (!conf->inactive_blocked)
309				sh = get_free_stripe(conf);
310			if (noblock && sh == NULL)
311				break;
312			if (!sh) {
313				conf->inactive_blocked = 1;
314				wait_event_lock_irq(conf->wait_for_stripe,
315						    !list_empty(&conf->inactive_list) &&
316						    (atomic_read(&conf->active_stripes)
317						     < (conf->max_nr_stripes *3/4)
318						     || !conf->inactive_blocked),
319						    conf->device_lock,
320						    raid5_unplug_device(conf->mddev->queue)
321					);
322				conf->inactive_blocked = 0;
323			} else
324				init_stripe(sh, sector, pd_idx, disks);
325		} else {
326			if (atomic_read(&sh->count)) {
327			  BUG_ON(!list_empty(&sh->lru));
328			} else {
329				if (!test_bit(STRIPE_HANDLE, &sh->state))
330					atomic_inc(&conf->active_stripes);
331				if (list_empty(&sh->lru) &&
332				    !test_bit(STRIPE_EXPANDING, &sh->state))
333					BUG();
334				list_del_init(&sh->lru);
335			}
336		}
337	} while (sh == NULL);
338
339	if (sh)
340		atomic_inc(&sh->count);
341
342	spin_unlock_irq(&conf->device_lock);
343	return sh;
344}
345
346/* test_and_ack_op() ensures that we only dequeue an operation once */
347#define test_and_ack_op(op, pend) \
348do {							\
349	if (test_bit(op, &sh->ops.pending) &&		\
350		!test_bit(op, &sh->ops.complete)) {	\
351		if (test_and_set_bit(op, &sh->ops.ack)) \
352			clear_bit(op, &pend);		\
353		else					\
354			ack++;				\
355	} else						\
356		clear_bit(op, &pend);			\
357} while (0)
358
359/* find new work to run, do not resubmit work that is already
360 * in flight
361 */
362static unsigned long get_stripe_work(struct stripe_head *sh)
363{
364	unsigned long pending;
365	int ack = 0;
366
367	pending = sh->ops.pending;
368
369	test_and_ack_op(STRIPE_OP_BIOFILL, pending);
370	test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
371	test_and_ack_op(STRIPE_OP_PREXOR, pending);
372	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
373	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
374	test_and_ack_op(STRIPE_OP_CHECK, pending);
375	if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
376		ack++;
377
378	sh->ops.count -= ack;
379	if (unlikely(sh->ops.count < 0)) {
380		printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
381			"ops.complete: %#lx\n", pending, sh->ops.pending,
382			sh->ops.ack, sh->ops.complete);
383		BUG();
384	}
385
386	return pending;
387}
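/*
 * Sketch of one operation bit's life cycle as managed above (illustrative
 * only): a request sets the bit in ops.pending, get_stripe_work() hands
 * it out exactly once by setting ops.ack, the completion callback sets
 * ops.complete, and the operation is finally retired by clearing all
 * three bits once the stripe handler has consumed the result.
 */
#if 0
static void demo_op_lifecycle(struct stripe_head *sh)
{
	/* request (caller holds sh->lock) */
	set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
	sh->ops.count++;

	/* dispatch once: pending, not yet complete, not yet acked */
	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) &&
	    !test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete) &&
	    !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.ack))
		; /* bit is included in the mask passed to raid5_run_ops() */

	/* ops_complete_biofill() later sets ops.complete; the stripe
	 * handler then clears pending/ack/complete for this operation
	 */
}
#endif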
388
389static void
390raid5_end_read_request(struct bio *bi, int error);
391static void
392raid5_end_write_request(struct bio *bi, int error);
393
394static void ops_run_io(struct stripe_head *sh)
395{
396	raid5_conf_t *conf = sh->raid_conf;
397	int i, disks = sh->disks;
398
399	might_sleep();
400
401	for (i = disks; i--; ) {
402		int rw;
403		struct bio *bi;
404		mdk_rdev_t *rdev;
405		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
406			rw = WRITE;
407		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
408			rw = READ;
409		else
410			continue;
411
412		bi = &sh->dev[i].req;
413
414		bi->bi_rw = rw;
415		if (rw == WRITE)
416			bi->bi_end_io = raid5_end_write_request;
417		else
418			bi->bi_end_io = raid5_end_read_request;
419
420		rcu_read_lock();
421		rdev = rcu_dereference(conf->disks[i].rdev);
422		if (rdev && test_bit(Faulty, &rdev->flags))
423			rdev = NULL;
424		if (rdev)
425			atomic_inc(&rdev->nr_pending);
426		rcu_read_unlock();
427
428		if (rdev) {
429			if (test_bit(STRIPE_SYNCING, &sh->state) ||
430				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
431				test_bit(STRIPE_EXPAND_READY, &sh->state))
432				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
433
434			bi->bi_bdev = rdev->bdev;
435			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
436				__FUNCTION__, (unsigned long long)sh->sector,
437				bi->bi_rw, i);
438			atomic_inc(&sh->count);
439			bi->bi_sector = sh->sector + rdev->data_offset;
440			bi->bi_flags = 1 << BIO_UPTODATE;
441			bi->bi_vcnt = 1;
442			bi->bi_max_vecs = 1;
443			bi->bi_idx = 0;
444			bi->bi_io_vec = &sh->dev[i].vec;
445			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
446			bi->bi_io_vec[0].bv_offset = 0;
447			bi->bi_size = STRIPE_SIZE;
448			bi->bi_next = NULL;
449			if (rw == WRITE &&
450			    test_bit(R5_ReWrite, &sh->dev[i].flags))
451				atomic_add(STRIPE_SECTORS,
452					&rdev->corrected_errors);
453			generic_make_request(bi);
454		} else {
455			if (rw == WRITE)
456				set_bit(STRIPE_DEGRADED, &sh->state);
457			pr_debug("skip op %ld on disc %d for sector %llu\n",
458				bi->bi_rw, i, (unsigned long long)sh->sector);
459			clear_bit(R5_LOCKED, &sh->dev[i].flags);
460			set_bit(STRIPE_HANDLE, &sh->state);
461		}
462	}
463}
464
465static struct dma_async_tx_descriptor *
466async_copy_data(int frombio, struct bio *bio, struct page *page,
467	sector_t sector, struct dma_async_tx_descriptor *tx)
468{
469	struct bio_vec *bvl;
470	struct page *bio_page;
471	int i;
472	int page_offset;
473
474	if (bio->bi_sector >= sector)
475		page_offset = (signed)(bio->bi_sector - sector) * 512;
476	else
477		page_offset = (signed)(sector - bio->bi_sector) * -512;
478	bio_for_each_segment(bvl, bio, i) {
479		int len = bio_iovec_idx(bio, i)->bv_len;
480		int clen;
481		int b_offset = 0;
482
483		if (page_offset < 0) {
484			b_offset = -page_offset;
485			page_offset += b_offset;
486			len -= b_offset;
487		}
488
489		if (len > 0 && page_offset + len > STRIPE_SIZE)
490			clen = STRIPE_SIZE - page_offset;
491		else
492			clen = len;
493
494		if (clen > 0) {
495			b_offset += bio_iovec_idx(bio, i)->bv_offset;
496			bio_page = bio_iovec_idx(bio, i)->bv_page;
497			if (frombio)
498				tx = async_memcpy(page, bio_page, page_offset,
499					b_offset, clen,
500					ASYNC_TX_DEP_ACK,
501					tx, NULL, NULL);
502			else
503				tx = async_memcpy(bio_page, page, b_offset,
504					page_offset, clen,
505					ASYNC_TX_DEP_ACK,
506					tx, NULL, NULL);
507		}
508		if (clen < len) /* hit end of page */
509			break;
510		page_offset +=  len;
511	}
512
513	return tx;
514}
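/*
 * Worked example of the clipping arithmetic above (illustrative only,
 * values chosen arbitrarily, STRIPE_SIZE assumed to be 4096): a bio that
 * starts two sectors before this stripe page gives page_offset == -1024,
 * so the first 1024 bytes of the segment are skipped via b_offset and the
 * copy starts at byte 0 of the page, while a segment that would run past
 * STRIPE_SIZE is clipped to clen and the walk stops at the page boundary.
 */
#if 0
static void demo_clip(sector_t bio_sector, sector_t dev_sector,
		      int len, int *b_offset, int *clen)
{
	int page_offset;

	if (bio_sector >= dev_sector)
		page_offset = (signed)(bio_sector - dev_sector) * 512;
	else
		page_offset = (signed)(dev_sector - bio_sector) * -512;

	*b_offset = 0;
	if (page_offset < 0) {		/* bio starts before this page */
		*b_offset = -page_offset;
		page_offset += *b_offset;
		len -= *b_offset;
	}
	if (len > 0 && page_offset + len > STRIPE_SIZE)
		*clen = STRIPE_SIZE - page_offset;	/* clip at page end */
	else
		*clen = len;
}
/* e.g. bio_sector == dev_sector - 2 and len == 2048 yields
 * *b_offset == 1024 and *clen == 1024, landing at page offset 0
 */
#endif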
515
516static void ops_complete_biofill(void *stripe_head_ref)
517{
518	struct stripe_head *sh = stripe_head_ref;
519	struct bio *return_bi = NULL;
520	raid5_conf_t *conf = sh->raid_conf;
521	int i;
522
523	pr_debug("%s: stripe %llu\n", __FUNCTION__,
524		(unsigned long long)sh->sector);
525
526	/* clear completed biofills */
527	for (i = sh->disks; i--; ) {
528		struct r5dev *dev = &sh->dev[i];
529
530		/* acknowledge completion of a biofill operation */
531		/* and check if we need to reply to a read request,
532		 * new R5_Wantfill requests are held off until
533		 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
534		 */
535		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
536			struct bio *rbi, *rbi2;
537
538			/* The access to dev->read is outside of the
539			 * spin_lock_irq(&conf->device_lock), but is protected
540			 * by the STRIPE_OP_BIOFILL pending bit
541			 */
542			BUG_ON(!dev->read);
543			rbi = dev->read;
544			dev->read = NULL;
545			while (rbi && rbi->bi_sector <
546				dev->sector + STRIPE_SECTORS) {
547				rbi2 = r5_next_bio(rbi, dev->sector);
548				spin_lock_irq(&conf->device_lock);
549				if (--rbi->bi_phys_segments == 0) {
550					rbi->bi_next = return_bi;
551					return_bi = rbi;
552				}
553				spin_unlock_irq(&conf->device_lock);
554				rbi = rbi2;
555			}
556		}
557	}
558	set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
559
560	return_io(return_bi);
561
562	set_bit(STRIPE_HANDLE, &sh->state);
563	release_stripe(sh);
564}
565
566static void ops_run_biofill(struct stripe_head *sh)
567{
568	struct dma_async_tx_descriptor *tx = NULL;
569	raid5_conf_t *conf = sh->raid_conf;
570	int i;
571
572	pr_debug("%s: stripe %llu\n", __FUNCTION__,
573		(unsigned long long)sh->sector);
574
575	for (i = sh->disks; i--; ) {
576		struct r5dev *dev = &sh->dev[i];
577		if (test_bit(R5_Wantfill, &dev->flags)) {
578			struct bio *rbi;
579			spin_lock_irq(&conf->device_lock);
580			dev->read = rbi = dev->toread;
581			dev->toread = NULL;
582			spin_unlock_irq(&conf->device_lock);
583			while (rbi && rbi->bi_sector <
584				dev->sector + STRIPE_SECTORS) {
585				tx = async_copy_data(0, rbi, dev->page,
586					dev->sector, tx);
587				rbi = r5_next_bio(rbi, dev->sector);
588			}
589		}
590	}
591
592	atomic_inc(&sh->count);
593	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
594		ops_complete_biofill, sh);
595}
596
597static void ops_complete_compute5(void *stripe_head_ref)
598{
599	struct stripe_head *sh = stripe_head_ref;
600	int target = sh->ops.target;
601	struct r5dev *tgt = &sh->dev[target];
602
603	pr_debug("%s: stripe %llu\n", __FUNCTION__,
604		(unsigned long long)sh->sector);
605
606	set_bit(R5_UPTODATE, &tgt->flags);
607	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
608	clear_bit(R5_Wantcompute, &tgt->flags);
609	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
610	set_bit(STRIPE_HANDLE, &sh->state);
611	release_stripe(sh);
612}
613
614static struct dma_async_tx_descriptor *
615ops_run_compute5(struct stripe_head *sh, unsigned long pending)
616{
617	/* kernel stack size limits the total number of disks */
618	int disks = sh->disks;
619	struct page *xor_srcs[disks];
620	int target = sh->ops.target;
621	struct r5dev *tgt = &sh->dev[target];
622	struct page *xor_dest = tgt->page;
623	int count = 0;
624	struct dma_async_tx_descriptor *tx;
625	int i;
626
627	pr_debug("%s: stripe %llu block: %d\n",
628		__FUNCTION__, (unsigned long long)sh->sector, target);
629	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
630
631	for (i = disks; i--; )
632		if (i != target)
633			xor_srcs[count++] = sh->dev[i].page;
634
635	atomic_inc(&sh->count);
636
637	if (unlikely(count == 1))
638		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
639			0, NULL, ops_complete_compute5, sh);
640	else
641		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
642			ASYNC_TX_XOR_ZERO_DST, NULL,
643			ops_complete_compute5, sh);
644
645	/* ack now if postxor is not set to be run */
646	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
647		async_tx_ack(tx);
648
649	return tx;
650}
651
652static void ops_complete_prexor(void *stripe_head_ref)
653{
654	struct stripe_head *sh = stripe_head_ref;
655
656	pr_debug("%s: stripe %llu\n", __FUNCTION__,
657		(unsigned long long)sh->sector);
658
659	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
660}
661
662static struct dma_async_tx_descriptor *
663ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
664{
665	/* kernel stack size limits the total number of disks */
666	int disks = sh->disks;
667	struct page *xor_srcs[disks];
668	int count = 0, pd_idx = sh->pd_idx, i;
669
670	/* existing parity data subtracted */
671	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
672
673	pr_debug("%s: stripe %llu\n", __FUNCTION__,
674		(unsigned long long)sh->sector);
675
676	for (i = disks; i--; ) {
677		struct r5dev *dev = &sh->dev[i];
678		/* Only process blocks that are known to be uptodate */
679		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
680			xor_srcs[count++] = dev->page;
681	}
682
683	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
684		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
685		ops_complete_prexor, sh);
686
687	return tx;
688}
689
690static struct dma_async_tx_descriptor *
691ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
692		 unsigned long pending)
693{
694	int disks = sh->disks;
695	int pd_idx = sh->pd_idx, i;
696
697	/* check if prexor is active which means only process blocks
698	 * that are part of a read-modify-write (Wantprexor)
699	 */
700	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
701
702	pr_debug("%s: stripe %llu\n", __FUNCTION__,
703		(unsigned long long)sh->sector);
704
705	for (i = disks; i--; ) {
706		struct r5dev *dev = &sh->dev[i];
707		struct bio *chosen;
708		int towrite;
709
710		towrite = 0;
711		if (prexor) { /* rmw */
712			if (dev->towrite &&
713			    test_bit(R5_Wantprexor, &dev->flags))
714				towrite = 1;
715		} else { /* rcw */
716			if (i != pd_idx && dev->towrite &&
717				test_bit(R5_LOCKED, &dev->flags))
718				towrite = 1;
719		}
720
721		if (towrite) {
722			struct bio *wbi;
723
724			spin_lock(&sh->lock);
725			chosen = dev->towrite;
726			dev->towrite = NULL;
727			BUG_ON(dev->written);
728			wbi = dev->written = chosen;
729			spin_unlock(&sh->lock);
730
731			while (wbi && wbi->bi_sector <
732				dev->sector + STRIPE_SECTORS) {
733				tx = async_copy_data(1, wbi, dev->page,
734					dev->sector, tx);
735				wbi = r5_next_bio(wbi, dev->sector);
736			}
737		}
738	}
739
740	return tx;
741}
742
743static void ops_complete_postxor(void *stripe_head_ref)
744{
745	struct stripe_head *sh = stripe_head_ref;
746
747	pr_debug("%s: stripe %llu\n", __FUNCTION__,
748		(unsigned long long)sh->sector);
749
750	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
751	set_bit(STRIPE_HANDLE, &sh->state);
752	release_stripe(sh);
753}
754
755static void ops_complete_write(void *stripe_head_ref)
756{
757	struct stripe_head *sh = stripe_head_ref;
758	int disks = sh->disks, i, pd_idx = sh->pd_idx;
759
760	pr_debug("%s: stripe %llu\n", __FUNCTION__,
761		(unsigned long long)sh->sector);
762
763	for (i = disks; i--; ) {
764		struct r5dev *dev = &sh->dev[i];
765		if (dev->written || i == pd_idx)
766			set_bit(R5_UPTODATE, &dev->flags);
767	}
768
769	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
770	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
771
772	set_bit(STRIPE_HANDLE, &sh->state);
773	release_stripe(sh);
774}
775
776static void
777ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
778		unsigned long pending)
779{
780	/* kernel stack size limits the total number of disks */
781	int disks = sh->disks;
782	struct page *xor_srcs[disks];
783
784	int count = 0, pd_idx = sh->pd_idx, i;
785	struct page *xor_dest;
786	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
787	unsigned long flags;
788	dma_async_tx_callback callback;
789
790	pr_debug("%s: stripe %llu\n", __FUNCTION__,
791		(unsigned long long)sh->sector);
792
793	/* check if prexor is active which means only process blocks
794	 * that are part of a read-modify-write (written)
795	 */
796	if (prexor) {
797		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
798		for (i = disks; i--; ) {
799			struct r5dev *dev = &sh->dev[i];
800			if (dev->written)
801				xor_srcs[count++] = dev->page;
802		}
803	} else {
804		xor_dest = sh->dev[pd_idx].page;
805		for (i = disks; i--; ) {
806			struct r5dev *dev = &sh->dev[i];
807			if (i != pd_idx)
808				xor_srcs[count++] = dev->page;
809		}
810	}
811
812	/* check whether this postxor is part of a write */
813	callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
814		ops_complete_write : ops_complete_postxor;
815
816	/* 1/ if we prexor'd then the dest is reused as a source
817	 * 2/ if we did not prexor then we are redoing the parity
818	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
819	 * for the synchronous xor case
820	 */
821	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
822		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
823
824	atomic_inc(&sh->count);
825
826	if (unlikely(count == 1)) {
827		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
828		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
829			flags, tx, callback, sh);
830	} else
831		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
832			flags, tx, callback, sh);
833}
834
835static void ops_complete_check(void *stripe_head_ref)
836{
837	struct stripe_head *sh = stripe_head_ref;
838	int pd_idx = sh->pd_idx;
839
840	pr_debug("%s: stripe %llu\n", __FUNCTION__,
841		(unsigned long long)sh->sector);
842
843	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
844		sh->ops.zero_sum_result == 0)
845		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
846
847	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
848	set_bit(STRIPE_HANDLE, &sh->state);
849	release_stripe(sh);
850}
851
852static void ops_run_check(struct stripe_head *sh)
853{
854	/* kernel stack size limits the total number of disks */
855	int disks = sh->disks;
856	struct page *xor_srcs[disks];
857	struct dma_async_tx_descriptor *tx;
858
859	int count = 0, pd_idx = sh->pd_idx, i;
860	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
861
862	pr_debug("%s: stripe %llu\n", __FUNCTION__,
863		(unsigned long long)sh->sector);
864
865	for (i = disks; i--; ) {
866		struct r5dev *dev = &sh->dev[i];
867		if (i != pd_idx)
868			xor_srcs[count++] = dev->page;
869	}
870
871	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
872		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
873
874	if (tx)
875		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
876	else
877		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
878
879	atomic_inc(&sh->count);
880	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
881		ops_complete_check, sh);
882}
883
884static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
885{
886	int overlap_clear = 0, i, disks = sh->disks;
887	struct dma_async_tx_descriptor *tx = NULL;
888
889	if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
890		ops_run_biofill(sh);
891		overlap_clear++;
892	}
893
894	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
895		tx = ops_run_compute5(sh, pending);
896
897	if (test_bit(STRIPE_OP_PREXOR, &pending))
898		tx = ops_run_prexor(sh, tx);
899
900	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
901		tx = ops_run_biodrain(sh, tx, pending);
902		overlap_clear++;
903	}
904
905	if (test_bit(STRIPE_OP_POSTXOR, &pending))
906		ops_run_postxor(sh, tx, pending);
907
908	if (test_bit(STRIPE_OP_CHECK, &pending))
909		ops_run_check(sh);
910
911	if (test_bit(STRIPE_OP_IO, &pending))
912		ops_run_io(sh);
913
914	if (overlap_clear)
915		for (i = disks; i--; ) {
916			struct r5dev *dev = &sh->dev[i];
917			if (test_and_clear_bit(R5_Overlap, &dev->flags))
918				wake_up(&sh->raid_conf->wait_for_overlap);
919		}
920}
921
922static int grow_one_stripe(raid5_conf_t *conf)
923{
924	struct stripe_head *sh;
925	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
926	if (!sh)
927		return 0;
928	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
929	sh->raid_conf = conf;
930	spin_lock_init(&sh->lock);
931
932	if (grow_buffers(sh, conf->raid_disks)) {
933		shrink_buffers(sh, conf->raid_disks);
934		kmem_cache_free(conf->slab_cache, sh);
935		return 0;
936	}
937	sh->disks = conf->raid_disks;
938	/* we just created an active stripe so... */
939	atomic_set(&sh->count, 1);
940	atomic_inc(&conf->active_stripes);
941	INIT_LIST_HEAD(&sh->lru);
942	release_stripe(sh);
943	return 1;
944}
945
946static int grow_stripes(raid5_conf_t *conf, int num)
947{
948	struct kmem_cache *sc;
949	int devs = conf->raid_disks;
950
951	sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
952	sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
953	conf->active_name = 0;
954	sc = kmem_cache_create(conf->cache_name[conf->active_name],
955			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
956			       0, 0, NULL);
957	if (!sc)
958		return 1;
959	conf->slab_cache = sc;
960	conf->pool_size = devs;
961	while (num--)
962		if (!grow_one_stripe(conf))
963			return 1;
964	return 0;
965}
966
967#ifdef CONFIG_MD_RAID5_RESHAPE
968static int resize_stripes(raid5_conf_t *conf, int newsize)
969{
970	/* Make all the stripes able to hold 'newsize' devices.
971	 * New slots in each stripe get 'page' set to a new page.
972	 *
973	 * This happens in stages:
974	 * 1/ create a new kmem_cache and allocate the required number of
975	 *    stripe_heads.
976 * 2/ gather all the old stripe_heads and transfer the pages across
977	 *    to the new stripe_heads.  This will have the side effect of
978	 *    freezing the array as once all stripe_heads have been collected,
979	 *    no IO will be possible.  Old stripe heads are freed once their
980	 *    pages have been transferred over, and the old kmem_cache is
981	 *    freed when all stripes are done.
982 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
983 *    we simply return a failure status - no need to clean anything up.
984	 * 4/ allocate new pages for the new slots in the new stripe_heads.
985 *    If this fails, we don't bother trying to shrink the
986	 *    stripe_heads down again, we just leave them as they are.
987	 *    As each stripe_head is processed the new one is released into
988	 *    active service.
989	 *
990	 * Once step2 is started, we cannot afford to wait for a write,
991	 * so we use GFP_NOIO allocations.
992	 */
993	struct stripe_head *osh, *nsh;
994	LIST_HEAD(newstripes);
995	struct disk_info *ndisks;
996	int err = 0;
997	struct kmem_cache *sc;
998	int i;
999
1000	if (newsize <= conf->pool_size)
1001		return 0; /* never bother to shrink */
1002
1003	md_allow_write(conf->mddev);
1004
1005	/* Step 1 */
1006	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1007			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1008			       0, 0, NULL);
1009	if (!sc)
1010		return -ENOMEM;
1011
1012	for (i = conf->max_nr_stripes; i; i--) {
1013		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1014		if (!nsh)
1015			break;
1016
1017		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1018
1019		nsh->raid_conf = conf;
1020		spin_lock_init(&nsh->lock);
1021
1022		list_add(&nsh->lru, &newstripes);
1023	}
1024	if (i) {
1025		/* didn't get enough, give up */
1026		while (!list_empty(&newstripes)) {
1027			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1028			list_del(&nsh->lru);
1029			kmem_cache_free(sc, nsh);
1030		}
1031		kmem_cache_destroy(sc);
1032		return -ENOMEM;
1033	}
1034	/* Step 2 - Must use GFP_NOIO now.
1035	 * OK, we have enough stripes, start collecting inactive
1036	 * stripes and copying them over
1037	 */
1038	list_for_each_entry(nsh, &newstripes, lru) {
1039		spin_lock_irq(&conf->device_lock);
1040		wait_event_lock_irq(conf->wait_for_stripe,
1041				    !list_empty(&conf->inactive_list),
1042				    conf->device_lock,
1043				    unplug_slaves(conf->mddev)
1044			);
1045		osh = get_free_stripe(conf);
1046		spin_unlock_irq(&conf->device_lock);
1047		atomic_set(&nsh->count, 1);
1048		for(i=0; i<conf->pool_size; i++)
1049			nsh->dev[i].page = osh->dev[i].page;
1050		for( ; i<newsize; i++)
1051			nsh->dev[i].page = NULL;
1052		kmem_cache_free(conf->slab_cache, osh);
1053	}
1054	kmem_cache_destroy(conf->slab_cache);
1055
1056	/* Step 3.
1057	 * At this point, we are holding all the stripes so the array
1058	 * is completely stalled, so now is a good time to resize
1059	 * conf->disks.
1060	 */
1061	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1062	if (ndisks) {
1063		for (i=0; i<conf->raid_disks; i++)
1064			ndisks[i] = conf->disks[i];
1065		kfree(conf->disks);
1066		conf->disks = ndisks;
1067	} else
1068		err = -ENOMEM;
1069
1070	/* Step 4, return new stripes to service */
1071	while(!list_empty(&newstripes)) {
1072		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1073		list_del_init(&nsh->lru);
1074		for (i=conf->raid_disks; i < newsize; i++)
1075			if (nsh->dev[i].page == NULL) {
1076				struct page *p = alloc_page(GFP_NOIO);
1077				nsh->dev[i].page = p;
1078				if (!p)
1079					err = -ENOMEM;
1080			}
1081		release_stripe(nsh);
1082	}
1083	/* critical section passed, GFP_NOIO no longer needed */
1084
1085	conf->slab_cache = sc;
1086	conf->active_name = 1-conf->active_name;
1087	conf->pool_size = newsize;
1088	return err;
1089}
1090#endif
1091
1092static int drop_one_stripe(raid5_conf_t *conf)
1093{
1094	struct stripe_head *sh;
1095
1096	spin_lock_irq(&conf->device_lock);
1097	sh = get_free_stripe(conf);
1098	spin_unlock_irq(&conf->device_lock);
1099	if (!sh)
1100		return 0;
1101	BUG_ON(atomic_read(&sh->count));
1102	shrink_buffers(sh, conf->pool_size);
1103	kmem_cache_free(conf->slab_cache, sh);
1104	atomic_dec(&conf->active_stripes);
1105	return 1;
1106}
1107
1108static void shrink_stripes(raid5_conf_t *conf)
1109{
1110	while (drop_one_stripe(conf))
1111		;
1112
1113	if (conf->slab_cache)
1114		kmem_cache_destroy(conf->slab_cache);
1115	conf->slab_cache = NULL;
1116}
1117
1118static void raid5_end_read_request(struct bio * bi, int error)
1119{
1120 	struct stripe_head *sh = bi->bi_private;
1121	raid5_conf_t *conf = sh->raid_conf;
1122	int disks = sh->disks, i;
1123	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1124	char b[BDEVNAME_SIZE];
1125	mdk_rdev_t *rdev;
1126
1127
1128	for (i=0 ; i<disks; i++)
1129		if (bi == &sh->dev[i].req)
1130			break;
1131
1132	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1133		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1134		uptodate);
1135	if (i == disks) {
1136		BUG();
1137		return;
1138	}
1139
1140	if (uptodate) {
1141		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1142		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1143			rdev = conf->disks[i].rdev;
1144			printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
1145			       mdname(conf->mddev), STRIPE_SECTORS,
1146			       (unsigned long long)sh->sector + rdev->data_offset,
1147			       bdevname(rdev->bdev, b));
1148			clear_bit(R5_ReadError, &sh->dev[i].flags);
1149			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1150		}
1151		if (atomic_read(&conf->disks[i].rdev->read_errors))
1152			atomic_set(&conf->disks[i].rdev->read_errors, 0);
1153	} else {
1154		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1155		int retry = 0;
1156		rdev = conf->disks[i].rdev;
1157
1158		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1159		atomic_inc(&rdev->read_errors);
1160		if (conf->mddev->degraded)
1161			printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
1162			       mdname(conf->mddev),
1163			       (unsigned long long)sh->sector + rdev->data_offset,
1164			       bdn);
1165		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1166			/* Oh, no!!! */
1167			printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
1168			       mdname(conf->mddev),
1169			       (unsigned long long)sh->sector + rdev->data_offset,
1170			       bdn);
1171		else if (atomic_read(&rdev->read_errors)
1172			 > conf->max_nr_stripes)
1173			printk(KERN_WARNING
1174			       "raid5:%s: Too many read errors, failing device %s.\n",
1175			       mdname(conf->mddev), bdn);
1176		else
1177			retry = 1;
1178		if (retry)
1179			set_bit(R5_ReadError, &sh->dev[i].flags);
1180		else {
1181			clear_bit(R5_ReadError, &sh->dev[i].flags);
1182			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1183			md_error(conf->mddev, rdev);
1184		}
1185	}
1186	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1187	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1188	set_bit(STRIPE_HANDLE, &sh->state);
1189	release_stripe(sh);
1190}
1191
1192static void raid5_end_write_request (struct bio *bi, int error)
1193{
1194 	struct stripe_head *sh = bi->bi_private;
1195	raid5_conf_t *conf = sh->raid_conf;
1196	int disks = sh->disks, i;
1197	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1198
1199	for (i=0 ; i<disks; i++)
1200		if (bi == &sh->dev[i].req)
1201			break;
1202
1203	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1204		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1205		uptodate);
1206	if (i == disks) {
1207		BUG();
1208		return;
1209	}
1210
1211	if (!uptodate)
1212		md_error(conf->mddev, conf->disks[i].rdev);
1213
1214	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1215
1216	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1217	set_bit(STRIPE_HANDLE, &sh->state);
1218	release_stripe(sh);
1219}
1220
1221
1222static sector_t compute_blocknr(struct stripe_head *sh, int i);
1223
1224static void raid5_build_block (struct stripe_head *sh, int i)
1225{
1226	struct r5dev *dev = &sh->dev[i];
1227
1228	bio_init(&dev->req);
1229	dev->req.bi_io_vec = &dev->vec;
1230	dev->req.bi_vcnt++;
1231	dev->req.bi_max_vecs++;
1232	dev->vec.bv_page = dev->page;
1233	dev->vec.bv_len = STRIPE_SIZE;
1234	dev->vec.bv_offset = 0;
1235
1236	dev->req.bi_sector = sh->sector;
1237	dev->req.bi_private = sh;
1238
1239	dev->flags = 0;
1240	dev->sector = compute_blocknr(sh, i);
1241}
1242
1243static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1244{
1245	char b[BDEVNAME_SIZE];
1246	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1247	pr_debug("raid5: error called\n");
1248
1249	if (!test_bit(Faulty, &rdev->flags)) {
1250		set_bit(MD_CHANGE_DEVS, &mddev->flags);
1251		if (test_and_clear_bit(In_sync, &rdev->flags)) {
1252			unsigned long flags;
1253			spin_lock_irqsave(&conf->device_lock, flags);
1254			mddev->degraded++;
1255			spin_unlock_irqrestore(&conf->device_lock, flags);
1256			/*
1257			 * if recovery was running, make sure it aborts.
1258			 */
1259			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
1260		}
1261		set_bit(Faulty, &rdev->flags);
1262		printk (KERN_ALERT
1263			"raid5: Disk failure on %s, disabling device."
1264			" Operation continuing on %d devices\n",
1265			bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1266	}
1267}
1268
1269/*
1270 * Input: a 'big' sector number,
1271 * Output: index of the data and parity disk, and the sector # in them.
1272 */
1273static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1274			unsigned int data_disks, unsigned int * dd_idx,
1275			unsigned int * pd_idx, raid5_conf_t *conf)
1276{
1277	long stripe;
1278	unsigned long chunk_number;
1279	unsigned int chunk_offset;
1280	sector_t new_sector;
1281	int sectors_per_chunk = conf->chunk_size >> 9;
1282
1283	/* First compute the information on this sector */
1284
1285	/*
1286	 * Compute the chunk number and the sector offset inside the chunk
1287	 */
1288	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1289	chunk_number = r_sector;
1290	BUG_ON(r_sector != chunk_number);
1291
1292	/*
1293	 * Compute the stripe number
1294	 */
1295	stripe = chunk_number / data_disks;
1296
1297	/*
1298	 * Compute the data disk and parity disk indexes inside the stripe
1299	 */
1300	*dd_idx = chunk_number % data_disks;
1301
1302	/*
1303	 * Select the parity disk based on the user selected algorithm.
1304	 */
1305	switch(conf->level) {
1306	case 4:
1307		*pd_idx = data_disks;
1308		break;
1309	case 5:
1310		switch (conf->algorithm) {
1311		case ALGORITHM_LEFT_ASYMMETRIC:
1312			*pd_idx = data_disks - stripe % raid_disks;
1313			if (*dd_idx >= *pd_idx)
1314				(*dd_idx)++;
1315			break;
1316		case ALGORITHM_RIGHT_ASYMMETRIC:
1317			*pd_idx = stripe % raid_disks;
1318			if (*dd_idx >= *pd_idx)
1319				(*dd_idx)++;
1320			break;
1321		case ALGORITHM_LEFT_SYMMETRIC:
1322			*pd_idx = data_disks - stripe % raid_disks;
1323			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1324			break;
1325		case ALGORITHM_RIGHT_SYMMETRIC:
1326			*pd_idx = stripe % raid_disks;
1327			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1328			break;
1329		default:
1330			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1331				conf->algorithm);
1332		}
1333		break;
1334	case 6:
1335
1336		/**** FIX THIS ****/
1337		switch (conf->algorithm) {
1338		case ALGORITHM_LEFT_ASYMMETRIC:
1339			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
1340			if (*pd_idx == raid_disks-1)
1341				(*dd_idx)++; 	/* Q D D D P */
1342			else if (*dd_idx >= *pd_idx)
1343				(*dd_idx) += 2; /* D D P Q D */
1344			break;
1345		case ALGORITHM_RIGHT_ASYMMETRIC:
1346			*pd_idx = stripe % raid_disks;
1347			if (*pd_idx == raid_disks-1)
1348				(*dd_idx)++; 	/* Q D D D P */
1349			else if (*dd_idx >= *pd_idx)
1350				(*dd_idx) += 2; /* D D P Q D */
1351			break;
1352		case ALGORITHM_LEFT_SYMMETRIC:
1353			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
1354			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1355			break;
1356		case ALGORITHM_RIGHT_SYMMETRIC:
1357			*pd_idx = stripe % raid_disks;
1358			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1359			break;
1360		default:
1361			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
1362				conf->algorithm);
1363		}
1364		break;
1365	}
1366
1367	/*
1368	 * Finally, compute the new sector number
1369	 */
1370	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1371	return new_sector;
1372}
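/*
 * Worked example (illustrative only): a 4-disk RAID5 using
 * ALGORITHM_LEFT_SYMMETRIC with 64KiB chunks, so raid_disks = 4,
 * data_disks = 3 and sectors_per_chunk = 128.
 *
 *   r_sector = 300: chunk_offset = 44, chunk_number = 2, stripe = 0,
 *                   pd_idx = 3, dd_idx = (3 + 1 + 2) % 4 = 2,
 *                   new_sector = 0 * 128 + 44 = 44
 *   r_sector = 500: chunk_offset = 116, chunk_number = 3, stripe = 1,
 *                   pd_idx = 2, dd_idx = (2 + 1 + 0) % 4 = 3,
 *                   new_sector = 1 * 128 + 116 = 244
 *
 * i.e. array sector 500 lives at sector 244 (plus rdev->data_offset) of
 * member disk 3, with that stripe's parity block on disk 2.
 */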
1373
1374
1375static sector_t compute_blocknr(struct stripe_head *sh, int i)
1376{
1377	raid5_conf_t *conf = sh->raid_conf;
1378	int raid_disks = sh->disks;
1379	int data_disks = raid_disks - conf->max_degraded;
1380	sector_t new_sector = sh->sector, check;
1381	int sectors_per_chunk = conf->chunk_size >> 9;
1382	sector_t stripe;
1383	int chunk_offset;
1384	int chunk_number, dummy1, dummy2, dd_idx = i;
1385	sector_t r_sector;
1386
1387
1388	chunk_offset = sector_div(new_sector, sectors_per_chunk);
1389	stripe = new_sector;
1390	BUG_ON(new_sector != stripe);
1391
1392	if (i == sh->pd_idx)
1393		return 0;
1394	switch(conf->level) {
1395	case 4: break;
1396	case 5:
1397		switch (conf->algorithm) {
1398		case ALGORITHM_LEFT_ASYMMETRIC:
1399		case ALGORITHM_RIGHT_ASYMMETRIC:
1400			if (i > sh->pd_idx)
1401				i--;
1402			break;
1403		case ALGORITHM_LEFT_SYMMETRIC:
1404		case ALGORITHM_RIGHT_SYMMETRIC:
1405			if (i < sh->pd_idx)
1406				i += raid_disks;
1407			i -= (sh->pd_idx + 1);
1408			break;
1409		default:
1410			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1411			       conf->algorithm);
1412		}
1413		break;
1414	case 6:
1415		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
1416			return 0; /* It is the Q disk */
1417		switch (conf->algorithm) {
1418		case ALGORITHM_LEFT_ASYMMETRIC:
1419		case ALGORITHM_RIGHT_ASYMMETRIC:
1420		  	if (sh->pd_idx == raid_disks-1)
1421				i--; 	/* Q D D D P */
1422			else if (i > sh->pd_idx)
1423				i -= 2; /* D D P Q D */
1424			break;
1425		case ALGORITHM_LEFT_SYMMETRIC:
1426		case ALGORITHM_RIGHT_SYMMETRIC:
1427			if (sh->pd_idx == raid_disks-1)
1428				i--; /* Q D D D P */
1429			else {
1430				/* D D P Q D */
1431				if (i < sh->pd_idx)
1432					i += raid_disks;
1433				i -= (sh->pd_idx + 2);
1434			}
1435			break;
1436		default:
1437			printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
1438				conf->algorithm);
1439		}
1440		break;
1441	}
1442
1443	chunk_number = stripe * data_disks + i;
1444	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1445
1446	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
1447	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
1448		printk(KERN_ERR "compute_blocknr: map not correct\n");
1449		return 0;
1450	}
1451	return r_sector;
1452}
1453
1454
1455
1456/*
1457 * Copy data between a page in the stripe cache, and one or more bion
1458 * The page could align with the middle of the bio, or there could be
1459 * several bion, each with several bio_vecs, which cover part of the page
1460 * Multiple bion are linked together on bi_next.  There may be extras
1461 * at the end of this list.  We ignore them.
1462 */
1463static void copy_data(int frombio, struct bio *bio,
1464		     struct page *page,
1465		     sector_t sector)
1466{
1467	char *pa = page_address(page);
1468	struct bio_vec *bvl;
1469	int i;
1470	int page_offset;
1471
1472	if (bio->bi_sector >= sector)
1473		page_offset = (signed)(bio->bi_sector - sector) * 512;
1474	else
1475		page_offset = (signed)(sector - bio->bi_sector) * -512;
1476	bio_for_each_segment(bvl, bio, i) {
1477		int len = bio_iovec_idx(bio,i)->bv_len;
1478		int clen;
1479		int b_offset = 0;
1480
1481		if (page_offset < 0) {
1482			b_offset = -page_offset;
1483			page_offset += b_offset;
1484			len -= b_offset;
1485		}
1486
1487		if (len > 0 && page_offset + len > STRIPE_SIZE)
1488			clen = STRIPE_SIZE - page_offset;
1489		else clen = len;
1490
1491		if (clen > 0) {
1492			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1493			if (frombio)
1494				memcpy(pa+page_offset, ba+b_offset, clen);
1495			else
1496				memcpy(ba+b_offset, pa+page_offset, clen);
1497			__bio_kunmap_atomic(ba, KM_USER0);
1498		}
1499		if (clen < len) /* hit end of page */
1500			break;
1501		page_offset +=  len;
1502	}
1503}
1504
1505#define check_xor()	do {						  \
1506				if (count == MAX_XOR_BLOCKS) {		  \
1507				xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1508				count = 0;				  \
1509			   }						  \
1510			} while(0)
1511
1512static void compute_parity6(struct stripe_head *sh, int method)
1513{
1514	raid6_conf_t *conf = sh->raid_conf;
1515	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1516	struct bio *chosen;
1517	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
1518	void *ptrs[disks];
1519
1520	qd_idx = raid6_next_disk(pd_idx, disks);
1521	d0_idx = raid6_next_disk(qd_idx, disks);
1522
1523	pr_debug("compute_parity, stripe %llu, method %d\n",
1524		(unsigned long long)sh->sector, method);
1525
1526	switch(method) {
1527	case READ_MODIFY_WRITE:
1528		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */
1529	case RECONSTRUCT_WRITE:
1530		for (i= disks; i-- ;)
1531			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1532				chosen = sh->dev[i].towrite;
1533				sh->dev[i].towrite = NULL;
1534
1535				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1536					wake_up(&conf->wait_for_overlap);
1537
1538				BUG_ON(sh->dev[i].written);
1539				sh->dev[i].written = chosen;
1540			}
1541		break;
1542	case CHECK_PARITY:
1543		BUG();		/* Not implemented yet */
1544	}
1545
1546	for (i = disks; i--;)
1547		if (sh->dev[i].written) {
1548			sector_t sector = sh->dev[i].sector;
1549			struct bio *wbi = sh->dev[i].written;
1550			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1551				copy_data(1, wbi, sh->dev[i].page, sector);
1552				wbi = r5_next_bio(wbi, sector);
1553			}
1554
1555			set_bit(R5_LOCKED, &sh->dev[i].flags);
1556			set_bit(R5_UPTODATE, &sh->dev[i].flags);
1557		}
1558
1559//	switch(method) {
1560//	case RECONSTRUCT_WRITE:
1561//	case CHECK_PARITY:
1562//	case UPDATE_PARITY:
1563		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1564		/* FIX: Is this ordering of drives even remotely optimal? */
1565		count = 0;
1566		i = d0_idx;
1567		do {
1568			ptrs[count++] = page_address(sh->dev[i].page);
1569			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1570				printk("block %d/%d not uptodate on parity calc\n", i,count);
1571			i = raid6_next_disk(i, disks);
1572		} while ( i != d0_idx );
1573//		break;
1574//	}
1575
1576	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1577
1578	switch(method) {
1579	case RECONSTRUCT_WRITE:
1580		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1581		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1582		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
1583		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
1584		break;
1585	case UPDATE_PARITY:
1586		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1587		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1588		break;
1589	}
1590}
1591
1592
1593/* Compute one missing block */
1594static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1595{
1596	int i, count, disks = sh->disks;
1597	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1598	int pd_idx = sh->pd_idx;
1599	int qd_idx = raid6_next_disk(pd_idx, disks);
1600
1601	pr_debug("compute_block_1, stripe %llu, idx %d\n",
1602		(unsigned long long)sh->sector, dd_idx);
1603
1604	if ( dd_idx == qd_idx ) {
1605		/* We're actually computing the Q drive */
1606		compute_parity6(sh, UPDATE_PARITY);
1607	} else {
1608		dest = page_address(sh->dev[dd_idx].page);
1609		if (!nozero) memset(dest, 0, STRIPE_SIZE);
1610		count = 0;
1611		for (i = disks ; i--; ) {
1612			if (i == dd_idx || i == qd_idx)
1613				continue;
1614			p = page_address(sh->dev[i].page);
1615			if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1616				ptr[count++] = p;
1617			else
1618				printk("compute_block() %d, stripe %llu, %d"
1619				       " not present\n", dd_idx,
1620				       (unsigned long long)sh->sector, i);
1621
1622			check_xor();
1623		}
1624		if (count)
1625			xor_blocks(count, STRIPE_SIZE, dest, ptr);
1626		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1627		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1628	}
1629}
1630
1631/* Compute two missing blocks */
1632static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1633{
1634	int i, count, disks = sh->disks;
1635	int pd_idx = sh->pd_idx;
1636	int qd_idx = raid6_next_disk(pd_idx, disks);
1637	int d0_idx = raid6_next_disk(qd_idx, disks);
1638	int faila, failb;
1639
1640	/* faila and failb are disk numbers relative to d0_idx */
1641	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
1642	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1643	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1644
1645	BUG_ON(faila == failb);
1646	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1647
1648	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1649	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1650
1651	if ( failb == disks-1 ) {
1652		/* Q disk is one of the missing disks */
1653		if ( faila == disks-2 ) {
1654			/* Missing P+Q, just recompute */
1655			compute_parity6(sh, UPDATE_PARITY);
1656			return;
1657		} else {
1658			/* We're missing D+Q; recompute D from P */
1659			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1660			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1661			return;
1662		}
1663	}
1664
1665	/* We're missing D+P or D+D; build pointer table */
1666	{
1667		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
1668		void *ptrs[disks];
1669
1670		count = 0;
1671		i = d0_idx;
1672		do {
1673			ptrs[count++] = page_address(sh->dev[i].page);
1674			i = raid6_next_disk(i, disks);
1675			if (i != dd_idx1 && i != dd_idx2 &&
1676			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1677				printk("compute_2 with missing block %d/%d\n", count, i);
1678		} while ( i != d0_idx );
1679
1680		if ( failb == disks-2 ) {
1681			/* We're missing D+P. */
1682			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1683		} else {
1684			/* We're missing D+D. */
1685			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1686		}
1687
1688		/* Both the above update both missing blocks */
1689		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1690		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1691	}
1692}
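/*
 * Worked example for the faila/failb mapping in compute_block_2() above
 * (illustrative only): with disks = 6, pd_idx = 2, qd_idx = 3 and hence
 * d0_idx = 4, failed blocks dd_idx1 = 5 and dd_idx2 = 1 map to
 * faila = 5 - 4 = 1 and failb = 1 + (6 - 4) = 3.  In the same relative
 * numbering pd_idx becomes 4 = disks-2 and qd_idx becomes 5 = disks-1,
 * so failb matches neither and the D+D recovery path is taken.
 */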
1693
1694static int
1695handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1696{
1697	int i, pd_idx = sh->pd_idx, disks = sh->disks;
1698	int locked = 0;
1699
1700	if (rcw) {
1701		/* if we are not expanding this is a proper write request, and
1702		 * there will be bios with new data to be drained into the
1703		 * stripe cache
1704		 */
1705		if (!expand) {
1706			set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1707			sh->ops.count++;
1708		}
1709
1710		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1711		sh->ops.count++;
1712
1713		for (i = disks; i--; ) {
1714			struct r5dev *dev = &sh->dev[i];
1715
1716			if (dev->towrite) {
1717				set_bit(R5_LOCKED, &dev->flags);
1718				if (!expand)
1719					clear_bit(R5_UPTODATE, &dev->flags);
1720				locked++;
1721			}
1722		}
1723	} else {
1724		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1725			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1726
1727		set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
1728		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
1729		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
1730
1731		sh->ops.count += 3;
1732
1733		for (i = disks; i--; ) {
1734			struct r5dev *dev = &sh->dev[i];
1735			if (i == pd_idx)
1736				continue;
1737
1738			/* For a read-modify-write there may be blocks that are
1739			 * locked for reading while others are ready to be
1740			 * written so we distinguish these blocks by the
1741			 * R5_Wantprexor bit
1742			 */
1743			if (dev->towrite &&
1744			    (test_bit(R5_UPTODATE, &dev->flags) ||
1745			    test_bit(R5_Wantcompute, &dev->flags))) {
1746				set_bit(R5_Wantprexor, &dev->flags);
1747				set_bit(R5_LOCKED, &dev->flags);
1748				clear_bit(R5_UPTODATE, &dev->flags);
1749				locked++;
1750			}
1751		}
1752	}
1753
1754	/* keep the parity disk locked while asynchronous operations
1755	 * are in flight
1756	 */
1757	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1758	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1759	locked++;
1760
1761	pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
1762		__FUNCTION__, (unsigned long long)sh->sector,
1763		locked, sh->ops.pending);
1764
1765	return locked;
1766}
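/*
 * Illustrative note on the two branches above: the rcw path drains the
 * new data and recomputes parity from every data block in the stripe,
 * while the rmw path additionally schedules a prexor that first subtracts
 * the old (still up-to-date) contents of the blocks about to be rewritten
 * from the parity, so the subsequent biodrain + postxor only has to fold
 * the new data back in.
 */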
1767
1768/*
1769 * Each stripe/dev can have one or more bion attached.
1770 * toread/towrite point to the first in a chain.
1771 * The bi_next chain must be in order.
1772 */
1773static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
1774{
1775	struct bio **bip;
1776	raid5_conf_t *conf = sh->raid_conf;
1777	int firstwrite=0;
1778
1779	pr_debug("adding bh b#%llu to stripe s#%llu\n",
1780		(unsigned long long)bi->bi_sector,
1781		(unsigned long long)sh->sector);
1782
1783
1784	spin_lock(&sh->lock);
1785	spin_lock_irq(&conf->device_lock);
1786	if (forwrite) {
1787		bip = &sh->dev[dd_idx].towrite;
1788		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
1789			firstwrite = 1;
1790	} else
1791		bip = &sh->dev[dd_idx].toread;
1792	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
1793		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
1794			goto overlap;
1795		bip = & (*bip)->bi_next;
1796	}
1797	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
1798		goto overlap;
1799
1800	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
1801	if (*bip)
1802		bi->bi_next = *bip;
1803	*bip = bi;
1804	bi->bi_phys_segments ++;
1805	spin_unlock_irq(&conf->device_lock);
1806	spin_unlock(&sh->lock);
1807
1808	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
1809		(unsigned long long)bi->bi_sector,
1810		(unsigned long long)sh->sector, dd_idx);
1811
1812	if (conf->mddev->bitmap && firstwrite) {
1813		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1814				  STRIPE_SECTORS, 0);
1815		sh->bm_seq = conf->seq_flush+1;
1816		set_bit(STRIPE_BIT_DELAY, &sh->state);
1817	}
1818
1819	if (forwrite) {
1820		/* check if page is covered */
1821		sector_t sector = sh->dev[dd_idx].sector;
1822		for (bi=sh->dev[dd_idx].towrite;
1823		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
1824			     bi && bi->bi_sector <= sector;
1825		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
1826			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
1827				sector = bi->bi_sector + (bi->bi_size>>9);
1828		}
1829		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
1830			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
1831	}
1832	return 1;
1833
1834 overlap:
1835	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1836	spin_unlock_irq(&conf->device_lock);
1837	spin_unlock(&sh->lock);
1838	return 0;
1839}
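/*
 * Worked example for the coverage walk in add_stripe_bio() above
 * (illustrative only): with STRIPE_SECTORS == 8 and dev->sector == 0,
 * two queued writes covering sectors 0-3 and 4-7 advance 'sector' from
 * 0 to 4 to 8.  That reaches dev->sector + STRIPE_SECTORS, so
 * R5_OVERWRITE is set and the whole stripe page can be written without
 * first reading it in.
 */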
1840
1841static void end_reshape(raid5_conf_t *conf);
1842
1843static int page_is_zero(struct page *p)
1844{
1845	char *a = page_address(p);
1846	return ((*(u32*)a) == 0 &&
1847		memcmp(a, a+4, STRIPE_SIZE-4)==0);
1848}
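/*
 * Note on the overlapping memcmp above: if the first 32-bit word is zero
 * and every byte equals the byte four positions before it, then by
 * induction bytes 4-7 equal bytes 0-3 (zero), bytes 8-11 equal bytes 4-7,
 * and so on across the whole page, so a single comparison pass proves the
 * STRIPE_SIZE page is entirely zero.
 */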
1849
1850static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1851{
1852	int sectors_per_chunk = conf->chunk_size >> 9;
1853	int pd_idx, dd_idx;
1854	int chunk_offset = sector_div(stripe, sectors_per_chunk);
1855
1856	raid5_compute_sector(stripe * (disks - conf->max_degraded)
1857			     *sectors_per_chunk + chunk_offset,
1858			     disks, disks - conf->max_degraded,
1859			     &dd_idx, &pd_idx, conf);
1860	return pd_idx;
1861}
1862
1863static void
1864handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1865				struct stripe_head_state *s, int disks,
1866				struct bio **return_bi)
1867{
1868	int i;
1869	for (i = disks; i--; ) {
1870		struct bio *bi;
1871		int bitmap_end = 0;
1872
1873		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1874			mdk_rdev_t *rdev;
1875			rcu_read_lock();
1876			rdev = rcu_dereference(conf->disks[i].rdev);
1877			if (rdev && test_bit(In_sync, &rdev->flags))
1878				/* multiple read failures in one stripe */
1879				md_error(conf->mddev, rdev);
1880			rcu_read_unlock();
1881		}
1882		spin_lock_irq(&conf->device_lock);
1883		/* fail all writes first */
1884		bi = sh->dev[i].towrite;
1885		sh->dev[i].towrite = NULL;
1886		if (bi) {
1887			s->to_write--;
1888			bitmap_end = 1;
1889		}
1890
1891		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1892			wake_up(&conf->wait_for_overlap);
1893
1894		while (bi && bi->bi_sector <
1895			sh->dev[i].sector + STRIPE_SECTORS) {
1896			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1897			clear_bit(BIO_UPTODATE, &bi->bi_flags);
1898			if (--bi->bi_phys_segments == 0) {
1899				md_write_end(conf->mddev);
1900				bi->bi_next = *return_bi;
1901				*return_bi = bi;
1902			}
1903			bi = nextbi;
1904		}
1905		/* and fail all 'written' */
1906		bi = sh->dev[i].written;
1907		sh->dev[i].written = NULL;
1908		if (bi) bitmap_end = 1;
1909		while (bi && bi->bi_sector <
1910		       sh->dev[i].sector + STRIPE_SECTORS) {
1911			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1912			clear_bit(BIO_UPTODATE, &bi->bi_flags);
1913			if (--bi->bi_phys_segments == 0) {
1914				md_write_end(conf->mddev);
1915				bi->bi_next = *return_bi;
1916				*return_bi = bi;
1917			}
1918			bi = bi2;
1919		}
1920
1921		/* fail any reads if this device is non-operational and
1922		 * the data has not reached the cache yet.
1923		 */
1924		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
1925		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1926		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
1927			bi = sh->dev[i].toread;
1928			sh->dev[i].toread = NULL;
1929			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1930				wake_up(&conf->wait_for_overlap);
1931			if (bi) s->to_read--;
1932			while (bi && bi->bi_sector <
1933			       sh->dev[i].sector + STRIPE_SECTORS) {
1934				struct bio *nextbi =
1935					r5_next_bio(bi, sh->dev[i].sector);
1936				clear_bit(BIO_UPTODATE, &bi->bi_flags);
1937				if (--bi->bi_phys_segments == 0) {
1938					bi->bi_next = *return_bi;
1939					*return_bi = bi;
1940				}
1941				bi = nextbi;
1942			}
1943		}
1944		spin_unlock_irq(&conf->device_lock);
1945		if (bitmap_end)
1946			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1947					STRIPE_SECTORS, 0, 0);
1948	}
1949
1950}
1951
1952/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
1953 * to process
1954 */
1955static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1956			struct stripe_head_state *s, int disk_idx, int disks)
1957{
1958	struct r5dev *dev = &sh->dev[disk_idx];
1959	struct r5dev *failed_dev = &sh->dev[s->failed_num];
1960
1961	/* don't schedule compute operations or reads on the parity block while
1962	 * a check is in flight
1963	 */
1964	if ((disk_idx == sh->pd_idx) &&
1965	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1966		return ~0;
1967
1968	/* is the data in this block needed, and can we get it? */
1969	if (!test_bit(R5_LOCKED, &dev->flags) &&
1970	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
1971	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1972	     s->syncing || s->expanding || (s->failed &&
1973	     (failed_dev->toread || (failed_dev->towrite &&
1974	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
1975	     ))))) {
1976		/* 1/ We would like to get this block, possibly by computing it,
1977		 * but we might not be able to.
1978		 *
1979		 * 2/ Since parity check operations potentially make the parity
1980		 * block !uptodate it will need to be refreshed before any
1981		 * compute operations on data disks are scheduled.
1982		 *
1983		 * 3/ We hold off parity block re-reads until check operations
1984		 * have quiesced.
1985		 */
1986		if ((s->uptodate == disks - 1) &&
1987		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
1988			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
1989			set_bit(R5_Wantcompute, &dev->flags);
1990			sh->ops.target = disk_idx;
1991			s->req_compute = 1;
1992			sh->ops.count++;
1993			/* Careful: from this point on 'uptodate' is in the eye
1994			 * of raid5_run_ops which services 'compute' operations
1995			 * before writes. R5_Wantcompute flags a block that will
1996			 * be R5_UPTODATE by the time it is needed for a
1997			 * subsequent operation.
1998			 */
1999			s->uptodate++;
2000			return 0; /* uptodate + compute == disks */
2001		} else if ((s->uptodate < disks - 1) &&
2002			test_bit(R5_Insync, &dev->flags)) {
2003			/* Note: we hold off compute operations while checks are
2004			 * in flight, but we still prefer 'compute' over 'read'
2005		 * hence we only read if (uptodate < disks-1)
2006			 */
2007			set_bit(R5_LOCKED, &dev->flags);
2008			set_bit(R5_Wantread, &dev->flags);
2009			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2010				sh->ops.count++;
2011			s->locked++;
2012			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2013				s->syncing);
2014		}
2015	}
2016
2017	return ~0;
2018}
2019
2020static void handle_issuing_new_read_requests5(struct stripe_head *sh,
2021			struct stripe_head_state *s, int disks)
2022{
2023	int i;
2024
2025	/* Clear completed compute operations.  Parity recovery
2026	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2027	 * later on in this routine
2028	 */
2029	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2030		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2031		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2032		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2033		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2034	}
2035
2036	/* look for blocks to read/compute, skip this if a compute
2037	 * is already in flight, or if the stripe contents are in the
2038	 * midst of changing due to a write
2039	 */
2040	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2041		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
2042		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2043		for (i = disks; i--; )
2044			if (__handle_issuing_new_read_requests5(
2045				sh, s, i, disks) == 0)
2046				break;
2047	}
2048	set_bit(STRIPE_HANDLE, &sh->state);
2049}
2050
2051static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2052			struct stripe_head_state *s, struct r6_state *r6s,
2053			int disks)
2054{
2055	int i;
2056	for (i = disks; i--; ) {
2057		struct r5dev *dev = &sh->dev[i];
2058		if (!test_bit(R5_LOCKED, &dev->flags) &&
2059		    !test_bit(R5_UPTODATE, &dev->flags) &&
2060		    (dev->toread || (dev->towrite &&
2061		     !test_bit(R5_OVERWRITE, &dev->flags)) ||
2062		     s->syncing || s->expanding ||
2063		     (s->failed >= 1 &&
2064		      (sh->dev[r6s->failed_num[0]].toread ||
2065		       s->to_write)) ||
2066		     (s->failed >= 2 &&
2067		      (sh->dev[r6s->failed_num[1]].toread ||
2068		       s->to_write)))) {
2069			/* we would like to get this block, possibly
2070			 * by computing it, but we might not be able to
2071			 */
2072			if (s->uptodate == disks-1) {
2073				pr_debug("Computing stripe %llu block %d\n",
2074				       (unsigned long long)sh->sector, i);
2075				compute_block_1(sh, i, 0);
2076				s->uptodate++;
2077			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2078				/* Computing 2-failure is *very* expensive; only
2079				 * do it if failed >= 2
2080				 */
2081				int other;
2082				for (other = disks; other--; ) {
2083					if (other == i)
2084						continue;
2085					if (!test_bit(R5_UPTODATE,
2086					      &sh->dev[other].flags))
2087						break;
2088				}
2089				BUG_ON(other < 0);
2090				pr_debug("Computing stripe %llu blocks %d,%d\n",
2091				       (unsigned long long)sh->sector,
2092				       i, other);
2093				compute_block_2(sh, i, other);
2094				s->uptodate += 2;
2095			} else if (test_bit(R5_Insync, &dev->flags)) {
2096				set_bit(R5_LOCKED, &dev->flags);
2097				set_bit(R5_Wantread, &dev->flags);
2098				s->locked++;
2099				pr_debug("Reading block %d (sync=%d)\n",
2100					i, s->syncing);
2101			}
2102		}
2103	}
2104	set_bit(STRIPE_HANDLE, &sh->state);
2105}
2106
2107
2108/* handle_completed_write_requests
2109 * any written block on an uptodate or failed drive can be returned.
2110 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2111 * never LOCKED, so we don't need to test 'failed' directly.
2112 */
2113static void handle_completed_write_requests(raid5_conf_t *conf,
2114	struct stripe_head *sh, int disks, struct bio **return_bi)
2115{
2116	int i;
2117	struct r5dev *dev;
2118
2119	for (i = disks; i--; )
2120		if (sh->dev[i].written) {
2121			dev = &sh->dev[i];
2122			if (!test_bit(R5_LOCKED, &dev->flags) &&
2123				test_bit(R5_UPTODATE, &dev->flags)) {
2124				/* We can return any write requests */
2125				struct bio *wbi, *wbi2;
2126				int bitmap_end = 0;
2127				pr_debug("Return write for disc %d\n", i);
2128				spin_lock_irq(&conf->device_lock);
2129				wbi = dev->written;
2130				dev->written = NULL;
2131				while (wbi && wbi->bi_sector <
2132					dev->sector + STRIPE_SECTORS) {
2133					wbi2 = r5_next_bio(wbi, dev->sector);
2134					if (--wbi->bi_phys_segments == 0) {
2135						md_write_end(conf->mddev);
2136						wbi->bi_next = *return_bi;
2137						*return_bi = wbi;
2138					}
2139					wbi = wbi2;
2140				}
2141				if (dev->towrite == NULL)
2142					bitmap_end = 1;
2143				spin_unlock_irq(&conf->device_lock);
2144				if (bitmap_end)
2145					bitmap_endwrite(conf->mddev->bitmap,
2146							sh->sector,
2147							STRIPE_SECTORS,
2148					 !test_bit(STRIPE_DEGRADED, &sh->state),
2149							0);
2150			}
2151		}
2152}
2153
2154static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2155		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
2156{
2157	int rmw = 0, rcw = 0, i;
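	/* count the reads each strategy would need: 'rmw' for read-modify-write
	 * (read the old data and parity), 'rcw' for reconstruct-write (read the
	 * blocks that are not being overwritten).  A block that cannot be read
	 * makes that strategy prohibitively expensive, hence the 2*disks penalty.
	 */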
2158	for (i = disks; i--; ) {
2159		/* would I have to read this buffer for read_modify_write */
2160		struct r5dev *dev = &sh->dev[i];
2161		if ((dev->towrite || i == sh->pd_idx) &&
2162		    !test_bit(R5_LOCKED, &dev->flags) &&
2163		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2164		      test_bit(R5_Wantcompute, &dev->flags))) {
2165			if (test_bit(R5_Insync, &dev->flags))
2166				rmw++;
2167			else
2168				rmw += 2*disks;  /* cannot read it */
2169		}
2170		/* Would I have to read this buffer for reconstruct_write */
2171		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2172		    !test_bit(R5_LOCKED, &dev->flags) &&
2173		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2174		    test_bit(R5_Wantcompute, &dev->flags))) {
2175			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2176			else
2177				rcw += 2*disks;
2178		}
2179	}
2180	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2181		(unsigned long long)sh->sector, rmw, rcw);
2182	set_bit(STRIPE_HANDLE, &sh->state);
2183	if (rmw < rcw && rmw > 0)
2184		/* prefer read-modify-write, but need to get some data */
2185		for (i = disks; i--; ) {
2186			struct r5dev *dev = &sh->dev[i];
2187			if ((dev->towrite || i == sh->pd_idx) &&
2188			    !test_bit(R5_LOCKED, &dev->flags) &&
2189			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2190			    test_bit(R5_Wantcompute, &dev->flags)) &&
2191			    test_bit(R5_Insync, &dev->flags)) {
2192				if (
2193				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2194					pr_debug("Read_old block "
2195						"%d for r-m-w\n", i);
2196					set_bit(R5_LOCKED, &dev->flags);
2197					set_bit(R5_Wantread, &dev->flags);
2198					if (!test_and_set_bit(
2199						STRIPE_OP_IO, &sh->ops.pending))
2200						sh->ops.count++;
2201					s->locked++;
2202				} else {
2203					set_bit(STRIPE_DELAYED, &sh->state);
2204					set_bit(STRIPE_HANDLE, &sh->state);
2205				}
2206			}
2207		}
2208	if (rcw <= rmw && rcw > 0)
2209		/* want reconstruct write, but need to get some data */
2210		for (i = disks; i--; ) {
2211			struct r5dev *dev = &sh->dev[i];
2212			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2213			    i != sh->pd_idx &&
2214			    !test_bit(R5_LOCKED, &dev->flags) &&
2215			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2216			    test_bit(R5_Wantcompute, &dev->flags)) &&
2217			    test_bit(R5_Insync, &dev->flags)) {
2218				if (
2219				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2220					pr_debug("Read_old block "
2221						"%d for Reconstruct\n", i);
2222					set_bit(R5_LOCKED, &dev->flags);
2223					set_bit(R5_Wantread, &dev->flags);
2224					if (!test_and_set_bit(
2225						STRIPE_OP_IO, &sh->ops.pending))
2226						sh->ops.count++;
2227					s->locked++;
2228				} else {
2229					set_bit(STRIPE_DELAYED, &sh->state);
2230					set_bit(STRIPE_HANDLE, &sh->state);
2231				}
2232			}
2233		}
2234	/* now if nothing is locked, and if we have enough data,
2235	 * we can start a write request
2236	 */
2237	/* since handle_stripe can be called at any time we need to handle the
2238	 * case where a compute block operation has been submitted and then a
2239	 * subsequent call wants to start a write request.  raid5_run_ops only
2240	 * handles the case where compute block and postxor are requested
2241	 * simultaneously.  If this is not the case then new writes need to be
2242	 * held off until the compute completes.
2243	 */
2244	if ((s->req_compute ||
2245	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
2246		(s->locked == 0 && (rcw == 0 || rmw == 0) &&
2247		!test_bit(STRIPE_BIT_DELAY, &sh->state)))
2248		s->locked += handle_write_operations5(sh, rcw == 0, 0);
2249}
2250
2251static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2252		struct stripe_head *sh,	struct stripe_head_state *s,
2253		struct r6_state *r6s, int disks)
2254{
2255	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2256	int qd_idx = r6s->qd_idx;
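	/* raid6 only uses reconstruct-write here: count the blocks that must be
	 * read ('rcw') and those that are unreadable and will have to be
	 * computed ('must_compute') before parity can be regenerated
	 */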
2257	for (i = disks; i--; ) {
2258		struct r5dev *dev = &sh->dev[i];
2259		/* Would I have to read this buffer for reconstruct_write */
2260		if (!test_bit(R5_OVERWRITE, &dev->flags)
2261		    && i != pd_idx && i != qd_idx
2262		    && (!test_bit(R5_LOCKED, &dev->flags)
2263			    ) &&
2264		    !test_bit(R5_UPTODATE, &dev->flags)) {
2265			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2266			else {
2267				pr_debug("raid6: must_compute: "
2268					"disk %d flags=%#lx\n", i, dev->flags);
2269				must_compute++;
2270			}
2271		}
2272	}
2273	pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2274	       (unsigned long long)sh->sector, rcw, must_compute);
2275	set_bit(STRIPE_HANDLE, &sh->state);
2276
2277	if (rcw > 0)
2278		/* want reconstruct write, but need to get some data */
2279		for (i = disks; i--; ) {
2280			struct r5dev *dev = &sh->dev[i];
2281			if (!test_bit(R5_OVERWRITE, &dev->flags)
2282			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2283			    && !test_bit(R5_LOCKED, &dev->flags) &&
2284			    !test_bit(R5_UPTODATE, &dev->flags) &&
2285			    test_bit(R5_Insync, &dev->flags)) {
2286				if (
2287				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2288					pr_debug("Read_old stripe %llu "
2289						"block %d for Reconstruct\n",
2290					     (unsigned long long)sh->sector, i);
2291					set_bit(R5_LOCKED, &dev->flags);
2292					set_bit(R5_Wantread, &dev->flags);
2293					s->locked++;
2294				} else {
2295					pr_debug("Request delayed stripe %llu "
2296						"block %d for Reconstruct\n",
2297					     (unsigned long long)sh->sector, i);
2298					set_bit(STRIPE_DELAYED, &sh->state);
2299					set_bit(STRIPE_HANDLE, &sh->state);
2300				}
2301			}
2302		}
2303	/* now if nothing is locked, and if we have enough data, we can start a
2304	 * write request
2305	 */
2306	if (s->locked == 0 && rcw == 0 &&
2307	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2308		if (must_compute > 0) {
2309			/* We have failed blocks and need to compute them */
2310			switch (s->failed) {
2311			case 0:
2312				BUG();
2313			case 1:
2314				compute_block_1(sh, r6s->failed_num[0], 0);
2315				break;
2316			case 2:
2317				compute_block_2(sh, r6s->failed_num[0],
2318						r6s->failed_num[1]);
2319				break;
2320			default: /* This request should have been failed? */
2321				BUG();
2322			}
2323		}
2324
2325		pr_debug("Computing parity for stripe %llu\n",
2326			(unsigned long long)sh->sector);
2327		compute_parity6(sh, RECONSTRUCT_WRITE);
2328		/* now every locked buffer is ready to be written */
2329		for (i = disks; i--; )
2330			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2331				pr_debug("Writing stripe %llu block %d\n",
2332				       (unsigned long long)sh->sector, i);
2333				s->locked++;
2334				set_bit(R5_Wantwrite, &sh->dev[i].flags);
2335			}
2336		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2337		set_bit(STRIPE_INSYNC, &sh->state);
2338
2339		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2340			atomic_dec(&conf->preread_active_stripes);
2341			if (atomic_read(&conf->preread_active_stripes) <
2342			    IO_THRESHOLD)
2343				md_wakeup_thread(conf->mddev->thread);
2344		}
2345	}
2346}
2347
2348static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2349				struct stripe_head_state *s, int disks)
2350{
2351	set_bit(STRIPE_HANDLE, &sh->state);
2352	/* Take one of the following actions:
2353	 * 1/ start a check parity operation if (uptodate == disks)
2354	 * 2/ finish a check parity operation and act on the result
2355	 * 3/ skip to the writeback section if we previously
2356	 *    initiated a recovery operation
2357	 */
2358	if (s->failed == 0 &&
2359	    !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2360		if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2361			BUG_ON(s->uptodate != disks);
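			/* a check operation may leave the parity block !uptodate
			 * (see __handle_issuing_new_read_requests5), so drop
			 * R5_UPTODATE until the result is acted upon
			 */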
2362			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2363			sh->ops.count++;
2364			s->uptodate--;
2365		} else if (
2366		       test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
2367			clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
2368			clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2369
2370			if (sh->ops.zero_sum_result == 0)
2371				/* parity is correct (on disc,
2372				 * not in buffer any more)
2373				 */
2374				set_bit(STRIPE_INSYNC, &sh->state);
2375			else {
2376				conf->mddev->resync_mismatches +=
2377					STRIPE_SECTORS;
2378				if (test_bit(
2379				     MD_RECOVERY_CHECK, &conf->mddev->recovery))
2380					/* don't try to repair!! */
2381					set_bit(STRIPE_INSYNC, &sh->state);
2382				else {
2383					set_bit(STRIPE_OP_COMPUTE_BLK,
2384						&sh->ops.pending);
2385					set_bit(STRIPE_OP_MOD_REPAIR_PD,
2386						&sh->ops.pending);
2387					set_bit(R5_Wantcompute,
2388						&sh->dev[sh->pd_idx].flags);
2389					sh->ops.target = sh->pd_idx;
2390					sh->ops.count++;
2391					s->uptodate++;
2392				}
2393			}
2394		}
2395	}
2396
2397	/* check if we can clear a parity disk reconstruct */
2398	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2399		test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2400
2401		clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2402		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2403		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2404		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2405	}
2406
2407	/* Wait for check parity and compute block operations to complete
2408	 * before write-back
2409	 */
2410	if (!test_bit(STRIPE_INSYNC, &sh->state) &&
2411		!test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2412		!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2413		struct r5dev *dev;
2414		/* either failed parity check, or recovery is happening */
2415		if (s->failed == 0)
2416			s->failed_num = sh->pd_idx;
2417		dev = &sh->dev[s->failed_num];
2418		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2419		BUG_ON(s->uptodate != disks);
2420
2421		set_bit(R5_LOCKED, &dev->flags);
2422		set_bit(R5_Wantwrite, &dev->flags);
2423		if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2424			sh->ops.count++;
2425
2426		clear_bit(STRIPE_DEGRADED, &sh->state);
2427		s->locked++;
2428		set_bit(STRIPE_INSYNC, &sh->state);
2429	}
2430}
2431
2432
2433static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2434				struct stripe_head_state *s,
2435				struct r6_state *r6s, struct page *tmp_page,
2436				int disks)
2437{
2438	int update_p = 0, update_q = 0;
2439	struct r5dev *dev;
2440	int pd_idx = sh->pd_idx;
2441	int qd_idx = r6s->qd_idx;
2442
2443	set_bit(STRIPE_HANDLE, &sh->state);
2444
2445	BUG_ON(s->failed > 2);
2446	BUG_ON(s->uptodate < disks);
2447	/* Want to check and possibly repair P and Q.
2448	 * However there could be one 'failed' device, in which
2449	 * case we can only check one of them, possibly using the
2450	 * other to generate missing data
2451	 */
2452
2453	/* If !tmp_page, we cannot do the calculations,
2454	 * but as we have set STRIPE_HANDLE, we will soon be called
2455	 * by stripe_handle with a tmp_page - just wait until then.
2456	 */
2457	if (tmp_page) {
2458		if (s->failed == r6s->q_failed) {
2459			/* The only possible failed device holds 'Q', so it
2460			 * makes sense to check P (If anything else were failed,
2461			 * we would have used P to recreate it).
2462			 */
2463			compute_block_1(sh, pd_idx, 1);
2464			if (!page_is_zero(sh->dev[pd_idx].page)) {
2465				compute_block_1(sh, pd_idx, 0);
2466				update_p = 1;
2467			}
2468		}
2469		if (!r6s->q_failed && s->failed < 2) {
2470			/* q is not failed, and we didn't use it to generate
2471			 * anything, so it makes sense to check it
2472			 */
2473			memcpy(page_address(tmp_page),
2474			       page_address(sh->dev[qd_idx].page),
2475			       STRIPE_SIZE);
2476			compute_parity6(sh, UPDATE_PARITY);
2477			if (memcmp(page_address(tmp_page),
2478				   page_address(sh->dev[qd_idx].page),
2479				   STRIPE_SIZE) != 0) {
2480				clear_bit(STRIPE_INSYNC, &sh->state);
2481				update_q = 1;
2482			}
2483		}
2484		if (update_p || update_q) {
2485			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2486			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2487				/* don't try to repair!! */
2488				update_p = update_q = 0;
2489		}
2490
2491		/* now write out any block on a failed drive,
2492		 * or P or Q if they need it
2493		 */
2494
2495		if (s->failed == 2) {
2496			dev = &sh->dev[r6s->failed_num[1]];
2497			s->locked++;
2498			set_bit(R5_LOCKED, &dev->flags);
2499			set_bit(R5_Wantwrite, &dev->flags);
2500		}
2501		if (s->failed >= 1) {
2502			dev = &sh->dev[r6s->failed_num[0]];
2503			s->locked++;
2504			set_bit(R5_LOCKED, &dev->flags);
2505			set_bit(R5_Wantwrite, &dev->flags);
2506		}
2507
2508		if (update_p) {
2509			dev = &sh->dev[pd_idx];
2510			s->locked++;
2511			set_bit(R5_LOCKED, &dev->flags);
2512			set_bit(R5_Wantwrite, &dev->flags);
2513		}
2514		if (update_q) {
2515			dev = &sh->dev[qd_idx];
2516			s->locked++;
2517			set_bit(R5_LOCKED, &dev->flags);
2518			set_bit(R5_Wantwrite, &dev->flags);
2519		}
2520		clear_bit(STRIPE_DEGRADED, &sh->state);
2521
2522		set_bit(STRIPE_INSYNC, &sh->state);
2523	}
2524}
2525
2526static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2527				struct r6_state *r6s)
2528{
2529	int i;
2530
2531	/* We have read all the blocks in this stripe and now we need to
2532	 * copy some of them into a target stripe for expand.
2533	 */
2534	struct dma_async_tx_descriptor *tx = NULL;
2535	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2536	for (i = 0; i < sh->disks; i++)
2537		if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
2538			int dd_idx, pd_idx, j;
2539			struct stripe_head *sh2;
2540
2541			sector_t bn = compute_blocknr(sh, i);
2542			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
2543						conf->raid_disks -
2544						conf->max_degraded, &dd_idx,
2545						&pd_idx, conf);
2546			sh2 = get_active_stripe(conf, s, conf->raid_disks,
2547						pd_idx, 1);
2548			if (sh2 == NULL)
2549				/* so far only the early blocks of this stripe
2550				 * have been requested.  When later blocks
2551				 * get requested, we will try again
2552				 */
2553				continue;
2554			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2555			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2556				/* must have already done this block */
2557				release_stripe(sh2);
2558				continue;
2559			}
2560
2561			/* place all the copies on one channel */
2562			tx = async_memcpy(sh2->dev[dd_idx].page,
2563				sh->dev[i].page, 0, 0, STRIPE_SIZE,
2564				ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2565
2566			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2567			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2568			for (j = 0; j < conf->raid_disks; j++)
2569				if (j != sh2->pd_idx &&
2570				    (!r6s || j != raid6_next_disk(sh2->pd_idx,
2571								 sh2->disks)) &&
2572				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
2573					break;
2574			if (j == conf->raid_disks) {
2575				set_bit(STRIPE_EXPAND_READY, &sh2->state);
2576				set_bit(STRIPE_HANDLE, &sh2->state);
2577			}
2578			release_stripe(sh2);
2579
2580		}
2581	/* done submitting copies, wait for them to complete */
2582	if (tx) {
2583		async_tx_ack(tx);
2584		dma_wait_for_async_tx(tx);
2585	}
2586}
2587
2588/*
2589 * handle_stripe - do things to a stripe.
2590 *
2591 * We lock the stripe and then examine the state of various bits
2592 * to see what needs to be done.
2593 * Possible results:
2594 *    return some read requests which now have data
2595 *    return some write requests which are safely on disc
2596 *    schedule a read on some buffers
2597 *    schedule a write of some buffers
2598 *    return confirmation of parity correctness
2599 *
2600 * buffers are taken off read_list or write_list, and bh_cache buffers
2601 * get BH_Lock set before the stripe lock is released.
2602 *
2603 */
2604
2605static void handle_stripe5(struct stripe_head *sh)
2606{
2607	raid5_conf_t *conf = sh->raid_conf;
2608	int disks = sh->disks, i;
2609	struct bio *return_bi = NULL;
2610	struct stripe_head_state s;
2611	struct r5dev *dev;
2612	unsigned long pending = 0;
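	/* stripe operations requested below are gathered under sh->lock and
	 * issued via raid5_run_ops() once the lock is dropped
	 */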
2613
2614	memset(&s, 0, sizeof(s));
2615	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
2616		"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
2617		atomic_read(&sh->count), sh->pd_idx,
2618		sh->ops.pending, sh->ops.ack, sh->ops.complete);
2619
2620	spin_lock(&sh->lock);
2621	clear_bit(STRIPE_HANDLE, &sh->state);
2622	clear_bit(STRIPE_DELAYED, &sh->state);
2623
2624	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2625	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2626	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2627	/* Now to look around and see what can be done */
2628
2629	/* clean-up completed biofill operations */
2630	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2631		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2632		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2633		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2634	}
2635
2636	rcu_read_lock();
2637	for (i=disks; i--; ) {
2638		mdk_rdev_t *rdev;
2639		struct r5dev *dev = &sh->dev[i];
2640		clear_bit(R5_Insync, &dev->flags);
2641
2642		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2643			"written %p\n",	i, dev->flags, dev->toread, dev->read,
2644			dev->towrite, dev->written);
2645
2646		/* maybe we can request a biofill operation
2647		 *
2648		 * new wantfill requests are only permitted while
2649		 * STRIPE_OP_BIOFILL is clear
2650		 */
2651		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2652			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2653			set_bit(R5_Wantfill, &dev->flags);
2654
2655		/* now count some things */
2656		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2657		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2658		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2659
2660		if (test_bit(R5_Wantfill, &dev->flags))
2661			s.to_fill++;
2662		else if (dev->toread)
2663			s.to_read++;
2664		if (dev->towrite) {
2665			s.to_write++;
2666			if (!test_bit(R5_OVERWRITE, &dev->flags))
2667				s.non_overwrite++;
2668		}
2669		if (dev->written)
2670			s.written++;
2671		rdev = rcu_dereference(conf->disks[i].rdev);
2672		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2673			/* The ReadError flag will just be confusing now */
2674			clear_bit(R5_ReadError, &dev->flags);
2675			clear_bit(R5_ReWrite, &dev->flags);
2676		}
2677		if (!rdev || !test_bit(In_sync, &rdev->flags)
2678		    || test_bit(R5_ReadError, &dev->flags)) {
2679			s.failed++;
2680			s.failed_num = i;
2681		} else
2682			set_bit(R5_Insync, &dev->flags);
2683	}
2684	rcu_read_unlock();
2685
2686	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2687		sh->ops.count++;
2688
2689	pr_debug("locked=%d uptodate=%d to_read=%d"
2690		" to_write=%d failed=%d failed_num=%d\n",
2691		s.locked, s.uptodate, s.to_read, s.to_write,
2692		s.failed, s.failed_num);
2693	/* check if the array has lost two devices and, if so, some requests might
2694	 * need to be failed
2695	 */
2696	if (s.failed > 1 && s.to_read+s.to_write+s.written)
2697		handle_requests_to_failed_array(conf, sh, &s, disks,
2698						&return_bi);
2699	if (s.failed > 1 && s.syncing) {
2700		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2701		clear_bit(STRIPE_SYNCING, &sh->state);
2702		s.syncing = 0;
2703	}
2704
2705	/* might be able to return some write requests if the parity block
2706	 * is safe, or on a failed drive
2707	 */
2708	dev = &sh->dev[sh->pd_idx];
2709	if ( s.written &&
2710	     ((test_bit(R5_Insync, &dev->flags) &&
2711	       !test_bit(R5_LOCKED, &dev->flags) &&
2712	       test_bit(R5_UPTODATE, &dev->flags)) ||
2713	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
2714		handle_completed_write_requests(conf, sh, disks, &return_bi);
2715
2716	/* Now we might consider reading some blocks, either to check/generate
2717	 * parity, or to satisfy requests
2718	 * or to load a block that is being partially written.
2719	 */
2720	if (s.to_read || s.non_overwrite ||
2721	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
2722	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
2723		handle_issuing_new_read_requests5(sh, &s, disks);
2724
2725	/* Now we check to see if any write operations have recently
2726	 * completed
2727	 */
2728
2729	/* leave prexor set until postxor is done, allows us to distinguish
2730	 * a rmw from a rcw during biodrain
2731	 */
2732	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
2733		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2734
2735		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
2736		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
2737		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
2738
2739		for (i = disks; i--; )
2740			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2741	}
2742
2743	/* if only POSTXOR is set then this is an 'expand' postxor */
2744	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2745		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2746
2747		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2748		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2749		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2750
2751		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2752		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2753		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2754
2755		/* All the 'written' buffers and the parity block are ready to
2756		 * be written back to disk
2757		 */
2758		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
2759		for (i = disks; i--; ) {
2760			dev = &sh->dev[i];
2761			if (test_bit(R5_LOCKED, &dev->flags) &&
2762				(i == sh->pd_idx || dev->written)) {
2763				pr_debug("Writing block %d\n", i);
2764				set_bit(R5_Wantwrite, &dev->flags);
2765				if (!test_and_set_bit(
2766				    STRIPE_OP_IO, &sh->ops.pending))
2767					sh->ops.count++;
2768				if (!test_bit(R5_Insync, &dev->flags) ||
2769				    (i == sh->pd_idx && s.failed == 0))
2770					set_bit(STRIPE_INSYNC, &sh->state);
2771			}
2772		}
2773		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2774			atomic_dec(&conf->preread_active_stripes);
2775			if (atomic_read(&conf->preread_active_stripes) <
2776				IO_THRESHOLD)
2777				md_wakeup_thread(conf->mddev->thread);
2778		}
2779	}
2780
2781	/* Now to consider new write requests and what else, if anything
2782	 * should be read.  We do not handle new writes when:
2783	 * 1/ A 'write' operation (copy+xor) is already in flight.
2784	 * 2/ A 'check' operation is in flight, as it may clobber the parity
2785	 *    block.
2786	 */
2787	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
2788			  !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
2789		handle_issuing_new_write_requests5(conf, sh, &s, disks);
2790
2791	/* maybe we need to check and possibly fix the parity for this stripe
2792	 * Any reads will already have been scheduled, so we just see if enough
2793	 * data is available.  The parity check is held off while parity
2794	 * dependent operations are in flight.
2795	 */
2796	if ((s.syncing && s.locked == 0 &&
2797	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
2798	     !test_bit(STRIPE_INSYNC, &sh->state)) ||
2799	      test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
2800	      test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2801		handle_parity_checks5(conf, sh, &s, disks);
2802
2803	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2804		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2805		clear_bit(STRIPE_SYNCING, &sh->state);
2806	}
2807
2808	/* If the failed drive is just a ReadError, then we might need to progress
2809	 * the repair/check process
2810	 */
2811	if (s.failed == 1 && !conf->mddev->ro &&
2812	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
2813	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
2814	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
2815		) {
2816		dev = &sh->dev[s.failed_num];
2817		if (!test_bit(R5_ReWrite, &dev->flags)) {
2818			set_bit(R5_Wantwrite, &dev->flags);
2819			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2820				sh->ops.count++;
2821			set_bit(R5_ReWrite, &dev->flags);
2822			set_bit(R5_LOCKED, &dev->flags);
2823			s.locked++;
2824		} else {
2825			/* let's read it back */
2826			set_bit(R5_Wantread, &dev->flags);
2827			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2828				sh->ops.count++;
2829			set_bit(R5_LOCKED, &dev->flags);
2830			s.locked++;
2831		}
2832	}
2833
2834	/* Finish postxor operations initiated by the expansion
2835	 * process
2836	 */
2837	if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2838		!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2839
2840		clear_bit(STRIPE_EXPANDING, &sh->state);
2841
2842		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2843		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2844		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2845
2846		for (i = conf->raid_disks; i--; ) {
2847			set_bit(R5_Wantwrite, &sh->dev[i].flags);
2848			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2849				sh->ops.count++;
2850		}
2851	}
2852
2853	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2854		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2855		/* Need to write out all blocks after computing parity */
2856		sh->disks = conf->raid_disks;
2857		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2858			conf->raid_disks);
2859		s.locked += handle_write_operations5(sh, 1, 1);
2860	} else if (s.expanded &&
2861		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2862		clear_bit(STRIPE_EXPAND_READY, &sh->state);
2863		atomic_dec(&conf->reshape_stripes);
2864		wake_up(&conf->wait_for_overlap);
2865		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2866	}
2867
2868	if (s.expanding && s.locked == 0 &&
2869	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
2870		handle_stripe_expansion(conf, sh, NULL);
2871
2872	if (sh->ops.count)
2873		pending = get_stripe_work(sh);
2874
2875	spin_unlock(&sh->lock);
2876
2877	if (pending)
2878		raid5_run_ops(sh, pending);
2879
2880	return_io(return_bi);
2881
2882}
2883
2884static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2885{
2886	raid6_conf_t *conf = sh->raid_conf;
2887	int disks = sh->disks;
2888	struct bio *return_bi = NULL;
2889	int i, pd_idx = sh->pd_idx;
2890	struct stripe_head_state s;
2891	struct r6_state r6s;
2892	struct r5dev *dev, *pdev, *qdev;
2893
2894	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2895	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
2896		"pd_idx=%d, qd_idx=%d\n",
2897	       (unsigned long long)sh->sector, sh->state,
2898	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2899	memset(&s, 0, sizeof(s));
2900
2901	spin_lock(&sh->lock);
2902	clear_bit(STRIPE_HANDLE, &sh->state);
2903	clear_bit(STRIPE_DELAYED, &sh->state);
2904
2905	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2906	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2907	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2908	/* Now to look around and see what can be done */
2909
2910	rcu_read_lock();
2911	for (i=disks; i--; ) {
2912		mdk_rdev_t *rdev;
2913		dev = &sh->dev[i];
2914		clear_bit(R5_Insync, &dev->flags);
2915
2916		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
2917			i, dev->flags, dev->toread, dev->towrite, dev->written);
2918		/* maybe we can reply to a read */
2919		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
2920			struct bio *rbi, *rbi2;
2921			pr_debug("Return read for disc %d\n", i);
2922			spin_lock_irq(&conf->device_lock);
2923			rbi = dev->toread;
2924			dev->toread = NULL;
2925			if (test_and_clear_bit(R5_Overlap, &dev->flags))
2926				wake_up(&conf->wait_for_overlap);
2927			spin_unlock_irq(&conf->device_lock);
2928			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2929				copy_data(0, rbi, dev->page, dev->sector);
2930				rbi2 = r5_next_bio(rbi, dev->sector);
2931				spin_lock_irq(&conf->device_lock);
2932				if (--rbi->bi_phys_segments == 0) {
2933					rbi->bi_next = return_bi;
2934					return_bi = rbi;
2935				}
2936				spin_unlock_irq(&conf->device_lock);
2937				rbi = rbi2;
2938			}
2939		}
2940
2941		/* now count some things */
2942		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2943		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2944
2945
2946		if (dev->toread)
2947			s.to_read++;
2948		if (dev->towrite) {
2949			s.to_write++;
2950			if (!test_bit(R5_OVERWRITE, &dev->flags))
2951				s.non_overwrite++;
2952		}
2953		if (dev->written)
2954			s.written++;
2955		rdev = rcu_dereference(conf->disks[i].rdev);
2956		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2957			/* The ReadError flag will just be confusing now */
2958			clear_bit(R5_ReadError, &dev->flags);
2959			clear_bit(R5_ReWrite, &dev->flags);
2960		}
2961		if (!rdev || !test_bit(In_sync, &rdev->flags)
2962		    || test_bit(R5_ReadError, &dev->flags)) {
2963			if (s.failed < 2)
2964				r6s.failed_num[s.failed] = i;
2965			s.failed++;
2966		} else
2967			set_bit(R5_Insync, &dev->flags);
2968	}
2969	rcu_read_unlock();
2970	pr_debug("locked=%d uptodate=%d to_read=%d"
2971	       " to_write=%d failed=%d failed_num=%d,%d\n",
2972	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
2973	       r6s.failed_num[0], r6s.failed_num[1]);
2974	/* check if the array has lost >2 devices and, if so, some requests
2975	 * might need to be failed
2976	 */
2977	if (s.failed > 2 && s.to_read+s.to_write+s.written)
2978		handle_requests_to_failed_array(conf, sh, &s, disks,
2979						&return_bi);
2980	if (s.failed > 2 && s.syncing) {
2981		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2982		clear_bit(STRIPE_SYNCING, &sh->state);
2983		s.syncing = 0;
2984	}
2985
2986	/*
2987	 * might be able to return some write requests if the parity blocks
2988	 * are safe, or on a failed drive
2989	 */
2990	pdev = &sh->dev[pd_idx];
2991	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2992		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2993	qdev = &sh->dev[r6s.qd_idx];
2994	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
2995		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
2996
2997	if ( s.written &&
2998	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
2999			     && !test_bit(R5_LOCKED, &pdev->flags)
3000			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3001	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3002			     && !test_bit(R5_LOCKED, &qdev->flags)
3003			     && test_bit(R5_UPTODATE, &qdev->flags)))))
3004		handle_completed_write_requests(conf, sh, disks, &return_bi);
3005
3006	/* Now we might consider reading some blocks, either to check/generate
3007	 * parity, or to satisfy requests
3008	 * or to load a block that is being partially written.
3009	 */
3010	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3011	    (s.syncing && (s.uptodate < disks)) || s.expanding)
3012		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
3013
3014	/* now to consider writing and what else, if anything should be read */
3015	if (s.to_write)
3016		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
3017
3018	/* maybe we need to check and possibly fix the parity for this stripe
3019	 * Any reads will already have been scheduled, so we just see if enough
3020	 * data is available
3021	 */
3022	if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
3023		handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
3024
3025	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3026		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3027		clear_bit(STRIPE_SYNCING, &sh->state);
3028	}
3029
3030	/* If the failed drives are just a ReadError, then we might need
3031	 * to progress the repair/check process
3032	 */
3033	if (s.failed <= 2 && !conf->mddev->ro)
3034		for (i = 0; i < s.failed; i++) {
3035			dev = &sh->dev[r6s.failed_num[i]];
3036			if (test_bit(R5_ReadError, &dev->flags)
3037			    && !test_bit(R5_LOCKED, &dev->flags)
3038			    && test_bit(R5_UPTODATE, &dev->flags)
3039				) {
3040				if (!test_bit(R5_ReWrite, &dev->flags)) {
3041					set_bit(R5_Wantwrite, &dev->flags);
3042					set_bit(R5_ReWrite, &dev->flags);
3043					set_bit(R5_LOCKED, &dev->flags);
3044				} else {
3045					/* let's read it back */
3046					set_bit(R5_Wantread, &dev->flags);
3047					set_bit(R5_LOCKED, &dev->flags);
3048				}
3049			}
3050		}
3051
3052	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3053		/* Need to write out all blocks after computing P&Q */
3054		sh->disks = conf->raid_disks;
3055		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
3056					     conf->raid_disks);
3057		compute_parity6(sh, RECONSTRUCT_WRITE);
3058		for (i = conf->raid_disks ; i-- ;  ) {
3059			set_bit(R5_LOCKED, &sh->dev[i].flags);
3060			s.locked++;
3061			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3062		}
3063		clear_bit(STRIPE_EXPANDING, &sh->state);
3064	} else if (s.expanded) {
3065		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3066		atomic_dec(&conf->reshape_stripes);
3067		wake_up(&conf->wait_for_overlap);
3068		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3069	}
3070
3071	if (s.expanding && s.locked == 0 &&
3072	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
3073		handle_stripe_expansion(conf, sh, &r6s);
3074
3075	spin_unlock(&sh->lock);
3076
3077	return_io(return_bi);
3078
3079	for (i=disks; i-- ;) {
3080		int rw;
3081		struct bio *bi;
3082		mdk_rdev_t *rdev;
3083		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3084			rw = WRITE;
3085		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3086			rw = READ;
3087		else
3088			continue;
3089
3090		bi = &sh->dev[i].req;
3091
3092		bi->bi_rw = rw;
3093		if (rw == WRITE)
3094			bi->bi_end_io = raid5_end_write_request;
3095		else
3096			bi->bi_end_io = raid5_end_read_request;
3097
3098		rcu_read_lock();
3099		rdev = rcu_dereference(conf->disks[i].rdev);
3100		if (rdev && test_bit(Faulty, &rdev->flags))
3101			rdev = NULL;
3102		if (rdev)
3103			atomic_inc(&rdev->nr_pending);
3104		rcu_read_unlock();
3105
3106		if (rdev) {
3107			if (s.syncing || s.expanding || s.expanded)
3108				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3109
3110			bi->bi_bdev = rdev->bdev;
3111			pr_debug("for %llu schedule op %ld on disc %d\n",
3112				(unsigned long long)sh->sector, bi->bi_rw, i);
3113			atomic_inc(&sh->count);
3114			bi->bi_sector = sh->sector + rdev->data_offset;
3115			bi->bi_flags = 1 << BIO_UPTODATE;
3116			bi->bi_vcnt = 1;
3117			bi->bi_max_vecs = 1;
3118			bi->bi_idx = 0;
3119			bi->bi_io_vec = &sh->dev[i].vec;
3120			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3121			bi->bi_io_vec[0].bv_offset = 0;
3122			bi->bi_size = STRIPE_SIZE;
3123			bi->bi_next = NULL;
3124			if (rw == WRITE &&
3125			    test_bit(R5_ReWrite, &sh->dev[i].flags))
3126				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3127			generic_make_request(bi);
3128		} else {
3129			if (rw == WRITE)
3130				set_bit(STRIPE_DEGRADED, &sh->state);
3131			pr_debug("skip op %ld on disc %d for sector %llu\n",
3132				bi->bi_rw, i, (unsigned long long)sh->sector);
3133			clear_bit(R5_LOCKED, &sh->dev[i].flags);
3134			set_bit(STRIPE_HANDLE, &sh->state);
3135		}
3136	}
3137}
3138
3139static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
3140{
3141	if (sh->raid_conf->level == 6)
3142		handle_stripe6(sh, tmp_page);
3143	else
3144		handle_stripe5(sh);
3145}
3146
3147
3148
3149static void raid5_activate_delayed(raid5_conf_t *conf)
3150{
3151	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3152		while (!list_empty(&conf->delayed_list)) {
3153			struct list_head *l = conf->delayed_list.next;
3154			struct stripe_head *sh;
3155			sh = list_entry(l, struct stripe_head, lru);
3156			list_del_init(l);
3157			clear_bit(STRIPE_DELAYED, &sh->state);
3158			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3159				atomic_inc(&conf->preread_active_stripes);
3160			list_add_tail(&sh->lru, &conf->handle_list);
3161		}
3162	}
3163}
3164
3165static void activate_bit_delay(raid5_conf_t *conf)
3166{
3167	/* device_lock is held */
3168	struct list_head head;
3169	list_add(&head, &conf->bitmap_list);
3170	list_del_init(&conf->bitmap_list);
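	/* the add/del pair above transfers the whole bitmap_list onto 'head' */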
3171	while (!list_empty(&head)) {
3172		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3173		list_del_init(&sh->lru);
3174		atomic_inc(&sh->count);
3175		__release_stripe(conf, sh);
3176	}
3177}
3178
3179static void unplug_slaves(mddev_t *mddev)
3180{
3181	raid5_conf_t *conf = mddev_to_conf(mddev);
3182	int i;
3183
3184	rcu_read_lock();
3185	for (i=0; i<mddev->raid_disks; i++) {
3186		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3187		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3188			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3189
3190			atomic_inc(&rdev->nr_pending);
3191			rcu_read_unlock();
3192
3193			blk_unplug(r_queue);
3194
3195			rdev_dec_pending(rdev, mddev);
3196			rcu_read_lock();
3197		}
3198	}
3199	rcu_read_unlock();
3200}
3201
3202static void raid5_unplug_device(struct request_queue *q)
3203{
3204	mddev_t *mddev = q->queuedata;
3205	raid5_conf_t *conf = mddev_to_conf(mddev);
3206	unsigned long flags;
3207
3208	spin_lock_irqsave(&conf->device_lock, flags);
3209
3210	if (blk_remove_plug(q)) {
3211		conf->seq_flush++;
3212		raid5_activate_delayed(conf);
3213	}
3214	md_wakeup_thread(mddev->thread);
3215
3216	spin_unlock_irqrestore(&conf->device_lock, flags);
3217
3218	unplug_slaves(mddev);
3219}
3220
3221static int raid5_congested(void *data, int bits)
3222{
3223	mddev_t *mddev = data;
3224	raid5_conf_t *conf = mddev_to_conf(mddev);
3225
3226	/* No difference between reads and writes.  Just check
3227	 * how busy the stripe_cache is
3228	 */
3229	if (conf->inactive_blocked)
3230		return 1;
3231	if (conf->quiesce)
3232		return 1;
3233	if (list_empty_careful(&conf->inactive_list))
3234		return 1;
3235
3236	return 0;
3237}
3238
3239/* We want read requests to align with chunks where possible,
3240 * but write requests don't need to.
3241 */
3242static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
3243{
3244	mddev_t *mddev = q->queuedata;
3245	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3246	int max;
3247	unsigned int chunk_sectors = mddev->chunk_size >> 9;
3248	unsigned int bio_sectors = bio->bi_size >> 9;
3249
3250	if (bio_data_dir(bio) == WRITE)
3251		return biovec->bv_len; /* always allow writes to be mergeable */
3252
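	/* bytes remaining in this chunk after the sectors already in the bio */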
3253	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3254	if (max < 0) max = 0;
3255	if (max <= biovec->bv_len && bio_sectors == 0)
3256		return biovec->bv_len;
3257	else
3258		return max;
3259}
3260
3261
3262static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3263{
3264	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3265	unsigned int chunk_sectors = mddev->chunk_size >> 9;
3266	unsigned int bio_sectors = bio->bi_size >> 9;
3267
3268	return  chunk_sectors >=
3269		((sector & (chunk_sectors - 1)) + bio_sectors);
3270}
3271
3272/*
3273 *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
3274 *  later sampled by raid5d.
3275 */
3276static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3277{
3278	unsigned long flags;
3279
3280	spin_lock_irqsave(&conf->device_lock, flags);
3281
3282	bi->bi_next = conf->retry_read_aligned_list;
3283	conf->retry_read_aligned_list = bi;
3284
3285	spin_unlock_irqrestore(&conf->device_lock, flags);
3286	md_wakeup_thread(conf->mddev->thread);
3287}
3288
3289
3290static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3291{
3292	struct bio *bi;
3293
3294	bi = conf->retry_read_aligned;
3295	if (bi) {
3296		conf->retry_read_aligned = NULL;
3297		return bi;
3298	}
3299	bi = conf->retry_read_aligned_list;
3300	if(bi) {
3301		conf->retry_read_aligned_list = bi->bi_next;
3302		bi->bi_next = NULL;
3303		bi->bi_phys_segments = 1; /* biased count of active stripes */
3304		bi->bi_hw_segments = 0; /* count of processed stripes */
3305	}
3306
3307	return bi;
3308}
3309
3310
3311/*
3312 *  The "raid5_align_endio" should check if the read succeeded and if it
3313 *  did, call bio_endio on the original bio (having bio_put the new bio
3314 *  first).
3315 *  If the read failed, queue the original bio for retry via raid5d.
3316 */
3317static void raid5_align_endio(struct bio *bi, int error)
3318{
3319	struct bio* raid_bi  = bi->bi_private;
3320	mddev_t *mddev;
3321	raid5_conf_t *conf;
3322	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3323	mdk_rdev_t *rdev;
3324
3325	bio_put(bi);
3326
3327	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3328	conf = mddev_to_conf(mddev);
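	/* chunk_aligned_read() stashed the target rdev in bi_next; recover it */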
3329	rdev = (void*)raid_bi->bi_next;
3330	raid_bi->bi_next = NULL;
3331
3332	rdev_dec_pending(rdev, conf->mddev);
3333
3334	if (!error && uptodate) {
3335		bio_endio(raid_bi, 0);
3336		if (atomic_dec_and_test(&conf->active_aligned_reads))
3337			wake_up(&conf->wait_for_stripe);
3338		return;
3339	}
3340
3341
3342	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3343
3344	add_bio_to_retry(raid_bi, conf);
3345}
3346
3347static int bio_fits_rdev(struct bio *bi)
3348{
3349	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3350
3351	if ((bi->bi_size>>9) > q->max_sectors)
3352		return 0;
3353	blk_recount_segments(q, bi);
3354	if (bi->bi_phys_segments > q->max_phys_segments ||
3355	    bi->bi_hw_segments > q->max_hw_segments)
3356		return 0;
3357
3358	if (q->merge_bvec_fn)
3359		/* it's too hard to apply the merge_bvec_fn at this stage,
3360		 * so just give up
3361		 */
3362		return 0;
3363
3364	return 1;
3365}
3366
3367
3368static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3369{
3370	mddev_t *mddev = q->queuedata;
3371	raid5_conf_t *conf = mddev_to_conf(mddev);
3372	const unsigned int raid_disks = conf->raid_disks;
3373	const unsigned int data_disks = raid_disks - conf->max_degraded;
3374	unsigned int dd_idx, pd_idx;
3375	struct bio* align_bi;
3376	mdk_rdev_t *rdev;
3377
3378	if (!in_chunk_boundary(mddev, raid_bio)) {
3379		pr_debug("chunk_aligned_read : non aligned\n");
3380		return 0;
3381	}
3382	/*
3383 	 * use bio_clone to make a copy of the bio
3384	 */
3385	align_bi = bio_clone(raid_bio, GFP_NOIO);
3386	if (!align_bi)
3387		return 0;
3388	/*
3389	 *   set bi_end_io to a new function, and set bi_private to the
3390	 *     original bio.
3391	 */
3392	align_bi->bi_end_io  = raid5_align_endio;
3393	align_bi->bi_private = raid_bio;
3394	/*
3395	 *	compute position
3396	 */
3397	align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
3398					raid_disks,
3399					data_disks,
3400					&dd_idx,
3401					&pd_idx,
3402					conf);
3403
3404	rcu_read_lock();
3405	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3406	if (rdev && test_bit(In_sync, &rdev->flags)) {
3407		atomic_inc(&rdev->nr_pending);
3408		rcu_read_unlock();
3409		raid_bio->bi_next = (void*)rdev;
3410		align_bi->bi_bdev =  rdev->bdev;
3411		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3412		align_bi->bi_sector += rdev->data_offset;
3413
3414		if (!bio_fits_rdev(align_bi)) {
3415			/* too big in some way */
3416			bio_put(align_bi);
3417			rdev_dec_pending(rdev, mddev);
3418			return 0;
3419		}
3420
3421		spin_lock_irq(&conf->device_lock);
3422		wait_event_lock_irq(conf->wait_for_stripe,
3423				    conf->quiesce == 0,
3424				    conf->device_lock, /* nothing */);
3425		atomic_inc(&conf->active_aligned_reads);
3426		spin_unlock_irq(&conf->device_lock);
3427
3428		generic_make_request(align_bi);
3429		return 1;
3430	} else {
3431		rcu_read_unlock();
3432		bio_put(align_bi);
3433		return 0;
3434	}
3435}
3436
3437
3438static int make_request(struct request_queue *q, struct bio * bi)
3439{
3440	mddev_t *mddev = q->queuedata;
3441	raid5_conf_t *conf = mddev_to_conf(mddev);
3442	unsigned int dd_idx, pd_idx;
3443	sector_t new_sector;
3444	sector_t logical_sector, last_sector;
3445	struct stripe_head *sh;
3446	const int rw = bio_data_dir(bi);
3447	int remaining;
3448
3449	if (unlikely(bio_barrier(bi))) {
3450		bio_endio(bi, -EOPNOTSUPP);
3451		return 0;
3452	}
3453
3454	md_write_start(mddev, bi);
3455
3456	disk_stat_inc(mddev->gendisk, ios[rw]);
3457	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
3458
3459	if (rw == READ &&
3460	     mddev->reshape_position == MaxSector &&
3461	     chunk_aligned_read(q,bi))
3462			return 0;
3463
3464	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3465	last_sector = bi->bi_sector + (bi->bi_size>>9);
3466	bi->bi_next = NULL;
3467	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
3468
3469	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3470		DEFINE_WAIT(w);
3471		int disks, data_disks;
3472
3473	retry:
3474		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3475		if (likely(conf->expand_progress == MaxSector))
3476			disks = conf->raid_disks;
3477		else {
3478			/* spinlock is needed as expand_progress may be
3479			 * 64bit on a 32bit platform, and so it might be
3480			 * possible to see a half-updated value.
3481			 * Of course expand_progress could change after
3482			 * the lock is dropped, so once we get a reference
3483			 * to the stripe that we think it is, we will have
3484			 * to check again.
3485			 */
3486			spin_lock_irq(&conf->device_lock);
3487			disks = conf->raid_disks;
3488			if (logical_sector >= conf->expand_progress)
3489				disks = conf->previous_raid_disks;
3490			else {
3491				if (logical_sector >= conf->expand_lo) {
3492					spin_unlock_irq(&conf->device_lock);
3493					schedule();
3494					goto retry;
3495				}
3496			}
3497			spin_unlock_irq(&conf->device_lock);
3498		}
3499		data_disks = disks - conf->max_degraded;
3500
3501 		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
3502						  &dd_idx, &pd_idx, conf);
3503		pr_debug("raid5: make_request, sector %llu logical %llu\n",
3504			(unsigned long long)new_sector,
3505			(unsigned long long)logical_sector);
3506
3507		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
3508		if (sh) {
3509			if (unlikely(conf->expand_progress != MaxSector)) {
3510				/* expansion might have moved on while waiting for a
3511				 * stripe, so we must do the range check again.
3512				 * Expansion could still move past after this
3513				 * test, but as we are holding a reference to
3514				 * 'sh', we know that if that happens,
3515				 *  STRIPE_EXPANDING will get set and the expansion
3516				 * won't proceed until we finish with the stripe.
3517				 */
3518				int must_retry = 0;
3519				spin_lock_irq(&conf->device_lock);
3520				if (logical_sector <  conf->expand_progress &&
3521				    disks == conf->previous_raid_disks)
3522					/* mismatch, need to try again */
3523					must_retry = 1;
3524				spin_unlock_irq(&conf->device_lock);
3525				if (must_retry) {
3526					release_stripe(sh);
3527					goto retry;
3528				}
3529			}
3530			/* FIXME: what if we get a false positive because these
3531			 * are being updated?
3532			 */
3533			if (logical_sector >= mddev->suspend_lo &&
3534			    logical_sector < mddev->suspend_hi) {
3535				release_stripe(sh);
3536				schedule();
3537				goto retry;
3538			}
3539
3540			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3541			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
3542				/* Stripe is busy expanding or
3543				 * add failed due to overlap.  Flush everything
3544				 * and wait a while
3545				 */
3546				raid5_unplug_device(mddev->queue);
3547				release_stripe(sh);
3548				schedule();
3549				goto retry;
3550			}
3551			finish_wait(&conf->wait_for_overlap, &w);
3552			handle_stripe(sh, NULL);
3553			release_stripe(sh);
3554		} else {
3555			/* cannot get stripe for read-ahead, just give-up */
3556			/* cannot get stripe for read-ahead, just give up */
3557			finish_wait(&conf->wait_for_overlap, &w);
3558			break;
3559		}
3560
3561	}
3562	spin_lock_irq(&conf->device_lock);
3563	remaining = --bi->bi_phys_segments;
3564	spin_unlock_irq(&conf->device_lock);
3565	if (remaining == 0) {
3566
3567		if ( rw == WRITE )
3568			md_write_end(mddev);
3569
3570		bi->bi_end_io(bi,
3571			      test_bit(BIO_UPTODATE, &bi->bi_flags)
3572			        ? 0 : -EIO);
3573	}
3574	return 0;
3575}
3576
3577static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
3578{
3579	/* reshaping is quite different to recovery/resync so it is
3580	 * handled quite separately ... here.
3581	 *
3582	 * On each call to sync_request, we gather one chunk worth of
3583	 * destination stripes and flag them as expanding.
3584	 * Then we find all the source stripes and request reads.
3585	 * As the reads complete, handle_stripe will copy the data
3586	 * into the destination stripe and release that stripe.
3587	 */
3588	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3589	struct stripe_head *sh;
3590	int pd_idx;
3591	sector_t first_sector, last_sector;
3592	int raid_disks = conf->previous_raid_disks;
3593	int data_disks = raid_disks - conf->max_degraded;
3594	int new_data_disks = conf->raid_disks - conf->max_degraded;
3595	int i;
3596	int dd_idx;
3597	sector_t writepos, safepos, gap;
3598
3599	if (sector_nr == 0 &&
3600	    conf->expand_progress != 0) {
3601		/* restarting in the middle, skip the initial sectors */
3602		sector_nr = conf->expand_progress;
3603		sector_div(sector_nr, new_data_disks);
3604		*skipped = 1;
3605		return sector_nr;
3606	}
3607
3608	/* We update the metadata when there is more than 3Meg in the
3609	 * not-yet-recorded block range (that is rather arbitrary and
3610	 * should probably be time based), or when the data about to be
3611	 * copied would over-write the source of the data at the front
3612	 * of the range; i.e. when the point one new-layout stripe
3613	 * forward from expand_progress maps to after the point that
3614	 * expand_lo maps to in the old layout.
3615	 */
3616	writepos = conf->expand_progress +
3617		conf->chunk_size/512*(new_data_disks);
3618	sector_div(writepos, new_data_disks);
3619	safepos = conf->expand_lo;
3620	sector_div(safepos, data_disks);
3621	gap = conf->expand_progress - conf->expand_lo;
3622
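	/* Roughly (an illustrative reading of the code above): after the
	 * sector_div() calls, writepos is the per-device offset one chunk
	 * beyond the next destination write in the new layout, and safepos
	 * is the per-device offset that expand_lo maps to in the old
	 * layout; gap is the span of array sectors reshaped since the last
	 * metadata update, and new_data_disks*3000*2 sectors is roughly
	 * 3MB per data disk.
	 */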
3623	if (writepos >= safepos ||
3624	    gap > (new_data_disks)*3000*2 /*3Meg*/) {
3625		/* Cannot proceed until we've updated the superblock... */
3626		wait_event(conf->wait_for_overlap,
3627			   atomic_read(&conf->reshape_stripes)==0);
3628		mddev->reshape_position = conf->expand_progress;
3629		set_bit(MD_CHANGE_DEVS, &mddev->flags);
3630		md_wakeup_thread(mddev->thread);
3631		wait_event(mddev->sb_wait, mddev->flags == 0 ||
3632			   kthread_should_stop());
3633		spin_lock_irq(&conf->device_lock);
3634		conf->expand_lo = mddev->reshape_position;
3635		spin_unlock_irq(&conf->device_lock);
3636		wake_up(&conf->wait_for_overlap);
3637	}
3638
3639	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
3640		int j;
3641		int skipped = 0;
3642		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
3643		sh = get_active_stripe(conf, sector_nr+i,
3644				       conf->raid_disks, pd_idx, 0);
3645		set_bit(STRIPE_EXPANDING, &sh->state);
3646		atomic_inc(&conf->reshape_stripes);
3647		/* If any of this stripe is beyond the end of the old
3648		 * array, then we need to zero those blocks
3649		 */
3650		for (j=sh->disks; j--;) {
3651			sector_t s;
3652			if (j == sh->pd_idx)
3653				continue;
3654			if (conf->level == 6 &&
3655			    j == raid6_next_disk(sh->pd_idx, sh->disks))
3656				continue;
3657			s = compute_blocknr(sh, j);
3658			if (s < (mddev->array_size<<1)) {
3659				skipped = 1;
3660				continue;
3661			}
3662			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
3663			set_bit(R5_Expanded, &sh->dev[j].flags);
3664			set_bit(R5_UPTODATE, &sh->dev[j].flags);
3665		}
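		/* Only a stripe lying entirely beyond the end of the old
		 * array can be completed here (it is all zeroes); anything
		 * else must wait for the source reads scheduled below.
		 */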
3666		if (!skipped) {
3667			set_bit(STRIPE_EXPAND_READY, &sh->state);
3668			set_bit(STRIPE_HANDLE, &sh->state);
3669		}
3670		release_stripe(sh);
3671	}
3672	spin_lock_irq(&conf->device_lock);
3673	conf->expand_progress = (sector_nr + i) * new_data_disks;
3674	spin_unlock_irq(&conf->device_lock);
3675	/* Ok, those stripes are ready. We can start scheduling
3676	 * reads on the source stripes.
3677	 * The source stripes are determined by mapping the first and last
3678	 * block on the destination stripes.
3679	 */
3680	first_sector =
3681		raid5_compute_sector(sector_nr*(new_data_disks),
3682				     raid_disks, data_disks,
3683				     &dd_idx, &pd_idx, conf);
3684	last_sector =
3685		raid5_compute_sector((sector_nr+conf->chunk_size/512)
3686				     *(new_data_disks) -1,
3687				     raid_disks, data_disks,
3688				     &dd_idx, &pd_idx, conf);
3689	if (last_sector >= (mddev->size<<1))
3690		last_sector = (mddev->size<<1)-1;
3691	while (first_sector <= last_sector) {
3692		pd_idx = stripe_to_pdidx(first_sector, conf,
3693					 conf->previous_raid_disks);
3694		sh = get_active_stripe(conf, first_sector,
3695				       conf->previous_raid_disks, pd_idx, 0);
3696		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3697		set_bit(STRIPE_HANDLE, &sh->state);
3698		release_stripe(sh);
3699		first_sector += STRIPE_SECTORS;
3700	}
3701	/* If this takes us to the resync_max point where we have to pause,
3702	 * then we need to write out the superblock.
3703	 */
3704	sector_nr += conf->chunk_size>>9;
3705	if (sector_nr >= mddev->resync_max) {
3706		/* Cannot proceed until we've updated the superblock... */
3707		wait_event(conf->wait_for_overlap,
3708			   atomic_read(&conf->reshape_stripes) == 0);
3709		mddev->reshape_position = conf->expand_progress;
3710		set_bit(MD_CHANGE_DEVS, &mddev->flags);
3711		md_wakeup_thread(mddev->thread);
3712		wait_event(mddev->sb_wait,
3713			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3714			   || kthread_should_stop());
3715		spin_lock_irq(&conf->device_lock);
3716		conf->expand_lo = mddev->reshape_position;
3717		spin_unlock_irq(&conf->device_lock);
3718		wake_up(&conf->wait_for_overlap);
3719	}
3720	return conf->chunk_size>>9;
3721}
3722
3723/* FIXME go_faster isn't used */
3724static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
3725{
3726	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3727	struct stripe_head *sh;
3728	int pd_idx;
3729	int raid_disks = conf->raid_disks;
3730	sector_t max_sector = mddev->size << 1;
3731	int sync_blocks;
3732	int still_degraded = 0;
3733	int i;
3734
3735	if (sector_nr >= max_sector) {
3736		/* just being told to finish up .. nothing much to do */
3737		unplug_slaves(mddev);
3738		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3739			end_reshape(conf);
3740			return 0;
3741		}
3742
3743		if (mddev->curr_resync < max_sector) /* aborted */
3744			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3745					&sync_blocks, 1);
3746		else /* completed sync */
3747			conf->fullsync = 0;
3748		bitmap_close_sync(mddev->bitmap);
3749
3750		return 0;
3751	}
3752
3753	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3754		return reshape_request(mddev, sector_nr, skipped);
3755
3756	/* No need to check resync_max as we never do more than one
3757	 * stripe, and as resync_max will always be on a chunk boundary,
3758	 * if the check in md_do_sync didn't fire, there is no chance
3759	 * of overstepping resync_max here
3760	 */
3761
3762	/* if there are too many failed drives and we are trying
3763	 * to resync, then assert that we are finished, because there is
3764	 * nothing we can do.
3765	 */
3766	if (mddev->degraded >= conf->max_degraded &&
3767	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3768		sector_t rv = (mddev->size << 1) - sector_nr;
3769		*skipped = 1;
3770		return rv;
3771	}
3772	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
3773	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3774	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
3775		/* we can skip this block, and probably more */
3776		sync_blocks /= STRIPE_SECTORS;
3777		*skipped = 1;
3778		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
3779	}
3780
3781
3782	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3783
3784	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
3785	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3786	if (sh == NULL) {
3787		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
3788		/* make sure we don't swamp the stripe cache if someone else
3789		 * is trying to get access
3790		 */
3791		schedule_timeout_uninterruptible(1);
3792	}
3793	/* Need to check if array will still be degraded after recovery/resync.
3794	 * We don't need to check the 'failed' flag as when that gets set,
3795	 * recovery aborts.
3796	 */
3797	for (i=0; i<mddev->raid_disks; i++)
3798		if (conf->disks[i].rdev == NULL)
3799			still_degraded = 1;
3800
3801	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
3802
3803	spin_lock(&sh->lock);
3804	set_bit(STRIPE_SYNCING, &sh->state);
3805	clear_bit(STRIPE_INSYNC, &sh->state);
3806	spin_unlock(&sh->lock);
3807
3808	handle_stripe(sh, NULL);
3809	release_stripe(sh);
3810
3811	return STRIPE_SECTORS;
3812}
3813
3814static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3815{
3816	/* We may not be able to submit a whole bio at once as there
3817	 * may not be enough stripe_heads available.
3818	 * We cannot pre-allocate enough stripe_heads as we may need
3819	 * more than exist in the cache (if we allow ever-larger chunks).
3820	 * So we do one stripe head at a time and record in
3821	 * ->bi_hw_segments how many have been done.
3822	 *
3823	 * We *know* that this entire raid_bio is in one chunk, so
3824	 * it will use only one 'dd_idx' and need only one call to raid5_compute_sector.
3825	 */
3826	struct stripe_head *sh;
3827	int dd_idx, pd_idx;
3828	sector_t sector, logical_sector, last_sector;
3829	int scnt = 0;
3830	int remaining;
3831	int handled = 0;
3832
3833	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3834	sector = raid5_compute_sector(	logical_sector,
3835					conf->raid_disks,
3836					conf->raid_disks - conf->max_degraded,
3837					&dd_idx,
3838					&pd_idx,
3839					conf);
3840	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3841
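	/* ->bi_hw_segments is used as a cursor here: it records how many
	 * stripes of this bio were completed on earlier attempts, so a
	 * retry resumes where it left off.
	 */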
3842	for (; logical_sector < last_sector;
3843	     logical_sector += STRIPE_SECTORS,
3844		     sector += STRIPE_SECTORS,
3845		     scnt++) {
3846
3847		if (scnt < raid_bio->bi_hw_segments)
3848			/* already done this stripe */
3849			continue;
3850
3851		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
3852
3853		if (!sh) {
3854			/* failed to get a stripe - must wait */
3855			raid_bio->bi_hw_segments = scnt;
3856			conf->retry_read_aligned = raid_bio;
3857			return handled;
3858		}
3859
3860		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
3861		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
3862			release_stripe(sh);
3863			raid_bio->bi_hw_segments = scnt;
3864			conf->retry_read_aligned = raid_bio;
3865			return handled;
3866		}
3867
3868		handle_stripe(sh, NULL);
3869		release_stripe(sh);
3870		handled++;
3871	}
3872	spin_lock_irq(&conf->device_lock);
3873	remaining = --raid_bio->bi_phys_segments;
3874	spin_unlock_irq(&conf->device_lock);
3875	if (remaining == 0) {
3876
3877		raid_bio->bi_end_io(raid_bio,
3878			      test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
3879			        ? 0 : -EIO);
3880	}
3881	if (atomic_dec_and_test(&conf->active_aligned_reads))
3882		wake_up(&conf->wait_for_stripe);
3883	return handled;
3884}
3885
3886
3887
3888/*
3889 * This is our raid5 kernel thread.
3890 *
3891 * We scan the hash table for stripes which can be handled now.
3892 * During the scan, completed stripes are saved for us by the interrupt
3893 * handler, so that they will not have to wait for our next wakeup.
3894 */
3895static void raid5d (mddev_t *mddev)
3896{
3897	struct stripe_head *sh;
3898	raid5_conf_t *conf = mddev_to_conf(mddev);
3899	int handled;
3900
3901	pr_debug("+++ raid5d active\n");
3902
3903	md_check_recovery(mddev);
3904
3905	handled = 0;
3906	spin_lock_irq(&conf->device_lock);
3907	while (1) {
3908		struct list_head *first;
3909		struct bio *bio;
3910
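		/* A bitmap batch has been closed (seq_flush advanced) but not
		 * yet written out; flush it and then activate the stripes
		 * that were delayed waiting for it.
		 */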
3911		if (conf->seq_flush != conf->seq_write) {
3912			int seq = conf->seq_flush;
3913			spin_unlock_irq(&conf->device_lock);
3914			bitmap_unplug(mddev->bitmap);
3915			spin_lock_irq(&conf->device_lock);
3916			conf->seq_write = seq;
3917			activate_bit_delay(conf);
3918		}
3919
3920		if (list_empty(&conf->handle_list) &&
3921		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
3922		    !blk_queue_plugged(mddev->queue) &&
3923		    !list_empty(&conf->delayed_list))
3924			raid5_activate_delayed(conf);
3925
3926		while ((bio = remove_bio_from_retry(conf))) {
3927			int ok;
3928			spin_unlock_irq(&conf->device_lock);
3929			ok = retry_aligned_read(conf, bio);
3930			spin_lock_irq(&conf->device_lock);
3931			if (!ok)
3932				break;
3933			handled++;
3934		}
3935
3936		if (list_empty(&conf->handle_list)) {
3937			async_tx_issue_pending_all();
3938			break;
3939		}
3940
3941		first = conf->handle_list.next;
3942		sh = list_entry(first, struct stripe_head, lru);
3943
3944		list_del_init(first);
3945		atomic_inc(&sh->count);
3946		BUG_ON(atomic_read(&sh->count)!= 1);
3947		spin_unlock_irq(&conf->device_lock);
3948
3949		handled++;
3950		handle_stripe(sh, conf->spare_page);
3951		release_stripe(sh);
3952
3953		spin_lock_irq(&conf->device_lock);
3954	}
3955	pr_debug("%d stripes handled\n", handled);
3956
3957	spin_unlock_irq(&conf->device_lock);
3958
3959	unplug_slaves(mddev);
3960
3961	pr_debug("--- raid5d inactive\n");
3962}
3963
3964static ssize_t
3965raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
3966{
3967	raid5_conf_t *conf = mddev_to_conf(mddev);
3968	if (conf)
3969		return sprintf(page, "%d\n", conf->max_nr_stripes);
3970	else
3971		return 0;
3972}
3973
3974static ssize_t
3975raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3976{
3977	raid5_conf_t *conf = mddev_to_conf(mddev);
3978	char *end;
3979	int new;
3980	if (len >= PAGE_SIZE)
3981		return -EINVAL;
3982	if (!conf)
3983		return -ENODEV;
3984
3985	new = simple_strtoul(page, &end, 10);
3986	if (!*page || (*end && *end != '\n') )
3987		return -EINVAL;
3988	if (new <= 16 || new > 32768)
3989		return -EINVAL;
3990	while (new < conf->max_nr_stripes) {
3991		if (drop_one_stripe(conf))
3992			conf->max_nr_stripes--;
3993		else
3994			break;
3995	}
3996	md_allow_write(mddev);
3997	while (new > conf->max_nr_stripes) {
3998		if (grow_one_stripe(conf))
3999			conf->max_nr_stripes++;
4000		else break;
4001	}
4002	return len;
4003}
4004
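/*
 * stripe_cache_size is exposed through sysfs; for example (the device
 * name below is only illustrative):
 *
 *   # cat /sys/block/md0/md/stripe_cache_size
 *   256
 *   # echo 4096 > /sys/block/md0/md/stripe_cache_size
 *
 * The store method above rejects values of 16 or less and values
 * above 32768.
 */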
4005static struct md_sysfs_entry
4006raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4007				raid5_show_stripe_cache_size,
4008				raid5_store_stripe_cache_size);
4009
4010static ssize_t
4011stripe_cache_active_show(mddev_t *mddev, char *page)
4012{
4013	raid5_conf_t *conf = mddev_to_conf(mddev);
4014	if (conf)
4015		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4016	else
4017		return 0;
4018}
4019
4020static struct md_sysfs_entry
4021raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4022
4023static struct attribute *raid5_attrs[] =  {
4024	&raid5_stripecache_size.attr,
4025	&raid5_stripecache_active.attr,
4026	NULL,
4027};
4028static struct attribute_group raid5_attrs_group = {
4029	.name = NULL,
4030	.attrs = raid5_attrs,
4031};
4032
4033static int run(mddev_t *mddev)
4034{
4035	raid5_conf_t *conf;
4036	int raid_disk, memory;
4037	mdk_rdev_t *rdev;
4038	struct disk_info *disk;
4039	struct list_head *tmp;
4040	int working_disks = 0;
4041
4042	if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
4043		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4044		       mdname(mddev), mddev->level);
4045		return -EIO;
4046	}
4047
4048	if (mddev->reshape_position != MaxSector) {
4049		/* Check that we can continue the reshape.
4050		 * Currently only the number of disks can change; it must
4051		 * increase, and we must be past the point where
4052		 * a stripe over-writes itself.
4053		 */
4054		sector_t here_new, here_old;
4055		int old_disks;
4056		int max_degraded = (mddev->level == 5 ? 1 : 2);
4057
4058		if (mddev->new_level != mddev->level ||
4059		    mddev->new_layout != mddev->layout ||
4060		    mddev->new_chunk != mddev->chunk_size) {
4061			printk(KERN_ERR "raid5: %s: unsupported reshape "
4062			       "required - aborting.\n",
4063			       mdname(mddev));
4064			return -EINVAL;
4065		}
4066		if (mddev->delta_disks <= 0) {
4067			printk(KERN_ERR "raid5: %s: unsupported reshape "
4068			       "(reduce disks) required - aborting.\n",
4069			       mdname(mddev));
4070			return -EINVAL;
4071		}
4072		old_disks = mddev->raid_disks - mddev->delta_disks;
4073		/* reshape_position must be on a new-stripe boundary, and the
4074		 * stripe we will write to (new geometry) must lie before the
4075		 * first stripe we might still read from (old geometry).
4076		 */
4077		here_new = mddev->reshape_position;
4078		if (sector_div(here_new, (mddev->chunk_size>>9)*
4079			       (mddev->raid_disks - max_degraded))) {
4080			printk(KERN_ERR "raid5: reshape_position not "
4081			       "on a stripe boundary\n");
4082			return -EINVAL;
4083		}
4084		/* here_new is the stripe we will write to */
4085		here_old = mddev->reshape_position;
4086		sector_div(here_old, (mddev->chunk_size>>9)*
4087			   (old_disks-max_degraded));
4088		/* here_old is the first stripe that we might need to read
4089		 * from */
4090		if (here_new >= here_old) {
4091			/* Reading from the same stripe as writing to - bad */
4092			printk(KERN_ERR "raid5: reshape_position too early for "
4093			       "auto-recovery - aborting.\n");
4094			return -EINVAL;
4095		}
4096		printk(KERN_INFO "raid5: reshape will continue\n");
4097		/* OK, we should be able to continue; */
4098	}
4099
4100
4101	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
4102	if ((conf = mddev->private) == NULL)
4103		goto abort;
4104	if (mddev->reshape_position == MaxSector) {
4105		conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
4106	} else {
4107		conf->raid_disks = mddev->raid_disks;
4108		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4109	}
4110
4111	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4112			      GFP_KERNEL);
4113	if (!conf->disks)
4114		goto abort;
4115
4116	conf->mddev = mddev;
4117
4118	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4119		goto abort;
4120
4121	if (mddev->level == 6) {
4122		conf->spare_page = alloc_page(GFP_KERNEL);
4123		if (!conf->spare_page)
4124			goto abort;
4125	}
4126	spin_lock_init(&conf->device_lock);
4127	init_waitqueue_head(&conf->wait_for_stripe);
4128	init_waitqueue_head(&conf->wait_for_overlap);
4129	INIT_LIST_HEAD(&conf->handle_list);
4130	INIT_LIST_HEAD(&conf->delayed_list);
4131	INIT_LIST_HEAD(&conf->bitmap_list);
4132	INIT_LIST_HEAD(&conf->inactive_list);
4133	atomic_set(&conf->active_stripes, 0);
4134	atomic_set(&conf->preread_active_stripes, 0);
4135	atomic_set(&conf->active_aligned_reads, 0);
4136
4137	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4138
4139	rdev_for_each(rdev, tmp, mddev) {
4140		raid_disk = rdev->raid_disk;
4141		if (raid_disk >= conf->raid_disks
4142		    || raid_disk < 0)
4143			continue;
4144		disk = conf->disks + raid_disk;
4145
4146		disk->rdev = rdev;
4147
4148		if (test_bit(In_sync, &rdev->flags)) {
4149			char b[BDEVNAME_SIZE];
4150			printk(KERN_INFO "raid5: device %s operational as raid"
4151				" disk %d\n", bdevname(rdev->bdev,b),
4152				raid_disk);
4153			working_disks++;
4154		}
4155	}
4156
4157	/*
4158	 * 0 for a fully functional array, 1 or 2 for a degraded array.
4159	 */
4160	mddev->degraded = conf->raid_disks - working_disks;
4161	conf->mddev = mddev;
4162	conf->chunk_size = mddev->chunk_size;
4163	conf->level = mddev->level;
4164	if (conf->level == 6)
4165		conf->max_degraded = 2;
4166	else
4167		conf->max_degraded = 1;
4168	conf->algorithm = mddev->layout;
4169	conf->max_nr_stripes = NR_STRIPES;
4170	conf->expand_progress = mddev->reshape_position;
4171
4172	/* device size must be a multiple of chunk size */
4173	mddev->size &= ~(mddev->chunk_size/1024 -1);
4174	mddev->resync_max_sectors = mddev->size << 1;
4175
4176	if (conf->level == 6 && conf->raid_disks < 4) {
4177		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4178		       mdname(mddev), conf->raid_disks);
4179		goto abort;
4180	}
4181	if (!conf->chunk_size || conf->chunk_size % 4) {
4182		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4183			conf->chunk_size, mdname(mddev));
4184		goto abort;
4185	}
4186	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
4187		printk(KERN_ERR
4188			"raid5: unsupported parity algorithm %d for %s\n",
4189			conf->algorithm, mdname(mddev));
4190		goto abort;
4191	}
4192	if (mddev->degraded > conf->max_degraded) {
4193		printk(KERN_ERR "raid5: not enough operational devices for %s"
4194			" (%d/%d failed)\n",
4195			mdname(mddev), mddev->degraded, conf->raid_disks);
4196		goto abort;
4197	}
4198
4199	if (mddev->degraded > 0 &&
4200	    mddev->recovery_cp != MaxSector) {
4201		if (mddev->ok_start_degraded)
4202			printk(KERN_WARNING
4203			       "raid5: starting dirty degraded array: %s"
4204			       " - data corruption possible.\n",
4205			       mdname(mddev));
4206		else {
4207			printk(KERN_ERR
4208			       "raid5: cannot start dirty degraded array for %s\n",
4209			       mdname(mddev));
4210			goto abort;
4211		}
4212	}
4213
4214	{
4215		mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4216		if (!mddev->thread) {
4217			printk(KERN_ERR
4218				"raid5: couldn't allocate thread for %s\n",
4219				mdname(mddev));
4220			goto abort;
4221		}
4222	}
4223	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4224		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4225	if (grow_stripes(conf, conf->max_nr_stripes)) {
4226		printk(KERN_ERR
4227			"raid5: couldn't allocate %dkB for buffers\n", memory);
4228		shrink_stripes(conf);
4229		md_unregister_thread(mddev->thread);
4230		goto abort;
4231	} else
4232		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4233			memory, mdname(mddev));
4234
4235	if (mddev->degraded == 0)
4236		printk("raid5: raid level %d set %s active with %d out of %d"
4237			" devices, algorithm %d\n", conf->level, mdname(mddev),
4238			mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4239			conf->algorithm);
4240	else
4241		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4242			" out of %d devices, algorithm %d\n", conf->level,
4243			mdname(mddev), mddev->raid_disks - mddev->degraded,
4244			mddev->raid_disks, conf->algorithm);
4245
4246	print_raid5_conf(conf);
4247
4248	if (conf->expand_progress != MaxSector) {
4249		printk("...ok start reshape thread\n");
4250		conf->expand_lo = conf->expand_progress;
4251		atomic_set(&conf->reshape_stripes, 0);
4252		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4253		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4254		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4255		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4256		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4257							"%s_reshape");
4258	}
4259
4260	/* read-ahead size must cover two whole stripes, which is
4261	 * 2 * (datadisks) * chunksize, where datadisks = raid_disks - max_degraded
4262	 */
4263	{
4264		int data_disks = conf->previous_raid_disks - conf->max_degraded;
4265		int stripe = data_disks *
4266			(mddev->chunk_size / PAGE_SIZE);
4267		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4268			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4269	}
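	/* For example (illustrative numbers, assuming 4KiB pages): a 64KiB
	 * chunk with 4 data disks gives stripe = 4 * 16 = 64 pages, so
	 * read-ahead is raised to at least 128 pages (512KiB).
	 */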
4270
4271	/* Ok, everything is just fine now */
4272	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4273		printk(KERN_WARNING
4274		       "raid5: failed to create sysfs attributes for %s\n",
4275		       mdname(mddev));
4276
4277	mddev->queue->unplug_fn = raid5_unplug_device;
4278	mddev->queue->backing_dev_info.congested_data = mddev;
4279	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4280
4281	mddev->array_size =  mddev->size * (conf->previous_raid_disks -
4282					    conf->max_degraded);
4283
4284	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4285
4286	return 0;
4287abort:
4288	if (conf) {
4289		print_raid5_conf(conf);
4290		safe_put_page(conf->spare_page);
4291		kfree(conf->disks);
4292		kfree(conf->stripe_hashtbl);
4293		kfree(conf);
4294	}
4295	mddev->private = NULL;
4296	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
4297	return -EIO;
4298}
4299
4300
4301
4302static int stop(mddev_t *mddev)
4303{
4304	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4305
4306	md_unregister_thread(mddev->thread);
4307	mddev->thread = NULL;
4308	shrink_stripes(conf);
4309	kfree(conf->stripe_hashtbl);
4310	mddev->queue->backing_dev_info.congested_fn = NULL;
4311	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4312	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4313	kfree(conf->disks);
4314	kfree(conf);
4315	mddev->private = NULL;
4316	return 0;
4317}
4318
4319#ifdef DEBUG
4320static void print_sh (struct seq_file *seq, struct stripe_head *sh)
4321{
4322	int i;
4323
4324	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
4325		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
4326	seq_printf(seq, "sh %llu,  count %d.\n",
4327		   (unsigned long long)sh->sector, atomic_read(&sh->count));
4328	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
4329	for (i = 0; i < sh->disks; i++) {
4330		seq_printf(seq, "(cache%d: %p %ld) ",
4331			   i, sh->dev[i].page, sh->dev[i].flags);
4332	}
4333	seq_printf(seq, "\n");
4334}
4335
4336static void printall (struct seq_file *seq, raid5_conf_t *conf)
4337{
4338	struct stripe_head *sh;
4339	struct hlist_node *hn;
4340	int i;
4341
4342	spin_lock_irq(&conf->device_lock);
4343	for (i = 0; i < NR_HASH; i++) {
4344		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
4345			if (sh->raid_conf != conf)
4346				continue;
4347			print_sh(seq, sh);
4348		}
4349	}
4350	spin_unlock_irq(&conf->device_lock);
4351}
4352#endif
4353
4354static void status (struct seq_file *seq, mddev_t *mddev)
4355{
4356	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4357	int i;
4358
4359	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
4360	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
4361	for (i = 0; i < conf->raid_disks; i++)
4362		seq_printf (seq, "%s",
4363			       conf->disks[i].rdev &&
4364			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
4365	seq_printf (seq, "]");
4366#ifdef DEBUG
4367	seq_printf (seq, "\n");
4368	printall(seq, conf);
4369#endif
4370}
4371
4372static void print_raid5_conf (raid5_conf_t *conf)
4373{
4374	int i;
4375	struct disk_info *tmp;
4376
4377	printk("RAID5 conf printout:\n");
4378	if (!conf) {
4379		printk("(conf==NULL)\n");
4380		return;
4381	}
4382	printk(" --- rd:%d wd:%d\n", conf->raid_disks,
4383		 conf->raid_disks - conf->mddev->degraded);
4384
4385	for (i = 0; i < conf->raid_disks; i++) {
4386		char b[BDEVNAME_SIZE];
4387		tmp = conf->disks + i;
4388		if (tmp->rdev)
4389		printk(" disk %d, o:%d, dev:%s\n",
4390			i, !test_bit(Faulty, &tmp->rdev->flags),
4391			bdevname(tmp->rdev->bdev,b));
4392	}
4393}
4394
4395static int raid5_spare_active(mddev_t *mddev)
4396{
4397	int i;
4398	raid5_conf_t *conf = mddev->private;
4399	struct disk_info *tmp;
4400
4401	for (i = 0; i < conf->raid_disks; i++) {
4402		tmp = conf->disks + i;
4403		if (tmp->rdev
4404		    && !test_bit(Faulty, &tmp->rdev->flags)
4405		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
4406			unsigned long flags;
4407			spin_lock_irqsave(&conf->device_lock, flags);
4408			mddev->degraded--;
4409			spin_unlock_irqrestore(&conf->device_lock, flags);
4410		}
4411	}
4412	print_raid5_conf(conf);
4413	return 0;
4414}
4415
4416static int raid5_remove_disk(mddev_t *mddev, int number)
4417{
4418	raid5_conf_t *conf = mddev->private;
4419	int err = 0;
4420	mdk_rdev_t *rdev;
4421	struct disk_info *p = conf->disks + number;
4422
4423	print_raid5_conf(conf);
4424	rdev = p->rdev;
4425	if (rdev) {
4426		if (test_bit(In_sync, &rdev->flags) ||
4427		    atomic_read(&rdev->nr_pending)) {
4428			err = -EBUSY;
4429			goto abort;
4430		}
4431		p->rdev = NULL;
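		/* Wait for any reader that picked up the old rdev pointer
		 * under rcu_read_lock() before re-checking nr_pending.
		 */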
4432		synchronize_rcu();
4433		if (atomic_read(&rdev->nr_pending)) {
4434			/* lost the race, try later */
4435			err = -EBUSY;
4436			p->rdev = rdev;
4437		}
4438	}
4439abort:
4440
4441	print_raid5_conf(conf);
4442	return err;
4443}
4444
4445static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4446{
4447	raid5_conf_t *conf = mddev->private;
4448	int found = 0;
4449	int disk;
4450	struct disk_info *p;
4451
4452	if (mddev->degraded > conf->max_degraded)
4453		/* no point adding a device */
4454		return 0;
4455
4456	/*
4457	 * find the disk ... but prefer rdev->saved_raid_disk
4458	 * if possible.
4459	 */
4460	if (rdev->saved_raid_disk >= 0 &&
4461	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
4462		disk = rdev->saved_raid_disk;
4463	else
4464		disk = 0;
4465	for ( ; disk < conf->raid_disks; disk++)
4466		if ((p=conf->disks + disk)->rdev == NULL) {
4467			clear_bit(In_sync, &rdev->flags);
4468			rdev->raid_disk = disk;
4469			found = 1;
4470			if (rdev->saved_raid_disk != disk)
4471				conf->fullsync = 1;
4472			rcu_assign_pointer(p->rdev, rdev);
4473			break;
4474		}
4475	print_raid5_conf(conf);
4476	return found;
4477}
4478
4479static int raid5_resize(mddev_t *mddev, sector_t sectors)
4480{
4481	/* no resync is happening, and there is enough space
4482	 * on all devices, so we can resize.
4483	 * We need to make sure resync covers any new space.
4484	 * If the array is shrinking we should possibly wait until
4485	 * any io in the removed space completes, but it hardly seems
4486	 * worth it.
4487	 */
4488	raid5_conf_t *conf = mddev_to_conf(mddev);
4489
4490	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
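	/* Round the per-device size down to a whole number of chunks;
	 * array_size and size are kept in 1K blocks, hence the >>1 and /2.
	 */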
4491	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
4492	set_capacity(mddev->gendisk, mddev->array_size << 1);
4493	mddev->changed = 1;
4494	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
4495		mddev->recovery_cp = mddev->size << 1;
4496		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4497	}
4498	mddev->size = sectors /2;
4499	mddev->resync_max_sectors = sectors;
4500	return 0;
4501}
4502
4503#ifdef CONFIG_MD_RAID5_RESHAPE
4504static int raid5_check_reshape(mddev_t *mddev)
4505{
4506	raid5_conf_t *conf = mddev_to_conf(mddev);
4507	int err;
4508
4509	if (mddev->delta_disks < 0 ||
4510	    mddev->new_level != mddev->level)
4511		return -EINVAL; /* Cannot shrink array or change level yet */
4512	if (mddev->delta_disks == 0)
4513		return 0; /* nothing to do */
4514
4515	/* Can only proceed if there are plenty of stripe_heads.
4516	 * We need a minimum of one full stripe,, and for sensible progress
4517	 * We need a minimum of one full stripe, and for sensible progress
4518	 * If we require 4 times, then the default 256 4K stripe_heads will
4519	 * allow for chunk sizes up to 256K, which is probably OK.
4520	 * If the chunk size is greater, user-space should request more
4521	 * stripe_heads first.
4522	 */
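	/* For example, a 512K chunk needs (512K / 4K) * 4 = 512 stripe_heads,
	 * so with the default 256 the reshape is refused until
	 * stripe_cache_size has been raised.
	 */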
4523	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
4524	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
4525		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
4526		       (mddev->chunk_size / STRIPE_SIZE)*4);
4527		return -ENOSPC;
4528	}
4529
4530	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
4531	if (err)
4532		return err;
4533
4534	if (mddev->degraded > conf->max_degraded)
4535		return -EINVAL;
4536	/* looks like we might be able to manage this */
4537	return 0;
4538}
4539
4540static int raid5_start_reshape(mddev_t *mddev)
4541{
4542	raid5_conf_t *conf = mddev_to_conf(mddev);
4543	mdk_rdev_t *rdev;
4544	struct list_head *rtmp;
4545	int spares = 0;
4546	int added_devices = 0;
4547	unsigned long flags;
4548
4549	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4550		return -EBUSY;
4551
4552	rdev_for_each(rdev, rtmp, mddev)
4553		if (rdev->raid_disk < 0 &&
4554		    !test_bit(Faulty, &rdev->flags))
4555			spares++;
4556
4557	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
4558		/* Not enough devices even to make a degraded array
4559		 * of that size
4560		 */
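		/* (e.g. growing by two disks with max_degraded == 1 and no
		 * existing degradation needs at least one usable spare)
		 */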
4561		return -EINVAL;
4562
4563	atomic_set(&conf->reshape_stripes, 0);
4564	spin_lock_irq(&conf->device_lock);
4565	conf->previous_raid_disks = conf->raid_disks;
4566	conf->raid_disks += mddev->delta_disks;
4567	conf->expand_progress = 0;
4568	conf->expand_lo = 0;
4569	spin_unlock_irq(&conf->device_lock);
4570
4571	/* Add some new drives, as many as will fit.
4572	 * We know there are enough to make the newly sized array work.
4573	 */
4574	rdev_for_each(rdev, rtmp, mddev)
4575		if (rdev->raid_disk < 0 &&
4576		    !test_bit(Faulty, &rdev->flags)) {
4577			if (raid5_add_disk(mddev, rdev)) {
4578				char nm[20];
4579				set_bit(In_sync, &rdev->flags);
4580				added_devices++;
4581				rdev->recovery_offset = 0;
4582				sprintf(nm, "rd%d", rdev->raid_disk);
4583				if (sysfs_create_link(&mddev->kobj,
4584						      &rdev->kobj, nm))
4585					printk(KERN_WARNING
4586					       "raid5: failed to create "
					       "link %s for %s\n",
4588					       nm, mdname(mddev));
4589			} else
4590				break;
4591		}
4592
4593	spin_lock_irqsave(&conf->device_lock, flags);
4594	mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
4595	spin_unlock_irqrestore(&conf->device_lock, flags);
4596	mddev->raid_disks = conf->raid_disks;
4597	mddev->reshape_position = 0;
4598	set_bit(MD_CHANGE_DEVS, &mddev->flags);
4599
4600	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4601	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4602	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4603	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4604	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4605						"%s_reshape");
4606	if (!mddev->sync_thread) {
4607		mddev->recovery = 0;
4608		spin_lock_irq(&conf->device_lock);
4609		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4610		conf->expand_progress = MaxSector;
4611		spin_unlock_irq(&conf->device_lock);
4612		return -EAGAIN;
4613	}
4614	md_wakeup_thread(mddev->sync_thread);
4615	md_new_event(mddev);
4616	return 0;
4617}
4618#endif
4619
4620static void end_reshape(raid5_conf_t *conf)
4621{
4622	struct block_device *bdev;
4623
4624	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4625		conf->mddev->array_size = conf->mddev->size *
4626			(conf->raid_disks - conf->max_degraded);
4627		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
4628		conf->mddev->changed = 1;
4629
4630		bdev = bdget_disk(conf->mddev->gendisk, 0);
4631		if (bdev) {
4632			mutex_lock(&bdev->bd_inode->i_mutex);
4633			i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
4634			mutex_unlock(&bdev->bd_inode->i_mutex);
4635			bdput(bdev);
4636		}
4637		spin_lock_irq(&conf->device_lock);
4638		conf->expand_progress = MaxSector;
4639		spin_unlock_irq(&conf->device_lock);
4640		conf->mddev->reshape_position = MaxSector;
4641
4642		/* read-ahead size must cover two whole stripes, which is
4643		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4644		 * 2 * (datadisks) * chunksize, where datadisks = raid_disks - max_degraded
4645		{
4646			int data_disks = conf->previous_raid_disks - conf->max_degraded;
4647			int stripe = data_disks *
4648				(conf->mddev->chunk_size / PAGE_SIZE);
4649			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4650				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4651		}
4652	}
4653}
4654
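/*
 * Quiesce handling: state 1 stops all writes (quiesce is set and we
 * wait until no stripes or aligned reads are active), state 0 resumes
 * them, and state 2 just wakes anyone sleeping on wait_for_overlap.
 */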
4655static void raid5_quiesce(mddev_t *mddev, int state)
4656{
4657	raid5_conf_t *conf = mddev_to_conf(mddev);
4658
4659	switch(state) {
4660	case 2: /* resume for a suspend */
4661		wake_up(&conf->wait_for_overlap);
4662		break;
4663
4664	case 1: /* stop all writes */
4665		spin_lock_irq(&conf->device_lock);
4666		conf->quiesce = 1;
4667		wait_event_lock_irq(conf->wait_for_stripe,
4668				    atomic_read(&conf->active_stripes) == 0 &&
4669				    atomic_read(&conf->active_aligned_reads) == 0,
4670				    conf->device_lock, /* nothing */);
4671		spin_unlock_irq(&conf->device_lock);
4672		break;
4673
4674	case 0: /* re-enable writes */
4675		spin_lock_irq(&conf->device_lock);
4676		conf->quiesce = 0;
4677		wake_up(&conf->wait_for_stripe);
4678		wake_up(&conf->wait_for_overlap);
4679		spin_unlock_irq(&conf->device_lock);
4680		break;
4681	}
4682}
4683
4684static struct mdk_personality raid6_personality =
4685{
4686	.name		= "raid6",
4687	.level		= 6,
4688	.owner		= THIS_MODULE,
4689	.make_request	= make_request,
4690	.run		= run,
4691	.stop		= stop,
4692	.status		= status,
4693	.error_handler	= error,
4694	.hot_add_disk	= raid5_add_disk,
4695	.hot_remove_disk= raid5_remove_disk,
4696	.spare_active	= raid5_spare_active,
4697	.sync_request	= sync_request,
4698	.resize		= raid5_resize,
4699#ifdef CONFIG_MD_RAID5_RESHAPE
4700	.check_reshape	= raid5_check_reshape,
4701	.start_reshape  = raid5_start_reshape,
4702#endif
4703	.quiesce	= raid5_quiesce,
4704};
4705static struct mdk_personality raid5_personality =
4706{
4707	.name		= "raid5",
4708	.level		= 5,
4709	.owner		= THIS_MODULE,
4710	.make_request	= make_request,
4711	.run		= run,
4712	.stop		= stop,
4713	.status		= status,
4714	.error_handler	= error,
4715	.hot_add_disk	= raid5_add_disk,
4716	.hot_remove_disk= raid5_remove_disk,
4717	.spare_active	= raid5_spare_active,
4718	.sync_request	= sync_request,
4719	.resize		= raid5_resize,
4720#ifdef CONFIG_MD_RAID5_RESHAPE
4721	.check_reshape	= raid5_check_reshape,
4722	.start_reshape  = raid5_start_reshape,
4723#endif
4724	.quiesce	= raid5_quiesce,
4725};
4726
4727static struct mdk_personality raid4_personality =
4728{
4729	.name		= "raid4",
4730	.level		= 4,
4731	.owner		= THIS_MODULE,
4732	.make_request	= make_request,
4733	.run		= run,
4734	.stop		= stop,
4735	.status		= status,
4736	.error_handler	= error,
4737	.hot_add_disk	= raid5_add_disk,
4738	.hot_remove_disk= raid5_remove_disk,
4739	.spare_active	= raid5_spare_active,
4740	.sync_request	= sync_request,
4741	.resize		= raid5_resize,
4742#ifdef CONFIG_MD_RAID5_RESHAPE
4743	.check_reshape	= raid5_check_reshape,
4744	.start_reshape  = raid5_start_reshape,
4745#endif
4746	.quiesce	= raid5_quiesce,
4747};
4748
4749static int __init raid5_init(void)
4750{
4751	int e;
4752
4753	e = raid6_select_algo();
4754	if ( e )
4755		return e;
4756	register_md_personality(&raid6_personality);
4757	register_md_personality(&raid5_personality);
4758	register_md_personality(&raid4_personality);
4759	return 0;
4760}
4761
4762static void raid5_exit(void)
4763{
4764	unregister_md_personality(&raid6_personality);
4765	unregister_md_personality(&raid5_personality);
4766	unregister_md_personality(&raid4_personality);
4767}
4768
4769module_init(raid5_init);
4770module_exit(raid5_exit);
4771MODULE_LICENSE("GPL");
4772MODULE_ALIAS("md-personality-4"); /* RAID5 */
4773MODULE_ALIAS("md-raid5");
4774MODULE_ALIAS("md-raid4");
4775MODULE_ALIAS("md-level-5");
4776MODULE_ALIAS("md-level-4");
4777MODULE_ALIAS("md-personality-8"); /* RAID6 */
4778MODULE_ALIAS("md-raid6");
4779MODULE_ALIAS("md-level-6");
4780
4781/* This used to be two separate modules, they were: */
4782MODULE_ALIAS("raid5");
4783MODULE_ALIAS("raid6");
4784