raid5.c revision 671488cc25f7c194c7c7a9f258bab1df17a6ff69
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 *	   Copyright (C) 1999, 2000 Ingo Molnar
5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches.  Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
32 *    new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 *   we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment seq_flush, thus closing the current
39 *   batch.
40 * When we notice that seq_flush > seq_write, we write out all pending updates
41 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
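/*
 * Illustrative sketch (not part of the driver): the batch test described
 * above, reduced to plain integers.  A stripe whose recorded batch number
 * (sh->bm_seq, set to seq_flush + 1 in add_stripe_bio) is still ahead of
 * the last batch written to the bitmap (conf->seq_write) has to wait on
 * bitmap_list; anything at or behind seq_write may be handled immediately.
 */
#if 0
static int stripe_must_wait_for_bitmap(int bm_seq, int seq_write)
{
	/* same test as __release_stripe() uses below */
	return bm_seq - seq_write > 0;
}
#endif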
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h>
52#include <linux/seq_file.h>
53#include <linux/cpu.h>
54#include <linux/slab.h>
55#include <linux/ratelimit.h>
56#include "md.h"
57#include "raid5.h"
58#include "raid0.h"
59#include "bitmap.h"
60
61/*
62 * Stripe cache
63 */
64
65#define NR_STRIPES		256
66#define STRIPE_SIZE		PAGE_SIZE
67#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
68#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
69#define	IO_THRESHOLD		1
70#define BYPASS_THRESHOLD	1
71#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
72#define HASH_MASK		(NR_HASH - 1)
73
74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
75{
76	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
77	return &conf->stripe_hashtbl[hash];
78}
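/*
 * Worked example (illustrative, assuming 4KiB pages on a 64-bit kernel):
 * STRIPE_SIZE = 4096, STRIPE_SHIFT = 12 - 9 = 3, STRIPE_SECTORS = 8,
 * NR_HASH = 4096 / sizeof(struct hlist_head) = 512, HASH_MASK = 511.
 * A stripe at sector 0x2468 (9320) hashes to (9320 >> 3) & 511 =
 * 1165 & 511 = 141, i.e. bucket 141 of conf->stripe_hashtbl.
 */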
79
80/* bio's attached to a stripe+device for I/O are linked together in bi_sector
81 * order without overlap.  There may be several bio's per stripe+device, and
82 * a bio could span several devices.
83 * When walking this list for a particular stripe+device, we must never proceed
84 * beyond a bio that extends past this device, as the next bio might no longer
85 * be valid.
86 * This function is used to determine the 'next' bio in the list, given the sector
87 * of the current stripe+device
88 */
89static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
90{
91	int sectors = bio->bi_size >> 9;
92	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
93		return bio->bi_next;
94	else
95		return NULL;
96}
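/*
 * Worked example (illustrative, STRIPE_SECTORS = 8): for a stripe+device
 * chunk starting at sector 16, a bio at bi_sector = 16 with bi_size = 2048
 * (4 sectors) ends at sector 20, inside the chunk, so r5_next_bio() returns
 * bio->bi_next; a bio of 4096 bytes (8 sectors) reaches the chunk boundary
 * at sector 24, so NULL is returned and the walk stops there.
 */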
97
98/*
99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */
102static inline int raid5_bi_phys_segments(struct bio *bio)
103{
104	return bio->bi_phys_segments & 0xffff;
105}
106
107static inline int raid5_bi_hw_segments(struct bio *bio)
108{
109	return (bio->bi_phys_segments >> 16) & 0xffff;
110}
111
112static inline int raid5_dec_bi_phys_segments(struct bio *bio)
113{
114	--bio->bi_phys_segments;
115	return raid5_bi_phys_segments(bio);
116}
117
118static inline int raid5_dec_bi_hw_segments(struct bio *bio)
119{
120	unsigned short val = raid5_bi_hw_segments(bio);
121
122	--val;
123	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
124	return val;
125}
126
127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
128{
129	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
130}
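/*
 * Illustrative sketch (not part of the driver): the 16/16-bit packing used
 * by the helpers above, shown on a plain unsigned int standing in for
 * bi_phys_segments.
 */
#if 0
static void bi_segment_counts_demo(void)
{
	unsigned int seg = 5;			/* 5 active, 0 processed */

	/* raid5_set_bi_hw_segments(bio, 3): keep low 16 bits, set high 16 */
	seg = (seg & 0xffff) | (3 << 16);	/* seg == 0x00030005 */

	/* raid5_dec_bi_phys_segments(bio): drop one active reference */
	--seg;					/* seg == 0x00030004 */

	/* raid5_bi_hw_segments(bio) still reports 3 */
}
#endif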
131
132/* Find first data disk in a raid6 stripe */
133static inline int raid6_d0(struct stripe_head *sh)
134{
135	if (sh->ddf_layout)
136		/* ddf always starts from first device */
137		return 0;
138	/* md starts just after Q block */
139	if (sh->qd_idx == sh->disks - 1)
140		return 0;
141	else
142		return sh->qd_idx + 1;
143}
144static inline int raid6_next_disk(int disk, int raid_disks)
145{
146	disk++;
147	return (disk < raid_disks) ? disk : 0;
148}
149
150/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
151 * we need to map each disk to a 'slot', where the data disks are slot
152 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
153 * is raid_disks-1.  This helper does that mapping.
154 */
155static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
156			     int *count, int syndrome_disks)
157{
158	int slot = *count;
159
160	if (sh->ddf_layout)
161		(*count)++;
162	if (idx == sh->pd_idx)
163		return syndrome_disks;
164	if (idx == sh->qd_idx)
165		return syndrome_disks + 1;
166	if (!sh->ddf_layout)
167		(*count)++;
168	return slot;
169}
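/*
 * Worked example (illustrative): a 5-disk raid6 in the md layout with
 * pd_idx = 3 and qd_idx = 4.  raid6_d0() returns 0, syndrome_disks is
 * disks - 2 = 3, and walking i = 0,1,2,3,4 with count starting at 0 maps
 * the data disks to slots 0,1,2, P (idx 3) to slot 3 (== syndrome_disks)
 * and Q (idx 4) to slot 4 (== syndrome_disks + 1).
 */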
170
171static void return_io(struct bio *return_bi)
172{
173	struct bio *bi = return_bi;
174	while (bi) {
175
176		return_bi = bi->bi_next;
177		bi->bi_next = NULL;
178		bi->bi_size = 0;
179		bio_endio(bi, 0);
180		bi = return_bi;
181	}
182}
183
184static void print_raid5_conf (struct r5conf *conf);
185
186static int stripe_operations_active(struct stripe_head *sh)
187{
188	return sh->check_state || sh->reconstruct_state ||
189	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
190	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191}
192
193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
194{
195	if (atomic_dec_and_test(&sh->count)) {
196		BUG_ON(!list_empty(&sh->lru));
197		BUG_ON(atomic_read(&conf->active_stripes)==0);
198		if (test_bit(STRIPE_HANDLE, &sh->state)) {
199			if (test_bit(STRIPE_DELAYED, &sh->state))
200				list_add_tail(&sh->lru, &conf->delayed_list);
201			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
202				   sh->bm_seq - conf->seq_write > 0)
203				list_add_tail(&sh->lru, &conf->bitmap_list);
204			else {
205				clear_bit(STRIPE_BIT_DELAY, &sh->state);
206				list_add_tail(&sh->lru, &conf->handle_list);
207			}
208			md_wakeup_thread(conf->mddev->thread);
209		} else {
210			BUG_ON(stripe_operations_active(sh));
211			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
212				atomic_dec(&conf->preread_active_stripes);
213				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
214					md_wakeup_thread(conf->mddev->thread);
215			}
216			atomic_dec(&conf->active_stripes);
217			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218				list_add_tail(&sh->lru, &conf->inactive_list);
219				wake_up(&conf->wait_for_stripe);
220				if (conf->retry_read_aligned)
221					md_wakeup_thread(conf->mddev->thread);
222			}
223		}
224	}
225}
226
227static void release_stripe(struct stripe_head *sh)
228{
229	struct r5conf *conf = sh->raid_conf;
230	unsigned long flags;
231
232	spin_lock_irqsave(&conf->device_lock, flags);
233	__release_stripe(conf, sh);
234	spin_unlock_irqrestore(&conf->device_lock, flags);
235}
236
237static inline void remove_hash(struct stripe_head *sh)
238{
239	pr_debug("remove_hash(), stripe %llu\n",
240		(unsigned long long)sh->sector);
241
242	hlist_del_init(&sh->hash);
243}
244
245static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
246{
247	struct hlist_head *hp = stripe_hash(conf, sh->sector);
248
249	pr_debug("insert_hash(), stripe %llu\n",
250		(unsigned long long)sh->sector);
251
252	hlist_add_head(&sh->hash, hp);
253}
254
255
256/* find an idle stripe, make sure it is unhashed, and return it. */
257static struct stripe_head *get_free_stripe(struct r5conf *conf)
258{
259	struct stripe_head *sh = NULL;
260	struct list_head *first;
261
262	if (list_empty(&conf->inactive_list))
263		goto out;
264	first = conf->inactive_list.next;
265	sh = list_entry(first, struct stripe_head, lru);
266	list_del_init(first);
267	remove_hash(sh);
268	atomic_inc(&conf->active_stripes);
269out:
270	return sh;
271}
272
273static void shrink_buffers(struct stripe_head *sh)
274{
275	struct page *p;
276	int i;
277	int num = sh->raid_conf->pool_size;
278
279	for (i = 0; i < num ; i++) {
280		p = sh->dev[i].page;
281		if (!p)
282			continue;
283		sh->dev[i].page = NULL;
284		put_page(p);
285	}
286}
287
288static int grow_buffers(struct stripe_head *sh)
289{
290	int i;
291	int num = sh->raid_conf->pool_size;
292
293	for (i = 0; i < num; i++) {
294		struct page *page;
295
296		if (!(page = alloc_page(GFP_KERNEL))) {
297			return 1;
298		}
299		sh->dev[i].page = page;
300	}
301	return 0;
302}
303
304static void raid5_build_block(struct stripe_head *sh, int i, int previous);
305static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
306			    struct stripe_head *sh);
307
308static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
309{
310	struct r5conf *conf = sh->raid_conf;
311	int i;
312
313	BUG_ON(atomic_read(&sh->count) != 0);
314	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
315	BUG_ON(stripe_operations_active(sh));
316
317	pr_debug("init_stripe called, stripe %llu\n",
318		(unsigned long long)sh->sector);
319
320	remove_hash(sh);
321
322	sh->generation = conf->generation - previous;
323	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
324	sh->sector = sector;
325	stripe_set_idx(sector, conf, previous, sh);
326	sh->state = 0;
327
328
329	for (i = sh->disks; i--; ) {
330		struct r5dev *dev = &sh->dev[i];
331
332		if (dev->toread || dev->read || dev->towrite || dev->written ||
333		    test_bit(R5_LOCKED, &dev->flags)) {
334			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
335			       (unsigned long long)sh->sector, i, dev->toread,
336			       dev->read, dev->towrite, dev->written,
337			       test_bit(R5_LOCKED, &dev->flags));
338			WARN_ON(1);
339		}
340		dev->flags = 0;
341		raid5_build_block(sh, i, previous);
342	}
343	insert_hash(conf, sh);
344}
345
346static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
347					 short generation)
348{
349	struct stripe_head *sh;
350	struct hlist_node *hn;
351
352	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
353	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
354		if (sh->sector == sector && sh->generation == generation)
355			return sh;
356	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
357	return NULL;
358}
359
360/*
361 * Need to check if array has failed when deciding whether to:
362 *  - start an array
363 *  - remove non-faulty devices
364 *  - add a spare
365 *  - allow a reshape
366 * This determination is simple when no reshape is happening.
367 * However if there is a reshape, we need to carefully check
368 * both the before and after sections.
369 * This is because some failed devices may only affect one
370 * of the two sections, and some non-in_sync devices may
371 * be in_sync in the section most affected by failed devices.
372 */
373static int calc_degraded(struct r5conf *conf)
374{
375	int degraded, degraded2;
376	int i;
377
378	rcu_read_lock();
379	degraded = 0;
380	for (i = 0; i < conf->previous_raid_disks; i++) {
381		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
382		if (!rdev || test_bit(Faulty, &rdev->flags))
383			degraded++;
384		else if (test_bit(In_sync, &rdev->flags))
385			;
386		else
387			/* not in-sync or faulty.
388			 * If the reshape increases the number of devices,
389			 * this is being recovered by the reshape, so
390			 * this 'previous' section is not in_sync.
391			 * If the number of devices is being reduced however,
392			 * the device can only be part of the array if
393			 * we are reverting a reshape, so this section will
394			 * be in-sync.
395			 */
396			if (conf->raid_disks >= conf->previous_raid_disks)
397				degraded++;
398	}
399	rcu_read_unlock();
400	if (conf->raid_disks == conf->previous_raid_disks)
401		return degraded;
402	rcu_read_lock();
403	degraded2 = 0;
404	for (i = 0; i < conf->raid_disks; i++) {
405		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
406		if (!rdev || test_bit(Faulty, &rdev->flags))
407			degraded2++;
408		else if (test_bit(In_sync, &rdev->flags))
409			;
410		else
411			/* not in-sync or faulty.
412			 * If reshape increases the number of devices, this
413			 * section has already been recovered, else it
414			 * almost certainly hasn't.
415			 */
416			if (conf->raid_disks <= conf->previous_raid_disks)
417				degraded2++;
418	}
419	rcu_read_unlock();
420	if (degraded2 > degraded)
421		return degraded2;
422	return degraded;
423}
424
425static int has_failed(struct r5conf *conf)
426{
427	int degraded;
428
429	if (conf->mddev->reshape_position == MaxSector)
430		return conf->mddev->degraded > conf->max_degraded;
431
432	degraded = calc_degraded(conf);
433	if (degraded > conf->max_degraded)
434		return 1;
435	return 0;
436}
437
438static struct stripe_head *
439get_active_stripe(struct r5conf *conf, sector_t sector,
440		  int previous, int noblock, int noquiesce)
441{
442	struct stripe_head *sh;
443
444	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
445
446	spin_lock_irq(&conf->device_lock);
447
448	do {
449		wait_event_lock_irq(conf->wait_for_stripe,
450				    conf->quiesce == 0 || noquiesce,
451				    conf->device_lock, /* nothing */);
452		sh = __find_stripe(conf, sector, conf->generation - previous);
453		if (!sh) {
454			if (!conf->inactive_blocked)
455				sh = get_free_stripe(conf);
456			if (noblock && sh == NULL)
457				break;
458			if (!sh) {
459				conf->inactive_blocked = 1;
460				wait_event_lock_irq(conf->wait_for_stripe,
461						    !list_empty(&conf->inactive_list) &&
462						    (atomic_read(&conf->active_stripes)
463						     < (conf->max_nr_stripes *3/4)
464						     || !conf->inactive_blocked),
465						    conf->device_lock,
466						    );
467				conf->inactive_blocked = 0;
468			} else
469				init_stripe(sh, sector, previous);
470		} else {
471			if (atomic_read(&sh->count)) {
472				BUG_ON(!list_empty(&sh->lru)
473				    && !test_bit(STRIPE_EXPANDING, &sh->state));
474			} else {
475				if (!test_bit(STRIPE_HANDLE, &sh->state))
476					atomic_inc(&conf->active_stripes);
477				if (list_empty(&sh->lru) &&
478				    !test_bit(STRIPE_EXPANDING, &sh->state))
479					BUG();
480				list_del_init(&sh->lru);
481			}
482		}
483	} while (sh == NULL);
484
485	if (sh)
486		atomic_inc(&sh->count);
487
488	spin_unlock_irq(&conf->device_lock);
489	return sh;
490}
491
492static void
493raid5_end_read_request(struct bio *bi, int error);
494static void
495raid5_end_write_request(struct bio *bi, int error);
496
497static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
498{
499	struct r5conf *conf = sh->raid_conf;
500	int i, disks = sh->disks;
501
502	might_sleep();
503
504	for (i = disks; i--; ) {
505		int rw;
506		struct bio *bi;
507		struct md_rdev *rdev;
508		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
509			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
510				rw = WRITE_FUA;
511			else
512				rw = WRITE;
513		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
514			rw = READ;
515		else
516			continue;
517
518		bi = &sh->dev[i].req;
519
520		bi->bi_rw = rw;
521		if (rw & WRITE)
522			bi->bi_end_io = raid5_end_write_request;
523		else
524			bi->bi_end_io = raid5_end_read_request;
525
526		rcu_read_lock();
527		rdev = rcu_dereference(conf->disks[i].rdev);
528		if (rdev && test_bit(Faulty, &rdev->flags))
529			rdev = NULL;
530		if (rdev)
531			atomic_inc(&rdev->nr_pending);
532		rcu_read_unlock();
533
534		/* We have already checked bad blocks for reads.  Now we
535		 * need to check for writes.
536		 */
537		while ((rw & WRITE) && rdev &&
538		       test_bit(WriteErrorSeen, &rdev->flags)) {
539			sector_t first_bad;
540			int bad_sectors;
541			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
542					      &first_bad, &bad_sectors);
543			if (!bad)
544				break;
545
546			if (bad < 0) {
547				set_bit(BlockedBadBlocks, &rdev->flags);
548				if (!conf->mddev->external &&
549				    conf->mddev->flags) {
550					/* It is very unlikely, but we might
551					 * still need to write out the
552					 * bad block log - better give it
553					 * a chance */
554					md_check_recovery(conf->mddev);
555				}
556				md_wait_for_blocked_rdev(rdev, conf->mddev);
557			} else {
558				/* Acknowledged bad block - skip the write */
559				rdev_dec_pending(rdev, conf->mddev);
560				rdev = NULL;
561			}
562		}
563
564		if (rdev) {
565			if (s->syncing || s->expanding || s->expanded)
566				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
567
568			set_bit(STRIPE_IO_STARTED, &sh->state);
569
570			bi->bi_bdev = rdev->bdev;
571			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
572				__func__, (unsigned long long)sh->sector,
573				bi->bi_rw, i);
574			atomic_inc(&sh->count);
575			bi->bi_sector = sh->sector + rdev->data_offset;
576			bi->bi_flags = 1 << BIO_UPTODATE;
577			bi->bi_vcnt = 1;
578			bi->bi_max_vecs = 1;
579			bi->bi_idx = 0;
580			bi->bi_io_vec = &sh->dev[i].vec;
581			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
582			bi->bi_io_vec[0].bv_offset = 0;
583			bi->bi_size = STRIPE_SIZE;
584			bi->bi_next = NULL;
585			generic_make_request(bi);
586		} else {
587			if (rw & WRITE)
588				set_bit(STRIPE_DEGRADED, &sh->state);
589			pr_debug("skip op %ld on disc %d for sector %llu\n",
590				bi->bi_rw, i, (unsigned long long)sh->sector);
591			clear_bit(R5_LOCKED, &sh->dev[i].flags);
592			set_bit(STRIPE_HANDLE, &sh->state);
593		}
594	}
595}
596
597static struct dma_async_tx_descriptor *
598async_copy_data(int frombio, struct bio *bio, struct page *page,
599	sector_t sector, struct dma_async_tx_descriptor *tx)
600{
601	struct bio_vec *bvl;
602	struct page *bio_page;
603	int i;
604	int page_offset;
605	struct async_submit_ctl submit;
606	enum async_tx_flags flags = 0;
607
608	if (bio->bi_sector >= sector)
609		page_offset = (signed)(bio->bi_sector - sector) * 512;
610	else
611		page_offset = (signed)(sector - bio->bi_sector) * -512;
612
613	if (frombio)
614		flags |= ASYNC_TX_FENCE;
615	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
616
617	bio_for_each_segment(bvl, bio, i) {
618		int len = bvl->bv_len;
619		int clen;
620		int b_offset = 0;
621
622		if (page_offset < 0) {
623			b_offset = -page_offset;
624			page_offset += b_offset;
625			len -= b_offset;
626		}
627
628		if (len > 0 && page_offset + len > STRIPE_SIZE)
629			clen = STRIPE_SIZE - page_offset;
630		else
631			clen = len;
632
633		if (clen > 0) {
634			b_offset += bvl->bv_offset;
635			bio_page = bvl->bv_page;
636			if (frombio)
637				tx = async_memcpy(page, bio_page, page_offset,
638						  b_offset, clen, &submit);
639			else
640				tx = async_memcpy(bio_page, page, b_offset,
641						  page_offset, clen, &submit);
642		}
643		/* chain the operations */
644		submit.depend_tx = tx;
645
646		if (clen < len) /* hit end of page */
647			break;
648		page_offset +=  len;
649	}
650
651	return tx;
652}
653
654static void ops_complete_biofill(void *stripe_head_ref)
655{
656	struct stripe_head *sh = stripe_head_ref;
657	struct bio *return_bi = NULL;
658	struct r5conf *conf = sh->raid_conf;
659	int i;
660
661	pr_debug("%s: stripe %llu\n", __func__,
662		(unsigned long long)sh->sector);
663
664	/* clear completed biofills */
665	spin_lock_irq(&conf->device_lock);
666	for (i = sh->disks; i--; ) {
667		struct r5dev *dev = &sh->dev[i];
668
669		/* acknowledge completion of a biofill operation */
670		/* and check if we need to reply to a read request,
671		 * new R5_Wantfill requests are held off until
672		 * !STRIPE_BIOFILL_RUN
673		 */
674		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
675			struct bio *rbi, *rbi2;
676
677			BUG_ON(!dev->read);
678			rbi = dev->read;
679			dev->read = NULL;
680			while (rbi && rbi->bi_sector <
681				dev->sector + STRIPE_SECTORS) {
682				rbi2 = r5_next_bio(rbi, dev->sector);
683				if (!raid5_dec_bi_phys_segments(rbi)) {
684					rbi->bi_next = return_bi;
685					return_bi = rbi;
686				}
687				rbi = rbi2;
688			}
689		}
690	}
691	spin_unlock_irq(&conf->device_lock);
692	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
693
694	return_io(return_bi);
695
696	set_bit(STRIPE_HANDLE, &sh->state);
697	release_stripe(sh);
698}
699
700static void ops_run_biofill(struct stripe_head *sh)
701{
702	struct dma_async_tx_descriptor *tx = NULL;
703	struct r5conf *conf = sh->raid_conf;
704	struct async_submit_ctl submit;
705	int i;
706
707	pr_debug("%s: stripe %llu\n", __func__,
708		(unsigned long long)sh->sector);
709
710	for (i = sh->disks; i--; ) {
711		struct r5dev *dev = &sh->dev[i];
712		if (test_bit(R5_Wantfill, &dev->flags)) {
713			struct bio *rbi;
714			spin_lock_irq(&conf->device_lock);
715			dev->read = rbi = dev->toread;
716			dev->toread = NULL;
717			spin_unlock_irq(&conf->device_lock);
718			while (rbi && rbi->bi_sector <
719				dev->sector + STRIPE_SECTORS) {
720				tx = async_copy_data(0, rbi, dev->page,
721					dev->sector, tx);
722				rbi = r5_next_bio(rbi, dev->sector);
723			}
724		}
725	}
726
727	atomic_inc(&sh->count);
728	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
729	async_trigger_callback(&submit);
730}
731
732static void mark_target_uptodate(struct stripe_head *sh, int target)
733{
734	struct r5dev *tgt;
735
736	if (target < 0)
737		return;
738
739	tgt = &sh->dev[target];
740	set_bit(R5_UPTODATE, &tgt->flags);
741	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
742	clear_bit(R5_Wantcompute, &tgt->flags);
743}
744
745static void ops_complete_compute(void *stripe_head_ref)
746{
747	struct stripe_head *sh = stripe_head_ref;
748
749	pr_debug("%s: stripe %llu\n", __func__,
750		(unsigned long long)sh->sector);
751
752	/* mark the computed target(s) as uptodate */
753	mark_target_uptodate(sh, sh->ops.target);
754	mark_target_uptodate(sh, sh->ops.target2);
755
756	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
757	if (sh->check_state == check_state_compute_run)
758		sh->check_state = check_state_compute_result;
759	set_bit(STRIPE_HANDLE, &sh->state);
760	release_stripe(sh);
761}
762
763/* return a pointer to the address conversion region of the scribble buffer */
764static addr_conv_t *to_addr_conv(struct stripe_head *sh,
765				 struct raid5_percpu *percpu)
766{
767	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
768}
769
770static struct dma_async_tx_descriptor *
771ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
772{
773	int disks = sh->disks;
774	struct page **xor_srcs = percpu->scribble;
775	int target = sh->ops.target;
776	struct r5dev *tgt = &sh->dev[target];
777	struct page *xor_dest = tgt->page;
778	int count = 0;
779	struct dma_async_tx_descriptor *tx;
780	struct async_submit_ctl submit;
781	int i;
782
783	pr_debug("%s: stripe %llu block: %d\n",
784		__func__, (unsigned long long)sh->sector, target);
785	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
786
787	for (i = disks; i--; )
788		if (i != target)
789			xor_srcs[count++] = sh->dev[i].page;
790
791	atomic_inc(&sh->count);
792
793	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
794			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
795	if (unlikely(count == 1))
796		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
797	else
798		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
799
800	return tx;
801}
802
803/* set_syndrome_sources - populate source buffers for gen_syndrome
804 * @srcs - (struct page *) array of size sh->disks
805 * @sh - stripe_head to parse
806 *
807 * Populates srcs in proper layout order for the stripe and returns the
808 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
809 * destination buffer is recorded in srcs[count] and the Q destination
810 * is recorded in srcs[count+1].
811 */
812static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
813{
814	int disks = sh->disks;
815	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
816	int d0_idx = raid6_d0(sh);
817	int count;
818	int i;
819
820	for (i = 0; i < disks; i++)
821		srcs[i] = NULL;
822
823	count = 0;
824	i = d0_idx;
825	do {
826		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
827
828		srcs[slot] = sh->dev[i].page;
829		i = raid6_next_disk(i, disks);
830	} while (i != d0_idx);
831
832	return syndrome_disks;
833}
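/*
 * Worked example (illustrative): for the 5-disk md-layout raid6 above
 * (pd_idx = 3, qd_idx = 4, no ddf), syndrome_disks = 3 and the loop fills
 * srcs[0..2] with the data pages, srcs[3] with the P page and srcs[4] with
 * the Q page, then returns 3; callers pass count + 2 == 5 blocks to
 * async_gen_syndrome().
 */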
834
835static struct dma_async_tx_descriptor *
836ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
837{
838	int disks = sh->disks;
839	struct page **blocks = percpu->scribble;
840	int target;
841	int qd_idx = sh->qd_idx;
842	struct dma_async_tx_descriptor *tx;
843	struct async_submit_ctl submit;
844	struct r5dev *tgt;
845	struct page *dest;
846	int i;
847	int count;
848
849	if (sh->ops.target < 0)
850		target = sh->ops.target2;
851	else if (sh->ops.target2 < 0)
852		target = sh->ops.target;
853	else
854		/* we should only have one valid target */
855		BUG();
856	BUG_ON(target < 0);
857	pr_debug("%s: stripe %llu block: %d\n",
858		__func__, (unsigned long long)sh->sector, target);
859
860	tgt = &sh->dev[target];
861	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
862	dest = tgt->page;
863
864	atomic_inc(&sh->count);
865
866	if (target == qd_idx) {
867		count = set_syndrome_sources(blocks, sh);
868		blocks[count] = NULL; /* regenerating p is not necessary */
869		BUG_ON(blocks[count+1] != dest); /* q should already be set */
870		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
871				  ops_complete_compute, sh,
872				  to_addr_conv(sh, percpu));
873		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
874	} else {
875		/* Compute any data- or p-drive using XOR */
876		count = 0;
877		for (i = disks; i-- ; ) {
878			if (i == target || i == qd_idx)
879				continue;
880			blocks[count++] = sh->dev[i].page;
881		}
882
883		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
884				  NULL, ops_complete_compute, sh,
885				  to_addr_conv(sh, percpu));
886		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
887	}
888
889	return tx;
890}
891
892static struct dma_async_tx_descriptor *
893ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
894{
895	int i, count, disks = sh->disks;
896	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
897	int d0_idx = raid6_d0(sh);
898	int faila = -1, failb = -1;
899	int target = sh->ops.target;
900	int target2 = sh->ops.target2;
901	struct r5dev *tgt = &sh->dev[target];
902	struct r5dev *tgt2 = &sh->dev[target2];
903	struct dma_async_tx_descriptor *tx;
904	struct page **blocks = percpu->scribble;
905	struct async_submit_ctl submit;
906
907	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
908		 __func__, (unsigned long long)sh->sector, target, target2);
909	BUG_ON(target < 0 || target2 < 0);
910	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
911	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
912
913	/* we need to open-code set_syndrome_sources to handle the
914	 * slot number conversion for 'faila' and 'failb'
915	 */
916	for (i = 0; i < disks ; i++)
917		blocks[i] = NULL;
918	count = 0;
919	i = d0_idx;
920	do {
921		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
922
923		blocks[slot] = sh->dev[i].page;
924
925		if (i == target)
926			faila = slot;
927		if (i == target2)
928			failb = slot;
929		i = raid6_next_disk(i, disks);
930	} while (i != d0_idx);
931
932	BUG_ON(faila == failb);
933	if (failb < faila)
934		swap(faila, failb);
935	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
936		 __func__, (unsigned long long)sh->sector, faila, failb);
937
938	atomic_inc(&sh->count);
939
940	if (failb == syndrome_disks+1) {
941		/* Q disk is one of the missing disks */
942		if (faila == syndrome_disks) {
943			/* Missing P+Q, just recompute */
944			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
945					  ops_complete_compute, sh,
946					  to_addr_conv(sh, percpu));
947			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
948						  STRIPE_SIZE, &submit);
949		} else {
950			struct page *dest;
951			int data_target;
952			int qd_idx = sh->qd_idx;
953
954			/* Missing D+Q: recompute D from P, then recompute Q */
955			if (target == qd_idx)
956				data_target = target2;
957			else
958				data_target = target;
959
960			count = 0;
961			for (i = disks; i-- ; ) {
962				if (i == data_target || i == qd_idx)
963					continue;
964				blocks[count++] = sh->dev[i].page;
965			}
966			dest = sh->dev[data_target].page;
967			init_async_submit(&submit,
968					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
969					  NULL, NULL, NULL,
970					  to_addr_conv(sh, percpu));
971			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
972				       &submit);
973
974			count = set_syndrome_sources(blocks, sh);
975			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
976					  ops_complete_compute, sh,
977					  to_addr_conv(sh, percpu));
978			return async_gen_syndrome(blocks, 0, count+2,
979						  STRIPE_SIZE, &submit);
980		}
981	} else {
982		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
983				  ops_complete_compute, sh,
984				  to_addr_conv(sh, percpu));
985		if (failb == syndrome_disks) {
986			/* We're missing D+P. */
987			return async_raid6_datap_recov(syndrome_disks+2,
988						       STRIPE_SIZE, faila,
989						       blocks, &submit);
990		} else {
991			/* We're missing D+D. */
992			return async_raid6_2data_recov(syndrome_disks+2,
993						       STRIPE_SIZE, faila, failb,
994						       blocks, &submit);
995		}
996	}
997}
998
999
1000static void ops_complete_prexor(void *stripe_head_ref)
1001{
1002	struct stripe_head *sh = stripe_head_ref;
1003
1004	pr_debug("%s: stripe %llu\n", __func__,
1005		(unsigned long long)sh->sector);
1006}
1007
1008static struct dma_async_tx_descriptor *
1009ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1010	       struct dma_async_tx_descriptor *tx)
1011{
1012	int disks = sh->disks;
1013	struct page **xor_srcs = percpu->scribble;
1014	int count = 0, pd_idx = sh->pd_idx, i;
1015	struct async_submit_ctl submit;
1016
1017	/* existing parity data subtracted */
1018	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1019
1020	pr_debug("%s: stripe %llu\n", __func__,
1021		(unsigned long long)sh->sector);
1022
1023	for (i = disks; i--; ) {
1024		struct r5dev *dev = &sh->dev[i];
1025		/* Only process blocks that are known to be uptodate */
1026		if (test_bit(R5_Wantdrain, &dev->flags))
1027			xor_srcs[count++] = dev->page;
1028	}
1029
1030	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1031			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1032	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1033
1034	return tx;
1035}
1036
1037static struct dma_async_tx_descriptor *
1038ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1039{
1040	int disks = sh->disks;
1041	int i;
1042
1043	pr_debug("%s: stripe %llu\n", __func__,
1044		(unsigned long long)sh->sector);
1045
1046	for (i = disks; i--; ) {
1047		struct r5dev *dev = &sh->dev[i];
1048		struct bio *chosen;
1049
1050		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1051			struct bio *wbi;
1052
1053			spin_lock_irq(&sh->raid_conf->device_lock);
1054			chosen = dev->towrite;
1055			dev->towrite = NULL;
1056			BUG_ON(dev->written);
1057			wbi = dev->written = chosen;
1058			spin_unlock_irq(&sh->raid_conf->device_lock);
1059
1060			while (wbi && wbi->bi_sector <
1061				dev->sector + STRIPE_SECTORS) {
1062				if (wbi->bi_rw & REQ_FUA)
1063					set_bit(R5_WantFUA, &dev->flags);
1064				tx = async_copy_data(1, wbi, dev->page,
1065					dev->sector, tx);
1066				wbi = r5_next_bio(wbi, dev->sector);
1067			}
1068		}
1069	}
1070
1071	return tx;
1072}
1073
1074static void ops_complete_reconstruct(void *stripe_head_ref)
1075{
1076	struct stripe_head *sh = stripe_head_ref;
1077	int disks = sh->disks;
1078	int pd_idx = sh->pd_idx;
1079	int qd_idx = sh->qd_idx;
1080	int i;
1081	bool fua = false;
1082
1083	pr_debug("%s: stripe %llu\n", __func__,
1084		(unsigned long long)sh->sector);
1085
1086	for (i = disks; i--; )
1087		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1088
1089	for (i = disks; i--; ) {
1090		struct r5dev *dev = &sh->dev[i];
1091
1092		if (dev->written || i == pd_idx || i == qd_idx) {
1093			set_bit(R5_UPTODATE, &dev->flags);
1094			if (fua)
1095				set_bit(R5_WantFUA, &dev->flags);
1096		}
1097	}
1098
1099	if (sh->reconstruct_state == reconstruct_state_drain_run)
1100		sh->reconstruct_state = reconstruct_state_drain_result;
1101	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1102		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1103	else {
1104		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1105		sh->reconstruct_state = reconstruct_state_result;
1106	}
1107
1108	set_bit(STRIPE_HANDLE, &sh->state);
1109	release_stripe(sh);
1110}
1111
1112static void
1113ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1114		     struct dma_async_tx_descriptor *tx)
1115{
1116	int disks = sh->disks;
1117	struct page **xor_srcs = percpu->scribble;
1118	struct async_submit_ctl submit;
1119	int count = 0, pd_idx = sh->pd_idx, i;
1120	struct page *xor_dest;
1121	int prexor = 0;
1122	unsigned long flags;
1123
1124	pr_debug("%s: stripe %llu\n", __func__,
1125		(unsigned long long)sh->sector);
1126
1127	/* check if prexor is active, which means we only process blocks
1128	 * that are part of a read-modify-write (those with ->written set)
1129	 */
1130	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1131		prexor = 1;
1132		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1133		for (i = disks; i--; ) {
1134			struct r5dev *dev = &sh->dev[i];
1135			if (dev->written)
1136				xor_srcs[count++] = dev->page;
1137		}
1138	} else {
1139		xor_dest = sh->dev[pd_idx].page;
1140		for (i = disks; i--; ) {
1141			struct r5dev *dev = &sh->dev[i];
1142			if (i != pd_idx)
1143				xor_srcs[count++] = dev->page;
1144		}
1145	}
1146
1147	/* 1/ if we prexor'd then the dest is reused as a source
1148	 * 2/ if we did not prexor then we are redoing the parity
1149	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1150	 * for the synchronous xor case
1151	 */
1152	flags = ASYNC_TX_ACK |
1153		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1154
1155	atomic_inc(&sh->count);
1156
1157	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1158			  to_addr_conv(sh, percpu));
1159	if (unlikely(count == 1))
1160		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1161	else
1162		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1163}
1164
1165static void
1166ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1167		     struct dma_async_tx_descriptor *tx)
1168{
1169	struct async_submit_ctl submit;
1170	struct page **blocks = percpu->scribble;
1171	int count;
1172
1173	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1174
1175	count = set_syndrome_sources(blocks, sh);
1176
1177	atomic_inc(&sh->count);
1178
1179	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1180			  sh, to_addr_conv(sh, percpu));
1181	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1182}
1183
1184static void ops_complete_check(void *stripe_head_ref)
1185{
1186	struct stripe_head *sh = stripe_head_ref;
1187
1188	pr_debug("%s: stripe %llu\n", __func__,
1189		(unsigned long long)sh->sector);
1190
1191	sh->check_state = check_state_check_result;
1192	set_bit(STRIPE_HANDLE, &sh->state);
1193	release_stripe(sh);
1194}
1195
1196static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1197{
1198	int disks = sh->disks;
1199	int pd_idx = sh->pd_idx;
1200	int qd_idx = sh->qd_idx;
1201	struct page *xor_dest;
1202	struct page **xor_srcs = percpu->scribble;
1203	struct dma_async_tx_descriptor *tx;
1204	struct async_submit_ctl submit;
1205	int count;
1206	int i;
1207
1208	pr_debug("%s: stripe %llu\n", __func__,
1209		(unsigned long long)sh->sector);
1210
1211	count = 0;
1212	xor_dest = sh->dev[pd_idx].page;
1213	xor_srcs[count++] = xor_dest;
1214	for (i = disks; i--; ) {
1215		if (i == pd_idx || i == qd_idx)
1216			continue;
1217		xor_srcs[count++] = sh->dev[i].page;
1218	}
1219
1220	init_async_submit(&submit, 0, NULL, NULL, NULL,
1221			  to_addr_conv(sh, percpu));
1222	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1223			   &sh->ops.zero_sum_result, &submit);
1224
1225	atomic_inc(&sh->count);
1226	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1227	tx = async_trigger_callback(&submit);
1228}
1229
1230static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1231{
1232	struct page **srcs = percpu->scribble;
1233	struct async_submit_ctl submit;
1234	int count;
1235
1236	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1237		(unsigned long long)sh->sector, checkp);
1238
1239	count = set_syndrome_sources(srcs, sh);
1240	if (!checkp)
1241		srcs[count] = NULL;
1242
1243	atomic_inc(&sh->count);
1244	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1245			  sh, to_addr_conv(sh, percpu));
1246	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1247			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1248}
1249
1250static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1251{
1252	int overlap_clear = 0, i, disks = sh->disks;
1253	struct dma_async_tx_descriptor *tx = NULL;
1254	struct r5conf *conf = sh->raid_conf;
1255	int level = conf->level;
1256	struct raid5_percpu *percpu;
1257	unsigned long cpu;
1258
1259	cpu = get_cpu();
1260	percpu = per_cpu_ptr(conf->percpu, cpu);
1261	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1262		ops_run_biofill(sh);
1263		overlap_clear++;
1264	}
1265
1266	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1267		if (level < 6)
1268			tx = ops_run_compute5(sh, percpu);
1269		else {
1270			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1271				tx = ops_run_compute6_1(sh, percpu);
1272			else
1273				tx = ops_run_compute6_2(sh, percpu);
1274		}
1275		/* terminate the chain if reconstruct is not set to be run */
1276		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1277			async_tx_ack(tx);
1278	}
1279
1280	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1281		tx = ops_run_prexor(sh, percpu, tx);
1282
1283	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1284		tx = ops_run_biodrain(sh, tx);
1285		overlap_clear++;
1286	}
1287
1288	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1289		if (level < 6)
1290			ops_run_reconstruct5(sh, percpu, tx);
1291		else
1292			ops_run_reconstruct6(sh, percpu, tx);
1293	}
1294
1295	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1296		if (sh->check_state == check_state_run)
1297			ops_run_check_p(sh, percpu);
1298		else if (sh->check_state == check_state_run_q)
1299			ops_run_check_pq(sh, percpu, 0);
1300		else if (sh->check_state == check_state_run_pq)
1301			ops_run_check_pq(sh, percpu, 1);
1302		else
1303			BUG();
1304	}
1305
1306	if (overlap_clear)
1307		for (i = disks; i--; ) {
1308			struct r5dev *dev = &sh->dev[i];
1309			if (test_and_clear_bit(R5_Overlap, &dev->flags))
1310				wake_up(&sh->raid_conf->wait_for_overlap);
1311		}
1312	put_cpu();
1313}
1314
1315#ifdef CONFIG_MULTICORE_RAID456
1316static void async_run_ops(void *param, async_cookie_t cookie)
1317{
1318	struct stripe_head *sh = param;
1319	unsigned long ops_request = sh->ops.request;
1320
1321	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1322	wake_up(&sh->ops.wait_for_ops);
1323
1324	__raid_run_ops(sh, ops_request);
1325	release_stripe(sh);
1326}
1327
1328static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1329{
1330	/* since handle_stripe can be called outside of raid5d context
1331	 * we need to ensure sh->ops.request is de-staged before another
1332	 * request arrives
1333	 */
1334	wait_event(sh->ops.wait_for_ops,
1335		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1336	sh->ops.request = ops_request;
1337
1338	atomic_inc(&sh->count);
1339	async_schedule(async_run_ops, sh);
1340}
1341#else
1342#define raid_run_ops __raid_run_ops
1343#endif
1344
1345static int grow_one_stripe(struct r5conf *conf)
1346{
1347	struct stripe_head *sh;
1348	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1349	if (!sh)
1350		return 0;
1351
1352	sh->raid_conf = conf;
1353	#ifdef CONFIG_MULTICORE_RAID456
1354	init_waitqueue_head(&sh->ops.wait_for_ops);
1355	#endif
1356
1357	if (grow_buffers(sh)) {
1358		shrink_buffers(sh);
1359		kmem_cache_free(conf->slab_cache, sh);
1360		return 0;
1361	}
1362	/* we just created an active stripe so... */
1363	atomic_set(&sh->count, 1);
1364	atomic_inc(&conf->active_stripes);
1365	INIT_LIST_HEAD(&sh->lru);
1366	release_stripe(sh);
1367	return 1;
1368}
1369
1370static int grow_stripes(struct r5conf *conf, int num)
1371{
1372	struct kmem_cache *sc;
1373	int devs = max(conf->raid_disks, conf->previous_raid_disks);
1374
1375	if (conf->mddev->gendisk)
1376		sprintf(conf->cache_name[0],
1377			"raid%d-%s", conf->level, mdname(conf->mddev));
1378	else
1379		sprintf(conf->cache_name[0],
1380			"raid%d-%p", conf->level, conf->mddev);
1381	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1382
1383	conf->active_name = 0;
1384	sc = kmem_cache_create(conf->cache_name[conf->active_name],
1385			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1386			       0, 0, NULL);
1387	if (!sc)
1388		return 1;
1389	conf->slab_cache = sc;
1390	conf->pool_size = devs;
1391	while (num--)
1392		if (!grow_one_stripe(conf))
1393			return 1;
1394	return 0;
1395}
1396
1397/**
1398 * scribble_len - return the required size of the scribble region
1399 * @num - total number of disks in the array
1400 *
1401 * The size must be enough to contain:
1402 * 1/ a struct page pointer for each device in the array +2
1403 * 2/ room to convert each entry in (1) to its corresponding dma
1404 *    (dma_map_page()) or page (page_address()) address.
1405 *
1406 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1407 * calculate over all devices (not just the data blocks), using zeros in place
1408 * of the P and Q blocks.
1409 */
1410static size_t scribble_len(int num)
1411{
1412	size_t len;
1413
1414	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1415
1416	return len;
1417}
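/*
 * Worked example (illustrative, 64-bit kernel): for an 8-device array,
 * num + 2 = 10, so the scribble region holds 10 struct page pointers
 * (80 bytes) followed by 10 addr_conv_t entries.  to_addr_conv() earlier
 * in this file relies on exactly this layout when it skips
 * sizeof(struct page *) * (sh->disks + 2) bytes to reach the
 * address-conversion area.
 */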
1418
1419static int resize_stripes(struct r5conf *conf, int newsize)
1420{
1421	/* Make all the stripes able to hold 'newsize' devices.
1422	 * New slots in each stripe get 'page' set to a new page.
1423	 *
1424	 * This happens in stages:
1425	 * 1/ create a new kmem_cache and allocate the required number of
1426	 *    stripe_heads.
1427	 * 2/ gather all the old stripe_heads and transfer the pages across
1428	 *    to the new stripe_heads.  This will have the side effect of
1429	 *    freezing the array as once all stripe_heads have been collected,
1430	 *    no IO will be possible.  Old stripe heads are freed once their
1431	 *    pages have been transferred over, and the old kmem_cache is
1432	 *    freed when all stripes are done.
1433	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
1434	 *    we simply return a failure status - no need to clean anything up.
1435	 * 4/ allocate new pages for the new slots in the new stripe_heads.
1436	 *    If this fails, we don't bother trying to shrink the
1437	 *    stripe_heads down again, we just leave them as they are.
1438	 *    As each stripe_head is processed the new one is released into
1439	 *    active service.
1440	 *
1441	 * Once step2 is started, we cannot afford to wait for a write,
1442	 * so we use GFP_NOIO allocations.
1443	 */
1444	struct stripe_head *osh, *nsh;
1445	LIST_HEAD(newstripes);
1446	struct disk_info *ndisks;
1447	unsigned long cpu;
1448	int err;
1449	struct kmem_cache *sc;
1450	int i;
1451
1452	if (newsize <= conf->pool_size)
1453		return 0; /* never bother to shrink */
1454
1455	err = md_allow_write(conf->mddev);
1456	if (err)
1457		return err;
1458
1459	/* Step 1 */
1460	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1461			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1462			       0, 0, NULL);
1463	if (!sc)
1464		return -ENOMEM;
1465
1466	for (i = conf->max_nr_stripes; i; i--) {
1467		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1468		if (!nsh)
1469			break;
1470
1471		nsh->raid_conf = conf;
1472		#ifdef CONFIG_MULTICORE_RAID456
1473		init_waitqueue_head(&nsh->ops.wait_for_ops);
1474		#endif
1475
1476		list_add(&nsh->lru, &newstripes);
1477	}
1478	if (i) {
1479		/* didn't get enough, give up */
1480		while (!list_empty(&newstripes)) {
1481			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1482			list_del(&nsh->lru);
1483			kmem_cache_free(sc, nsh);
1484		}
1485		kmem_cache_destroy(sc);
1486		return -ENOMEM;
1487	}
1488	/* Step 2 - Must use GFP_NOIO now.
1489	 * OK, we have enough stripes, start collecting inactive
1490	 * stripes and copying them over
1491	 */
1492	list_for_each_entry(nsh, &newstripes, lru) {
1493		spin_lock_irq(&conf->device_lock);
1494		wait_event_lock_irq(conf->wait_for_stripe,
1495				    !list_empty(&conf->inactive_list),
1496				    conf->device_lock,
1497				    );
1498		osh = get_free_stripe(conf);
1499		spin_unlock_irq(&conf->device_lock);
1500		atomic_set(&nsh->count, 1);
1501		for(i=0; i<conf->pool_size; i++)
1502			nsh->dev[i].page = osh->dev[i].page;
1503		for( ; i<newsize; i++)
1504			nsh->dev[i].page = NULL;
1505		kmem_cache_free(conf->slab_cache, osh);
1506	}
1507	kmem_cache_destroy(conf->slab_cache);
1508
1509	/* Step 3.
1510	 * At this point, we are holding all the stripes so the array
1511	 * is completely stalled, so now is a good time to resize
1512	 * conf->disks and the scribble region
1513	 */
1514	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1515	if (ndisks) {
1516		for (i=0; i<conf->raid_disks; i++)
1517			ndisks[i] = conf->disks[i];
1518		kfree(conf->disks);
1519		conf->disks = ndisks;
1520	} else
1521		err = -ENOMEM;
1522
1523	get_online_cpus();
1524	conf->scribble_len = scribble_len(newsize);
1525	for_each_present_cpu(cpu) {
1526		struct raid5_percpu *percpu;
1527		void *scribble;
1528
1529		percpu = per_cpu_ptr(conf->percpu, cpu);
1530		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1531
1532		if (scribble) {
1533			kfree(percpu->scribble);
1534			percpu->scribble = scribble;
1535		} else {
1536			err = -ENOMEM;
1537			break;
1538		}
1539	}
1540	put_online_cpus();
1541
1542	/* Step 4, return new stripes to service */
1543	while(!list_empty(&newstripes)) {
1544		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1545		list_del_init(&nsh->lru);
1546
1547		for (i=conf->raid_disks; i < newsize; i++)
1548			if (nsh->dev[i].page == NULL) {
1549				struct page *p = alloc_page(GFP_NOIO);
1550				nsh->dev[i].page = p;
1551				if (!p)
1552					err = -ENOMEM;
1553			}
1554		release_stripe(nsh);
1555	}
1556	/* critical section passed, GFP_NOIO no longer needed */
1557
1558	conf->slab_cache = sc;
1559	conf->active_name = 1-conf->active_name;
1560	conf->pool_size = newsize;
1561	return err;
1562}
1563
1564static int drop_one_stripe(struct r5conf *conf)
1565{
1566	struct stripe_head *sh;
1567
1568	spin_lock_irq(&conf->device_lock);
1569	sh = get_free_stripe(conf);
1570	spin_unlock_irq(&conf->device_lock);
1571	if (!sh)
1572		return 0;
1573	BUG_ON(atomic_read(&sh->count));
1574	shrink_buffers(sh);
1575	kmem_cache_free(conf->slab_cache, sh);
1576	atomic_dec(&conf->active_stripes);
1577	return 1;
1578}
1579
1580static void shrink_stripes(struct r5conf *conf)
1581{
1582	while (drop_one_stripe(conf))
1583		;
1584
1585	if (conf->slab_cache)
1586		kmem_cache_destroy(conf->slab_cache);
1587	conf->slab_cache = NULL;
1588}
1589
1590static void raid5_end_read_request(struct bio * bi, int error)
1591{
1592	struct stripe_head *sh = bi->bi_private;
1593	struct r5conf *conf = sh->raid_conf;
1594	int disks = sh->disks, i;
1595	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1596	char b[BDEVNAME_SIZE];
1597	struct md_rdev *rdev;
1598
1599
1600	for (i=0 ; i<disks; i++)
1601		if (bi == &sh->dev[i].req)
1602			break;
1603
1604	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1605		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1606		uptodate);
1607	if (i == disks) {
1608		BUG();
1609		return;
1610	}
1611
1612	if (uptodate) {
1613		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1614		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1615			rdev = conf->disks[i].rdev;
1616			printk_ratelimited(
1617				KERN_INFO
1618				"md/raid:%s: read error corrected"
1619				" (%lu sectors at %llu on %s)\n",
1620				mdname(conf->mddev), STRIPE_SECTORS,
1621				(unsigned long long)(sh->sector
1622						     + rdev->data_offset),
1623				bdevname(rdev->bdev, b));
1624			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1625			clear_bit(R5_ReadError, &sh->dev[i].flags);
1626			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1627		}
1628		if (atomic_read(&conf->disks[i].rdev->read_errors))
1629			atomic_set(&conf->disks[i].rdev->read_errors, 0);
1630	} else {
1631		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1632		int retry = 0;
1633		rdev = conf->disks[i].rdev;
1634
1635		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1636		atomic_inc(&rdev->read_errors);
1637		if (conf->mddev->degraded >= conf->max_degraded)
1638			printk_ratelimited(
1639				KERN_WARNING
1640				"md/raid:%s: read error not correctable "
1641				"(sector %llu on %s).\n",
1642				mdname(conf->mddev),
1643				(unsigned long long)(sh->sector
1644						     + rdev->data_offset),
1645				bdn);
1646		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1647			/* Oh, no!!! */
1648			printk_ratelimited(
1649				KERN_WARNING
1650				"md/raid:%s: read error NOT corrected!! "
1651				"(sector %llu on %s).\n",
1652				mdname(conf->mddev),
1653				(unsigned long long)(sh->sector
1654						     + rdev->data_offset),
1655				bdn);
1656		else if (atomic_read(&rdev->read_errors)
1657			 > conf->max_nr_stripes)
1658			printk(KERN_WARNING
1659			       "md/raid:%s: Too many read errors, failing device %s.\n",
1660			       mdname(conf->mddev), bdn);
1661		else
1662			retry = 1;
1663		if (retry)
1664			set_bit(R5_ReadError, &sh->dev[i].flags);
1665		else {
1666			clear_bit(R5_ReadError, &sh->dev[i].flags);
1667			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1668			md_error(conf->mddev, rdev);
1669		}
1670	}
1671	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1672	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1673	set_bit(STRIPE_HANDLE, &sh->state);
1674	release_stripe(sh);
1675}
1676
1677static void raid5_end_write_request(struct bio *bi, int error)
1678{
1679	struct stripe_head *sh = bi->bi_private;
1680	struct r5conf *conf = sh->raid_conf;
1681	int disks = sh->disks, i;
1682	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1683	sector_t first_bad;
1684	int bad_sectors;
1685
1686	for (i=0 ; i<disks; i++)
1687		if (bi == &sh->dev[i].req)
1688			break;
1689
1690	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1691		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1692		uptodate);
1693	if (i == disks) {
1694		BUG();
1695		return;
1696	}
1697
1698	if (!uptodate) {
1699		set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1700		set_bit(R5_WriteError, &sh->dev[i].flags);
1701	} else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1702			       &first_bad, &bad_sectors))
1703		set_bit(R5_MadeGood, &sh->dev[i].flags);
1704
1705	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1706
1707	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1708	set_bit(STRIPE_HANDLE, &sh->state);
1709	release_stripe(sh);
1710}
1711
1712
1713static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1714
1715static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1716{
1717	struct r5dev *dev = &sh->dev[i];
1718
1719	bio_init(&dev->req);
1720	dev->req.bi_io_vec = &dev->vec;
1721	dev->req.bi_vcnt++;
1722	dev->req.bi_max_vecs++;
1723	dev->vec.bv_page = dev->page;
1724	dev->vec.bv_len = STRIPE_SIZE;
1725	dev->vec.bv_offset = 0;
1726
1727	dev->req.bi_sector = sh->sector;
1728	dev->req.bi_private = sh;
1729
1730	dev->flags = 0;
1731	dev->sector = compute_blocknr(sh, i, previous);
1732}
1733
1734static void error(struct mddev *mddev, struct md_rdev *rdev)
1735{
1736	char b[BDEVNAME_SIZE];
1737	struct r5conf *conf = mddev->private;
1738	unsigned long flags;
1739	pr_debug("raid456: error called\n");
1740
1741	spin_lock_irqsave(&conf->device_lock, flags);
1742	clear_bit(In_sync, &rdev->flags);
1743	mddev->degraded = calc_degraded(conf);
1744	spin_unlock_irqrestore(&conf->device_lock, flags);
1745	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1746
1747	set_bit(Blocked, &rdev->flags);
1748	set_bit(Faulty, &rdev->flags);
1749	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1750	printk(KERN_ALERT
1751	       "md/raid:%s: Disk failure on %s, disabling device.\n"
1752	       "md/raid:%s: Operation continuing on %d devices.\n",
1753	       mdname(mddev),
1754	       bdevname(rdev->bdev, b),
1755	       mdname(mddev),
1756	       conf->raid_disks - mddev->degraded);
1757}
1758
1759/*
1760 * Input: a 'big' sector number,
1761 * Output: index of the data and parity disk, and the sector # in them.
1762 */
1763static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
1764				     int previous, int *dd_idx,
1765				     struct stripe_head *sh)
1766{
1767	sector_t stripe, stripe2;
1768	sector_t chunk_number;
1769	unsigned int chunk_offset;
1770	int pd_idx, qd_idx;
1771	int ddf_layout = 0;
1772	sector_t new_sector;
1773	int algorithm = previous ? conf->prev_algo
1774				 : conf->algorithm;
1775	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1776					 : conf->chunk_sectors;
1777	int raid_disks = previous ? conf->previous_raid_disks
1778				  : conf->raid_disks;
1779	int data_disks = raid_disks - conf->max_degraded;
1780
1781	/* First compute the information on this sector */
1782
1783	/*
1784	 * Compute the chunk number and the sector offset inside the chunk
1785	 */
1786	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1787	chunk_number = r_sector;
1788
1789	/*
1790	 * Compute the stripe number
1791	 */
1792	stripe = chunk_number;
1793	*dd_idx = sector_div(stripe, data_disks);
1794	stripe2 = stripe;
1795	/*
1796	 * Select the parity disk based on the user selected algorithm.
1797	 */
1798	pd_idx = qd_idx = -1;
1799	switch(conf->level) {
1800	case 4:
1801		pd_idx = data_disks;
1802		break;
1803	case 5:
1804		switch (algorithm) {
1805		case ALGORITHM_LEFT_ASYMMETRIC:
1806			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1807			if (*dd_idx >= pd_idx)
1808				(*dd_idx)++;
1809			break;
1810		case ALGORITHM_RIGHT_ASYMMETRIC:
1811			pd_idx = sector_div(stripe2, raid_disks);
1812			if (*dd_idx >= pd_idx)
1813				(*dd_idx)++;
1814			break;
1815		case ALGORITHM_LEFT_SYMMETRIC:
1816			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1817			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1818			break;
1819		case ALGORITHM_RIGHT_SYMMETRIC:
1820			pd_idx = sector_div(stripe2, raid_disks);
1821			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1822			break;
1823		case ALGORITHM_PARITY_0:
1824			pd_idx = 0;
1825			(*dd_idx)++;
1826			break;
1827		case ALGORITHM_PARITY_N:
1828			pd_idx = data_disks;
1829			break;
1830		default:
1831			BUG();
1832		}
1833		break;
1834	case 6:
1835
1836		switch (algorithm) {
1837		case ALGORITHM_LEFT_ASYMMETRIC:
1838			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1839			qd_idx = pd_idx + 1;
1840			if (pd_idx == raid_disks-1) {
1841				(*dd_idx)++;	/* Q D D D P */
1842				qd_idx = 0;
1843			} else if (*dd_idx >= pd_idx)
1844				(*dd_idx) += 2; /* D D P Q D */
1845			break;
1846		case ALGORITHM_RIGHT_ASYMMETRIC:
1847			pd_idx = sector_div(stripe2, raid_disks);
1848			qd_idx = pd_idx + 1;
1849			if (pd_idx == raid_disks-1) {
1850				(*dd_idx)++;	/* Q D D D P */
1851				qd_idx = 0;
1852			} else if (*dd_idx >= pd_idx)
1853				(*dd_idx) += 2; /* D D P Q D */
1854			break;
1855		case ALGORITHM_LEFT_SYMMETRIC:
1856			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1857			qd_idx = (pd_idx + 1) % raid_disks;
1858			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1859			break;
1860		case ALGORITHM_RIGHT_SYMMETRIC:
1861			pd_idx = sector_div(stripe2, raid_disks);
1862			qd_idx = (pd_idx + 1) % raid_disks;
1863			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1864			break;
1865
1866		case ALGORITHM_PARITY_0:
1867			pd_idx = 0;
1868			qd_idx = 1;
1869			(*dd_idx) += 2;
1870			break;
1871		case ALGORITHM_PARITY_N:
1872			pd_idx = data_disks;
1873			qd_idx = data_disks + 1;
1874			break;
1875
1876		case ALGORITHM_ROTATING_ZERO_RESTART:
1877			/* Exactly the same as RIGHT_ASYMMETRIC, but the
1878			 * order of blocks for computing Q is different.
1879			 */
1880			pd_idx = sector_div(stripe2, raid_disks);
1881			qd_idx = pd_idx + 1;
1882			if (pd_idx == raid_disks-1) {
1883				(*dd_idx)++;	/* Q D D D P */
1884				qd_idx = 0;
1885			} else if (*dd_idx >= pd_idx)
1886				(*dd_idx) += 2; /* D D P Q D */
1887			ddf_layout = 1;
1888			break;
1889
1890		case ALGORITHM_ROTATING_N_RESTART:
1891			/* Same as left_asymmetric, but the first stripe is
1892			 * D D D P Q  rather than
1893			 * Q D D D P
1894			 */
1895			stripe2 += 1;
1896			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1897			qd_idx = pd_idx + 1;
1898			if (pd_idx == raid_disks-1) {
1899				(*dd_idx)++;	/* Q D D D P */
1900				qd_idx = 0;
1901			} else if (*dd_idx >= pd_idx)
1902				(*dd_idx) += 2; /* D D P Q D */
1903			ddf_layout = 1;
1904			break;
1905
1906		case ALGORITHM_ROTATING_N_CONTINUE:
1907			/* Same as left_symmetric but Q is before P */
1908			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1909			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1910			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1911			ddf_layout = 1;
1912			break;
1913
1914		case ALGORITHM_LEFT_ASYMMETRIC_6:
1915			/* RAID5 left_asymmetric, with Q on last device */
1916			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1917			if (*dd_idx >= pd_idx)
1918				(*dd_idx)++;
1919			qd_idx = raid_disks - 1;
1920			break;
1921
1922		case ALGORITHM_RIGHT_ASYMMETRIC_6:
1923			pd_idx = sector_div(stripe2, raid_disks-1);
1924			if (*dd_idx >= pd_idx)
1925				(*dd_idx)++;
1926			qd_idx = raid_disks - 1;
1927			break;
1928
1929		case ALGORITHM_LEFT_SYMMETRIC_6:
1930			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1931			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1932			qd_idx = raid_disks - 1;
1933			break;
1934
1935		case ALGORITHM_RIGHT_SYMMETRIC_6:
1936			pd_idx = sector_div(stripe2, raid_disks-1);
1937			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1938			qd_idx = raid_disks - 1;
1939			break;
1940
1941		case ALGORITHM_PARITY_0_6:
1942			pd_idx = 0;
1943			(*dd_idx)++;
1944			qd_idx = raid_disks - 1;
1945			break;
1946
1947		default:
1948			BUG();
1949		}
1950		break;
1951	}
1952
1953	if (sh) {
1954		sh->pd_idx = pd_idx;
1955		sh->qd_idx = qd_idx;
1956		sh->ddf_layout = ddf_layout;
1957	}
1958	/*
1959	 * Finally, compute the new sector number
1960	 */
1961	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1962	return new_sector;
1963}
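
/*
 * Illustration only (not driver code, and never called): a minimal sketch
 * of the forward mapping performed above, for a hypothetical RAID5 array
 * using ALGORITHM_LEFT_SYMMETRIC with 4 devices and 64-sector chunks.
 * The function name and constants are assumptions chosen for the example;
 * the real code uses the conf-> values and sector_div().
 */
static inline void example_map_left_symmetric(unsigned long long r_sector,
					      int *dd_idx, int *pd_idx,
					      unsigned long long *new_sector)
{
	const int raid_disks = 4;		/* assumed array width */
	const int data_disks = raid_disks - 1;	/* one parity device */
	const int sectors_per_chunk = 64;	/* assumed chunk size */
	unsigned long long chunk_number, stripe;
	unsigned int chunk_offset;

	/* split the array sector into a chunk number and an offset */
	chunk_offset = r_sector % sectors_per_chunk;
	chunk_number = r_sector / sectors_per_chunk;

	/* which data slot, and which stripe of chunks, holds it */
	*dd_idx = chunk_number % data_disks;
	stripe = chunk_number / data_disks;

	/* parity rotates across the devices; data slots shift past it */
	*pd_idx = data_disks - (int)(stripe % raid_disks);
	*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;

	/* sector within the chosen device */
	*new_sector = stripe * sectors_per_chunk + chunk_offset;
}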
1964
1965
1966static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1967{
1968	struct r5conf *conf = sh->raid_conf;
1969	int raid_disks = sh->disks;
1970	int data_disks = raid_disks - conf->max_degraded;
1971	sector_t new_sector = sh->sector, check;
1972	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1973					 : conf->chunk_sectors;
1974	int algorithm = previous ? conf->prev_algo
1975				 : conf->algorithm;
1976	sector_t stripe;
1977	int chunk_offset;
1978	sector_t chunk_number;
1979	int dummy1, dd_idx = i;
1980	sector_t r_sector;
1981	struct stripe_head sh2;
1982
1983
1984	chunk_offset = sector_div(new_sector, sectors_per_chunk);
1985	stripe = new_sector;
1986
1987	if (i == sh->pd_idx)
1988		return 0;
1989	switch(conf->level) {
1990	case 4: break;
1991	case 5:
1992		switch (algorithm) {
1993		case ALGORITHM_LEFT_ASYMMETRIC:
1994		case ALGORITHM_RIGHT_ASYMMETRIC:
1995			if (i > sh->pd_idx)
1996				i--;
1997			break;
1998		case ALGORITHM_LEFT_SYMMETRIC:
1999		case ALGORITHM_RIGHT_SYMMETRIC:
2000			if (i < sh->pd_idx)
2001				i += raid_disks;
2002			i -= (sh->pd_idx + 1);
2003			break;
2004		case ALGORITHM_PARITY_0:
2005			i -= 1;
2006			break;
2007		case ALGORITHM_PARITY_N:
2008			break;
2009		default:
2010			BUG();
2011		}
2012		break;
2013	case 6:
2014		if (i == sh->qd_idx)
2015			return 0; /* It is the Q disk */
2016		switch (algorithm) {
2017		case ALGORITHM_LEFT_ASYMMETRIC:
2018		case ALGORITHM_RIGHT_ASYMMETRIC:
2019		case ALGORITHM_ROTATING_ZERO_RESTART:
2020		case ALGORITHM_ROTATING_N_RESTART:
2021			if (sh->pd_idx == raid_disks-1)
2022				i--;	/* Q D D D P */
2023			else if (i > sh->pd_idx)
2024				i -= 2; /* D D P Q D */
2025			break;
2026		case ALGORITHM_LEFT_SYMMETRIC:
2027		case ALGORITHM_RIGHT_SYMMETRIC:
2028			if (sh->pd_idx == raid_disks-1)
2029				i--; /* Q D D D P */
2030			else {
2031				/* D D P Q D */
2032				if (i < sh->pd_idx)
2033					i += raid_disks;
2034				i -= (sh->pd_idx + 2);
2035			}
2036			break;
2037		case ALGORITHM_PARITY_0:
2038			i -= 2;
2039			break;
2040		case ALGORITHM_PARITY_N:
2041			break;
2042		case ALGORITHM_ROTATING_N_CONTINUE:
2043			/* Like left_symmetric, but P is before Q */
2044			if (sh->pd_idx == 0)
2045				i--;	/* P D D D Q */
2046			else {
2047				/* D D Q P D */
2048				if (i < sh->pd_idx)
2049					i += raid_disks;
2050				i -= (sh->pd_idx + 1);
2051			}
2052			break;
2053		case ALGORITHM_LEFT_ASYMMETRIC_6:
2054		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2055			if (i > sh->pd_idx)
2056				i--;
2057			break;
2058		case ALGORITHM_LEFT_SYMMETRIC_6:
2059		case ALGORITHM_RIGHT_SYMMETRIC_6:
2060			if (i < sh->pd_idx)
2061				i += data_disks + 1;
2062			i -= (sh->pd_idx + 1);
2063			break;
2064		case ALGORITHM_PARITY_0_6:
2065			i -= 1;
2066			break;
2067		default:
2068			BUG();
2069		}
2070		break;
2071	}
2072
2073	chunk_number = stripe * data_disks + i;
2074	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2075
2076	check = raid5_compute_sector(conf, r_sector,
2077				     previous, &dummy1, &sh2);
2078	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2079		|| sh2.qd_idx != sh->qd_idx) {
2080		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2081		       mdname(conf->mddev));
2082		return 0;
2083	}
2084	return r_sector;
2085}
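
/*
 * Illustration only (not driver code): the inverse of the sketch above for
 * the same hypothetical 4-device LEFT_SYMMETRIC layout.  Given a
 * device-local sector, the slot index 'i' of a data block and the parity
 * slot for that stripe, recover the array-wide sector by undoing the
 * rotation, mirroring what compute_blocknr() does before it re-checks the
 * result with raid5_compute_sector().
 */
static inline unsigned long long
example_unmap_left_symmetric(unsigned long long new_sector, int i, int pd_idx)
{
	const int raid_disks = 4;		/* assumed array width */
	const int data_disks = raid_disks - 1;
	const int sectors_per_chunk = 64;	/* assumed chunk size */
	unsigned long long stripe = new_sector / sectors_per_chunk;
	unsigned int chunk_offset = new_sector % sectors_per_chunk;

	/* undo *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks */
	if (i < pd_idx)
		i += raid_disks;
	i -= (pd_idx + 1);

	/* array-wide chunk number, then back to a sector */
	return (stripe * data_disks + i) * sectors_per_chunk + chunk_offset;
}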
2086
2087
2088static void
2089schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2090			 int rcw, int expand)
2091{
2092	int i, pd_idx = sh->pd_idx, disks = sh->disks;
2093	struct r5conf *conf = sh->raid_conf;
2094	int level = conf->level;
2095
2096	if (rcw) {
2097		/* if we are not expanding this is a proper write request, and
2098		 * there will be bios with new data to be drained into the
2099		 * stripe cache
2100		 */
2101		if (!expand) {
2102			sh->reconstruct_state = reconstruct_state_drain_run;
2103			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2104		} else
2105			sh->reconstruct_state = reconstruct_state_run;
2106
2107		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2108
2109		for (i = disks; i--; ) {
2110			struct r5dev *dev = &sh->dev[i];
2111
2112			if (dev->towrite) {
2113				set_bit(R5_LOCKED, &dev->flags);
2114				set_bit(R5_Wantdrain, &dev->flags);
2115				if (!expand)
2116					clear_bit(R5_UPTODATE, &dev->flags);
2117				s->locked++;
2118			}
2119		}
2120		if (s->locked + conf->max_degraded == disks)
2121			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2122				atomic_inc(&conf->pending_full_writes);
2123	} else {
2124		BUG_ON(level == 6);
2125		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2126			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2127
2128		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2129		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2130		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2131		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2132
2133		for (i = disks; i--; ) {
2134			struct r5dev *dev = &sh->dev[i];
2135			if (i == pd_idx)
2136				continue;
2137
2138			if (dev->towrite &&
2139			    (test_bit(R5_UPTODATE, &dev->flags) ||
2140			     test_bit(R5_Wantcompute, &dev->flags))) {
2141				set_bit(R5_Wantdrain, &dev->flags);
2142				set_bit(R5_LOCKED, &dev->flags);
2143				clear_bit(R5_UPTODATE, &dev->flags);
2144				s->locked++;
2145			}
2146		}
2147	}
2148
2149	/* keep the parity disk(s) locked while asynchronous operations
2150	 * are in flight
2151	 */
2152	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2153	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2154	s->locked++;
2155
2156	if (level == 6) {
2157		int qd_idx = sh->qd_idx;
2158		struct r5dev *dev = &sh->dev[qd_idx];
2159
2160		set_bit(R5_LOCKED, &dev->flags);
2161		clear_bit(R5_UPTODATE, &dev->flags);
2162		s->locked++;
2163	}
2164
2165	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2166		__func__, (unsigned long long)sh->sector,
2167		s->locked, s->ops_request);
2168}
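
/*
 * In short, the two paths above are (a summary, not new behaviour):
 *
 *   rcw != 0  reconstruct-write: queue BIODRAIN (unless expanding) and
 *             RECONSTRUCT; every block with pending writes is locked and
 *             marked R5_Wantdrain so the new data is copied in before the
 *             parity is regenerated from the whole stripe.
 *
 *   rcw == 0  read-modify-write (RAID5 only): queue PREXOR, BIODRAIN and
 *             RECONSTRUCT; the old contents of the blocks being written
 *             are xor-ed out of the parity, the new data is drained in,
 *             and then xor-ed back into the parity.
 *
 * In both cases the parity device(s) stay locked until the chain runs.
 */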
2169
2170/*
2171 * Each stripe/dev can have one or more bios attached.
2172 * toread/towrite point to the first in a chain.
2173 * The bi_next chain must be in order.
2174 */
2175static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2176{
2177	struct bio **bip;
2178	struct r5conf *conf = sh->raid_conf;
2179	int firstwrite=0;
2180
2181	pr_debug("adding bi b#%llu to stripe s#%llu\n",
2182		(unsigned long long)bi->bi_sector,
2183		(unsigned long long)sh->sector);
2184
2185
2186	spin_lock_irq(&conf->device_lock);
2187	if (forwrite) {
2188		bip = &sh->dev[dd_idx].towrite;
2189		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2190			firstwrite = 1;
2191	} else
2192		bip = &sh->dev[dd_idx].toread;
2193	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2194		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2195			goto overlap;
2196		bip = & (*bip)->bi_next;
2197	}
2198	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2199		goto overlap;
2200
2201	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2202	if (*bip)
2203		bi->bi_next = *bip;
2204	*bip = bi;
2205	bi->bi_phys_segments++;
2206
2207	if (forwrite) {
2208		/* check if page is covered */
2209		sector_t sector = sh->dev[dd_idx].sector;
2210		for (bi=sh->dev[dd_idx].towrite;
2211		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2212			     bi && bi->bi_sector <= sector;
2213		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2214			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2215				sector = bi->bi_sector + (bi->bi_size>>9);
2216		}
2217		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2218			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2219	}
2220	spin_unlock_irq(&conf->device_lock);
2221
2222	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2223		(unsigned long long)(*bip)->bi_sector,
2224		(unsigned long long)sh->sector, dd_idx);
2225
2226	if (conf->mddev->bitmap && firstwrite) {
2227		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2228				  STRIPE_SECTORS, 0);
2229		sh->bm_seq = conf->seq_flush+1;
2230		set_bit(STRIPE_BIT_DELAY, &sh->state);
2231	}
2232	return 1;
2233
2234 overlap:
2235	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2236	spin_unlock_irq(&conf->device_lock);
2237	return 0;
2238}
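
/*
 * Illustration only (not driver code): the list discipline enforced by
 * add_stripe_bio() above, shown on a simplified interval type.  Entries
 * are kept in ascending start order and a new entry is rejected if it
 * overlaps either its predecessor or its successor, which is exactly the
 * check performed on the towrite/toread chains.  'struct example_iv' and
 * example_iv_insert() are hypothetical names used only for this sketch.
 */
struct example_iv {
	unsigned long long start, len;
	struct example_iv *next;
};

static inline int example_iv_insert(struct example_iv **head,
				    struct example_iv *iv)
{
	struct example_iv **p = head;

	/* walk to the insertion point, watching for a predecessor overlap */
	while (*p && (*p)->start < iv->start) {
		if ((*p)->start + (*p)->len > iv->start)
			return 0;	/* predecessor runs into the new entry */
		p = &(*p)->next;
	}
	/* the next entry must start at or beyond the end of the new one */
	if (*p && (*p)->start < iv->start + iv->len)
		return 0;

	iv->next = *p;
	*p = iv;
	return 1;
}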
2239
2240static void end_reshape(struct r5conf *conf);
2241
2242static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2243			    struct stripe_head *sh)
2244{
2245	int sectors_per_chunk =
2246		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2247	int dd_idx;
2248	int chunk_offset = sector_div(stripe, sectors_per_chunk);
2249	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2250
2251	raid5_compute_sector(conf,
2252			     stripe * (disks - conf->max_degraded)
2253			     *sectors_per_chunk + chunk_offset,
2254			     previous,
2255			     &dd_idx, sh);
2256}
2257
2258static void
2259handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2260				struct stripe_head_state *s, int disks,
2261				struct bio **return_bi)
2262{
2263	int i;
2264	for (i = disks; i--; ) {
2265		struct bio *bi;
2266		int bitmap_end = 0;
2267
2268		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2269			struct md_rdev *rdev;
2270			rcu_read_lock();
2271			rdev = rcu_dereference(conf->disks[i].rdev);
2272			if (rdev && test_bit(In_sync, &rdev->flags))
2273				atomic_inc(&rdev->nr_pending);
2274			else
2275				rdev = NULL;
2276			rcu_read_unlock();
2277			if (rdev) {
2278				if (!rdev_set_badblocks(
2279					    rdev,
2280					    sh->sector,
2281					    STRIPE_SECTORS, 0))
2282					md_error(conf->mddev, rdev);
2283				rdev_dec_pending(rdev, conf->mddev);
2284			}
2285		}
2286		spin_lock_irq(&conf->device_lock);
2287		/* fail all writes first */
2288		bi = sh->dev[i].towrite;
2289		sh->dev[i].towrite = NULL;
2290		if (bi) {
2291			s->to_write--;
2292			bitmap_end = 1;
2293		}
2294
2295		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2296			wake_up(&conf->wait_for_overlap);
2297
2298		while (bi && bi->bi_sector <
2299			sh->dev[i].sector + STRIPE_SECTORS) {
2300			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2301			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2302			if (!raid5_dec_bi_phys_segments(bi)) {
2303				md_write_end(conf->mddev);
2304				bi->bi_next = *return_bi;
2305				*return_bi = bi;
2306			}
2307			bi = nextbi;
2308		}
2309		/* and fail all 'written' */
2310		bi = sh->dev[i].written;
2311		sh->dev[i].written = NULL;
2312		if (bi) bitmap_end = 1;
2313		while (bi && bi->bi_sector <
2314		       sh->dev[i].sector + STRIPE_SECTORS) {
2315			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2316			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2317			if (!raid5_dec_bi_phys_segments(bi)) {
2318				md_write_end(conf->mddev);
2319				bi->bi_next = *return_bi;
2320				*return_bi = bi;
2321			}
2322			bi = bi2;
2323		}
2324
2325		/* fail any reads if this device is non-operational and
2326		 * the data has not reached the cache yet.
2327		 */
2328		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2329		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2330		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
2331			bi = sh->dev[i].toread;
2332			sh->dev[i].toread = NULL;
2333			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2334				wake_up(&conf->wait_for_overlap);
2335			if (bi) s->to_read--;
2336			while (bi && bi->bi_sector <
2337			       sh->dev[i].sector + STRIPE_SECTORS) {
2338				struct bio *nextbi =
2339					r5_next_bio(bi, sh->dev[i].sector);
2340				clear_bit(BIO_UPTODATE, &bi->bi_flags);
2341				if (!raid5_dec_bi_phys_segments(bi)) {
2342					bi->bi_next = *return_bi;
2343					*return_bi = bi;
2344				}
2345				bi = nextbi;
2346			}
2347		}
2348		spin_unlock_irq(&conf->device_lock);
2349		if (bitmap_end)
2350			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2351					STRIPE_SECTORS, 0, 0);
2352		/* If we were in the middle of a write the parity block might
2353		 * still be locked - so just clear all R5_LOCKED flags
2354		 */
2355		clear_bit(R5_LOCKED, &sh->dev[i].flags);
2356	}
2357
2358	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2359		if (atomic_dec_and_test(&conf->pending_full_writes))
2360			md_wakeup_thread(conf->mddev->thread);
2361}
2362
2363static void
2364handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2365		   struct stripe_head_state *s)
2366{
2367	int abort = 0;
2368	int i;
2369
2370	md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2371	clear_bit(STRIPE_SYNCING, &sh->state);
2372	s->syncing = 0;
2373	/* There is nothing more to do for sync/check/repair.
2374	 * For recovery we need to record a bad block on all
2375	 * non-sync devices, or abort the recovery
2376	 */
2377	if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2378		return;
2379	/* During recovery devices cannot be removed, so locking and
2380	 * refcounting of rdevs is not needed
2381	 */
2382	for (i = 0; i < conf->raid_disks; i++) {
2383		struct md_rdev *rdev = conf->disks[i].rdev;
2384		if (!rdev
2385		    || test_bit(Faulty, &rdev->flags)
2386		    || test_bit(In_sync, &rdev->flags))
2387			continue;
2388		if (!rdev_set_badblocks(rdev, sh->sector,
2389					STRIPE_SECTORS, 0))
2390			abort = 1;
2391	}
2392	if (abort) {
2393		conf->recovery_disabled = conf->mddev->recovery_disabled;
2394		set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2395	}
2396}
2397
2398/* fetch_block - checks the given member device to see if its data needs
2399 * to be read or computed to satisfy a request.
2400 *
2401 * Returns 1 when no more member devices need to be checked, otherwise returns
2402 * 0 to tell the loop in handle_stripe_fill to continue
2403 */
2404static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2405		       int disk_idx, int disks)
2406{
2407	struct r5dev *dev = &sh->dev[disk_idx];
2408	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2409				  &sh->dev[s->failed_num[1]] };
2410
2411	/* is the data in this block needed, and can we get it? */
2412	if (!test_bit(R5_LOCKED, &dev->flags) &&
2413	    !test_bit(R5_UPTODATE, &dev->flags) &&
2414	    (dev->toread ||
2415	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2416	     s->syncing || s->expanding ||
2417	     (s->failed >= 1 && fdev[0]->toread) ||
2418	     (s->failed >= 2 && fdev[1]->toread) ||
2419	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2420	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2421	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2422		/* we would like to get this block, possibly by computing it,
2423		 * otherwise read it if the backing disk is insync
2424		 */
2425		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2426		BUG_ON(test_bit(R5_Wantread, &dev->flags));
2427		if ((s->uptodate == disks - 1) &&
2428		    (s->failed && (disk_idx == s->failed_num[0] ||
2429				   disk_idx == s->failed_num[1]))) {
2430			/* the disk has failed and this block has been
2431			 * requested, so compute it
2432			 */
2433			pr_debug("Computing stripe %llu block %d\n",
2434			       (unsigned long long)sh->sector, disk_idx);
2435			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2436			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2437			set_bit(R5_Wantcompute, &dev->flags);
2438			sh->ops.target = disk_idx;
2439			sh->ops.target2 = -1; /* no 2nd target */
2440			s->req_compute = 1;
2441			/* Careful: from this point on 'uptodate' is in the eye
2442			 * of raid_run_ops which services 'compute' operations
2443			 * before writes. R5_Wantcompute flags a block that will
2444			 * be R5_UPTODATE by the time it is needed for a
2445			 * subsequent operation.
2446			 */
2447			s->uptodate++;
2448			return 1;
2449		} else if (s->uptodate == disks-2 && s->failed >= 2) {
2450			/* Computing 2-failure is *very* expensive; only
2451			 * do it if failed >= 2
2452			 */
2453			int other;
2454			for (other = disks; other--; ) {
2455				if (other == disk_idx)
2456					continue;
2457				if (!test_bit(R5_UPTODATE,
2458				      &sh->dev[other].flags))
2459					break;
2460			}
2461			BUG_ON(other < 0);
2462			pr_debug("Computing stripe %llu blocks %d,%d\n",
2463			       (unsigned long long)sh->sector,
2464			       disk_idx, other);
2465			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2466			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2467			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2468			set_bit(R5_Wantcompute, &sh->dev[other].flags);
2469			sh->ops.target = disk_idx;
2470			sh->ops.target2 = other;
2471			s->uptodate += 2;
2472			s->req_compute = 1;
2473			return 1;
2474		} else if (test_bit(R5_Insync, &dev->flags)) {
2475			set_bit(R5_LOCKED, &dev->flags);
2476			set_bit(R5_Wantread, &dev->flags);
2477			s->locked++;
2478			pr_debug("Reading block %d (sync=%d)\n",
2479				disk_idx, s->syncing);
2480		}
2481	}
2482
2483	return 0;
2484}
2485
2486/**
2487 * handle_stripe_fill - read or compute data to satisfy pending requests.
2488 */
2489static void handle_stripe_fill(struct stripe_head *sh,
2490			       struct stripe_head_state *s,
2491			       int disks)
2492{
2493	int i;
2494
2495	/* look for blocks to read/compute, skip this if a compute
2496	 * is already in flight, or if the stripe contents are in the
2497	 * midst of changing due to a write
2498	 */
2499	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2500	    !sh->reconstruct_state)
2501		for (i = disks; i--; )
2502			if (fetch_block(sh, s, i, disks))
2503				break;
2504	set_bit(STRIPE_HANDLE, &sh->state);
2505}
2506
2507
2508/* handle_stripe_clean_event
2509 * any written block on an uptodate or failed drive can be returned.
2510 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2511 * never LOCKED, so we don't need to test 'failed' directly.
2512 */
2513static void handle_stripe_clean_event(struct r5conf *conf,
2514	struct stripe_head *sh, int disks, struct bio **return_bi)
2515{
2516	int i;
2517	struct r5dev *dev;
2518
2519	for (i = disks; i--; )
2520		if (sh->dev[i].written) {
2521			dev = &sh->dev[i];
2522			if (!test_bit(R5_LOCKED, &dev->flags) &&
2523				test_bit(R5_UPTODATE, &dev->flags)) {
2524				/* We can return any write requests */
2525				struct bio *wbi, *wbi2;
2526				int bitmap_end = 0;
2527				pr_debug("Return write for disc %d\n", i);
2528				spin_lock_irq(&conf->device_lock);
2529				wbi = dev->written;
2530				dev->written = NULL;
2531				while (wbi && wbi->bi_sector <
2532					dev->sector + STRIPE_SECTORS) {
2533					wbi2 = r5_next_bio(wbi, dev->sector);
2534					if (!raid5_dec_bi_phys_segments(wbi)) {
2535						md_write_end(conf->mddev);
2536						wbi->bi_next = *return_bi;
2537						*return_bi = wbi;
2538					}
2539					wbi = wbi2;
2540				}
2541				if (dev->towrite == NULL)
2542					bitmap_end = 1;
2543				spin_unlock_irq(&conf->device_lock);
2544				if (bitmap_end)
2545					bitmap_endwrite(conf->mddev->bitmap,
2546							sh->sector,
2547							STRIPE_SECTORS,
2548					 !test_bit(STRIPE_DEGRADED, &sh->state),
2549							0);
2550			}
2551		}
2552
2553	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2554		if (atomic_dec_and_test(&conf->pending_full_writes))
2555			md_wakeup_thread(conf->mddev->thread);
2556}
2557
2558static void handle_stripe_dirtying(struct r5conf *conf,
2559				   struct stripe_head *sh,
2560				   struct stripe_head_state *s,
2561				   int disks)
2562{
2563	int rmw = 0, rcw = 0, i;
2564	if (conf->max_degraded == 2) {
2565		/* RAID6 requires 'rcw' in current implementation
2566		 * Calculate the real rcw later - for now fake it
2567		 * Calculate the real rcw later - for now make it
2568		 */
2569		rcw = 1; rmw = 2;
2570	} else for (i = disks; i--; ) {
2571		/* Would I have to read this buffer for read_modify_write */
2572		struct r5dev *dev = &sh->dev[i];
2573		if ((dev->towrite || i == sh->pd_idx) &&
2574		    !test_bit(R5_LOCKED, &dev->flags) &&
2575		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2576		      test_bit(R5_Wantcompute, &dev->flags))) {
2577			if (test_bit(R5_Insync, &dev->flags))
2578				rmw++;
2579			else
2580				rmw += 2*disks;  /* cannot read it */
2581		}
2582		/* Would I have to read this buffer for reconstruct_write */
2583		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2584		    !test_bit(R5_LOCKED, &dev->flags) &&
2585		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2586		    test_bit(R5_Wantcompute, &dev->flags))) {
2587			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2588			else
2589				rcw += 2*disks;
2590		}
2591	}
2592	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2593		(unsigned long long)sh->sector, rmw, rcw);
2594	set_bit(STRIPE_HANDLE, &sh->state);
2595	if (rmw < rcw && rmw > 0)
2596		/* prefer read-modify-write, but need to get some data */
2597		for (i = disks; i--; ) {
2598			struct r5dev *dev = &sh->dev[i];
2599			if ((dev->towrite || i == sh->pd_idx) &&
2600			    !test_bit(R5_LOCKED, &dev->flags) &&
2601			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2602			    test_bit(R5_Wantcompute, &dev->flags)) &&
2603			    test_bit(R5_Insync, &dev->flags)) {
2604				if (
2605				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2606					pr_debug("Read_old block "
2607						"%d for r-m-w\n", i);
2608					set_bit(R5_LOCKED, &dev->flags);
2609					set_bit(R5_Wantread, &dev->flags);
2610					s->locked++;
2611				} else {
2612					set_bit(STRIPE_DELAYED, &sh->state);
2613					set_bit(STRIPE_HANDLE, &sh->state);
2614				}
2615			}
2616		}
2617	if (rcw <= rmw && rcw > 0) {
2618		/* want reconstruct write, but need to get some data */
2619		rcw = 0;
2620		for (i = disks; i--; ) {
2621			struct r5dev *dev = &sh->dev[i];
2622			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2623			    i != sh->pd_idx && i != sh->qd_idx &&
2624			    !test_bit(R5_LOCKED, &dev->flags) &&
2625			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2626			      test_bit(R5_Wantcompute, &dev->flags))) {
2627				rcw++;
2628				if (!test_bit(R5_Insync, &dev->flags))
2629					continue; /* it's a failed drive */
2630				if (
2631				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2632					pr_debug("Read_old block "
2633						"%d for Reconstruct\n", i);
2634					set_bit(R5_LOCKED, &dev->flags);
2635					set_bit(R5_Wantread, &dev->flags);
2636					s->locked++;
2637				} else {
2638					set_bit(STRIPE_DELAYED, &sh->state);
2639					set_bit(STRIPE_HANDLE, &sh->state);
2640				}
2641			}
2642		}
2643	}
2644	/* now if nothing is locked, and if we have enough data,
2645	 * we can start a write request
2646	 */
2647	/* since handle_stripe can be called at any time we need to handle the
2648	 * case where a compute block operation has been submitted and then a
2649	 * subsequent call wants to start a write request.  raid_run_ops only
2650	 * handles the case where compute block and reconstruct are requested
2651	 * simultaneously.  If this is not the case then new writes need to be
2652	 * held off until the compute completes.
2653	 */
2654	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2655	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2656	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2657		schedule_reconstruction(sh, s, rcw == 0, 0);
2658}
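
/*
 * Worked example for the rmw/rcw accounting above (illustrative numbers
 * only): take a 5-device RAID5 stripe (4 data + parity) with none of the
 * old blocks cached.  Read-modify-write must read each block being
 * written plus the old parity; reconstruct-write must read every data
 * block that is not fully overwritten.  Overwriting 1 of the 4 data
 * chunks gives rmw = 2 vs rcw = 3, so read-modify-write is chosen;
 * overwriting 3 of them gives rmw = 4 vs rcw = 1, so reconstruct-write
 * wins.  Blocks already uptodate in the stripe cache drop out of either
 * count, and unreadable blocks are penalised with the 2*disks term.
 */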
2659
2660static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2661				struct stripe_head_state *s, int disks)
2662{
2663	struct r5dev *dev = NULL;
2664
2665	set_bit(STRIPE_HANDLE, &sh->state);
2666
2667	switch (sh->check_state) {
2668	case check_state_idle:
2669		/* start a new check operation if there are no failures */
2670		if (s->failed == 0) {
2671			BUG_ON(s->uptodate != disks);
2672			sh->check_state = check_state_run;
2673			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2674			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2675			s->uptodate--;
2676			break;
2677		}
2678		dev = &sh->dev[s->failed_num[0]];
2679		/* fall through */
2680	case check_state_compute_result:
2681		sh->check_state = check_state_idle;
2682		if (!dev)
2683			dev = &sh->dev[sh->pd_idx];
2684
2685		/* check that a write has not made the stripe insync */
2686		if (test_bit(STRIPE_INSYNC, &sh->state))
2687			break;
2688
2689		/* either failed parity check, or recovery is happening */
2690		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2691		BUG_ON(s->uptodate != disks);
2692
2693		set_bit(R5_LOCKED, &dev->flags);
2694		s->locked++;
2695		set_bit(R5_Wantwrite, &dev->flags);
2696
2697		clear_bit(STRIPE_DEGRADED, &sh->state);
2698		set_bit(STRIPE_INSYNC, &sh->state);
2699		break;
2700	case check_state_run:
2701		break; /* we will be called again upon completion */
2702	case check_state_check_result:
2703		sh->check_state = check_state_idle;
2704
2705		/* if a failure occurred during the check operation, leave
2706		 * STRIPE_INSYNC not set and let the stripe be handled again
2707		 */
2708		if (s->failed)
2709			break;
2710
2711		/* handle a successful check operation, if parity is correct
2712		 * we are done.  Otherwise update the mismatch count and repair
2713		 * parity if !MD_RECOVERY_CHECK
2714		 */
2715		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2716			/* parity is correct (on disc,
2717			 * not in buffer any more)
2718			 */
2719			set_bit(STRIPE_INSYNC, &sh->state);
2720		else {
2721			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2722			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2723				/* don't try to repair!! */
2724				set_bit(STRIPE_INSYNC, &sh->state);
2725			else {
2726				sh->check_state = check_state_compute_run;
2727				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2728				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2729				set_bit(R5_Wantcompute,
2730					&sh->dev[sh->pd_idx].flags);
2731				sh->ops.target = sh->pd_idx;
2732				sh->ops.target2 = -1;
2733				s->uptodate++;
2734			}
2735		}
2736		break;
2737	case check_state_compute_run:
2738		break;
2739	default:
2740		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2741		       __func__, sh->check_state,
2742		       (unsigned long long) sh->sector);
2743		BUG();
2744	}
2745}
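
/*
 * For reference, a summary of the RAID5 check_state sequence driven above
 * (the *_result states are entered from the async completion callbacks
 * elsewhere in this file, not here):
 *
 *   idle           -> run             queue STRIPE_OP_CHECK (no failures)
 *   run            -> check_result    async parity check completes
 *   check_result   -> idle            parity correct, or a check-only
 *                                     pass: STRIPE_INSYNC is set
 *   check_result   -> compute_run     mismatch and repair is allowed:
 *                                     recompute the parity block
 *   compute_run    -> compute_result  async compute completes
 *   compute_result -> idle            recomputed/rebuilt block is marked
 *                                     R5_Wantwrite and STRIPE_INSYNC set
 *
 * With a failed device, check_state_idle falls straight through to the
 * compute_result handling so the rebuilt block is simply written out.
 */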
2746
2747
2748static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
2749				  struct stripe_head_state *s,
2750				  int disks)
2751{
2752	int pd_idx = sh->pd_idx;
2753	int qd_idx = sh->qd_idx;
2754	struct r5dev *dev;
2755
2756	set_bit(STRIPE_HANDLE, &sh->state);
2757
2758	BUG_ON(s->failed > 2);
2759
2760	/* Want to check and possibly repair P and Q.
2761	 * However there could be one 'failed' device, in which
2762	 * case we can only check one of them, possibly using the
2763	 * other to generate missing data
2764	 */
2765
2766	switch (sh->check_state) {
2767	case check_state_idle:
2768		/* start a new check operation if there are < 2 failures */
2769		if (s->failed == s->q_failed) {
2770			/* The only possible failed device holds Q, so it
2771			 * makes sense to check P (If anything else were failed,
2772			 * we would have used P to recreate it).
2773			 */
2774			sh->check_state = check_state_run;
2775		}
2776		if (!s->q_failed && s->failed < 2) {
2777			/* Q is not failed, and we didn't use it to generate
2778			 * anything, so it makes sense to check it
2779			 */
2780			if (sh->check_state == check_state_run)
2781				sh->check_state = check_state_run_pq;
2782			else
2783				sh->check_state = check_state_run_q;
2784		}
2785
2786		/* discard potentially stale zero_sum_result */
2787		sh->ops.zero_sum_result = 0;
2788
2789		if (sh->check_state == check_state_run) {
2790			/* async_xor_zero_sum destroys the contents of P */
2791			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2792			s->uptodate--;
2793		}
2794		if (sh->check_state >= check_state_run &&
2795		    sh->check_state <= check_state_run_pq) {
2796			/* async_syndrome_zero_sum preserves P and Q, so
2797			 * no need to mark them !uptodate here
2798			 */
2799			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2800			break;
2801		}
2802
2803		/* we have 2-disk failure */
2804		BUG_ON(s->failed != 2);
2805		/* fall through */
2806	case check_state_compute_result:
2807		sh->check_state = check_state_idle;
2808
2809		/* check that a write has not made the stripe insync */
2810		if (test_bit(STRIPE_INSYNC, &sh->state))
2811			break;
2812
2813		/* now write out any block on a failed drive,
2814		 * or P or Q if they were recomputed
2815		 */
2816		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2817		if (s->failed == 2) {
2818			dev = &sh->dev[s->failed_num[1]];
2819			s->locked++;
2820			set_bit(R5_LOCKED, &dev->flags);
2821			set_bit(R5_Wantwrite, &dev->flags);
2822		}
2823		if (s->failed >= 1) {
2824			dev = &sh->dev[s->failed_num[0]];
2825			s->locked++;
2826			set_bit(R5_LOCKED, &dev->flags);
2827			set_bit(R5_Wantwrite, &dev->flags);
2828		}
2829		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2830			dev = &sh->dev[pd_idx];
2831			s->locked++;
2832			set_bit(R5_LOCKED, &dev->flags);
2833			set_bit(R5_Wantwrite, &dev->flags);
2834		}
2835		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2836			dev = &sh->dev[qd_idx];
2837			s->locked++;
2838			set_bit(R5_LOCKED, &dev->flags);
2839			set_bit(R5_Wantwrite, &dev->flags);
2840		}
2841		clear_bit(STRIPE_DEGRADED, &sh->state);
2842
2843		set_bit(STRIPE_INSYNC, &sh->state);
2844		break;
2845	case check_state_run:
2846	case check_state_run_q:
2847	case check_state_run_pq:
2848		break; /* we will be called again upon completion */
2849	case check_state_check_result:
2850		sh->check_state = check_state_idle;
2851
2852		/* handle a successful check operation, if parity is correct
2853		 * we are done.  Otherwise update the mismatch count and repair
2854		 * parity if !MD_RECOVERY_CHECK
2855		 */
2856		if (sh->ops.zero_sum_result == 0) {
2857			/* both parities are correct */
2858			if (!s->failed)
2859				set_bit(STRIPE_INSYNC, &sh->state);
2860			else {
2861				/* in contrast to the raid5 case we can validate
2862				 * parity, but still have a failure to write
2863				 * back
2864				 */
2865				sh->check_state = check_state_compute_result;
2866				/* Returning at this point means that we may go
2867				 * off and bring p and/or q uptodate again so
2868				 * we make sure to check zero_sum_result again
2869				 * to verify if p or q need writeback
2870				 */
2871			}
2872		} else {
2873			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2874			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2875				/* don't try to repair!! */
2876				set_bit(STRIPE_INSYNC, &sh->state);
2877			else {
2878				int *target = &sh->ops.target;
2879
2880				sh->ops.target = -1;
2881				sh->ops.target2 = -1;
2882				sh->check_state = check_state_compute_run;
2883				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2884				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2885				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2886					set_bit(R5_Wantcompute,
2887						&sh->dev[pd_idx].flags);
2888					*target = pd_idx;
2889					target = &sh->ops.target2;
2890					s->uptodate++;
2891				}
2892				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2893					set_bit(R5_Wantcompute,
2894						&sh->dev[qd_idx].flags);
2895					*target = qd_idx;
2896					s->uptodate++;
2897				}
2898			}
2899		}
2900		break;
2901	case check_state_compute_run:
2902		break;
2903	default:
2904		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2905		       __func__, sh->check_state,
2906		       (unsigned long long) sh->sector);
2907		BUG();
2908	}
2909}
2910
2911static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
2912{
2913	int i;
2914
2915	/* We have read all the blocks in this stripe and now we need to
2916	 * copy some of them into a target stripe for expand.
2917	 */
2918	struct dma_async_tx_descriptor *tx = NULL;
2919	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2920	for (i = 0; i < sh->disks; i++)
2921		if (i != sh->pd_idx && i != sh->qd_idx) {
2922			int dd_idx, j;
2923			struct stripe_head *sh2;
2924			struct async_submit_ctl submit;
2925
2926			sector_t bn = compute_blocknr(sh, i, 1);
2927			sector_t s = raid5_compute_sector(conf, bn, 0,
2928							  &dd_idx, NULL);
2929			sh2 = get_active_stripe(conf, s, 0, 1, 1);
2930			if (sh2 == NULL)
2931				/* so far only the early blocks of this stripe
2932				 * have been requested.  When later blocks
2933				 * get requested, we will try again
2934				 */
2935				continue;
2936			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2937			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2938				/* must have already done this block */
2939				release_stripe(sh2);
2940				continue;
2941			}
2942
2943			/* place all the copies on one channel */
2944			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2945			tx = async_memcpy(sh2->dev[dd_idx].page,
2946					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
2947					  &submit);
2948
2949			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2950			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2951			for (j = 0; j < conf->raid_disks; j++)
2952				if (j != sh2->pd_idx &&
2953				    j != sh2->qd_idx &&
2954				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
2955					break;
2956			if (j == conf->raid_disks) {
2957				set_bit(STRIPE_EXPAND_READY, &sh2->state);
2958				set_bit(STRIPE_HANDLE, &sh2->state);
2959			}
2960			release_stripe(sh2);
2961
2962		}
2963	/* done submitting copies, wait for them to complete */
2964	if (tx) {
2965		async_tx_ack(tx);
2966		dma_wait_for_async_tx(tx);
2967	}
2968}
2969
2970
2971/*
2972 * handle_stripe - do things to a stripe.
2973 *
2974 * We lock the stripe and then examine the state of various bits
2975 * to see what needs to be done.
2976 * Possible results:
2977 *    return some read requests which now have data
2978 *    return some write requests which are safely on disc
2979 *    schedule a read on some buffers
2980 *    schedule a write of some buffers
2981 *    return confirmation of parity correctness
2982 *
2983 * buffers are taken off read_list or write_list, and bh_cache buffers
2984 * get BH_Lock set before the stripe lock is released.
2985 *
2986 */
2987
2988static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2989{
2990	struct r5conf *conf = sh->raid_conf;
2991	int disks = sh->disks;
2992	struct r5dev *dev;
2993	int i;
2994
2995	memset(s, 0, sizeof(*s));
2996
2997	s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
2998	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2999	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3000	s->failed_num[0] = -1;
3001	s->failed_num[1] = -1;
3002
3003	/* Now to look around and see what can be done */
3004	rcu_read_lock();
3005	spin_lock_irq(&conf->device_lock);
3006	for (i=disks; i--; ) {
3007		struct md_rdev *rdev;
3008		sector_t first_bad;
3009		int bad_sectors;
3010		int is_bad = 0;
3011
3012		dev = &sh->dev[i];
3013
3014		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3015			i, dev->flags, dev->toread, dev->towrite, dev->written);
3016		/* maybe we can reply to a read
3017		 *
3018		 * new wantfill requests are only permitted while
3019		 * ops_complete_biofill is guaranteed to be inactive
3020		 */
3021		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3022		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3023			set_bit(R5_Wantfill, &dev->flags);
3024
3025		/* now count some things */
3026		if (test_bit(R5_LOCKED, &dev->flags))
3027			s->locked++;
3028		if (test_bit(R5_UPTODATE, &dev->flags))
3029			s->uptodate++;
3030		if (test_bit(R5_Wantcompute, &dev->flags)) {
3031			s->compute++;
3032			BUG_ON(s->compute > 2);
3033		}
3034
3035		if (test_bit(R5_Wantfill, &dev->flags))
3036			s->to_fill++;
3037		else if (dev->toread)
3038			s->to_read++;
3039		if (dev->towrite) {
3040			s->to_write++;
3041			if (!test_bit(R5_OVERWRITE, &dev->flags))
3042				s->non_overwrite++;
3043		}
3044		if (dev->written)
3045			s->written++;
3046		rdev = rcu_dereference(conf->disks[i].rdev);
3047		if (rdev && test_bit(Faulty, &rdev->flags))
3048			rdev = NULL;
3049		if (rdev) {
3050			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3051					     &first_bad, &bad_sectors);
3052			if (s->blocked_rdev == NULL
3053			    && (test_bit(Blocked, &rdev->flags)
3054				|| is_bad < 0)) {
3055				if (is_bad < 0)
3056					set_bit(BlockedBadBlocks,
3057						&rdev->flags);
3058				s->blocked_rdev = rdev;
3059				atomic_inc(&rdev->nr_pending);
3060			}
3061		}
3062		clear_bit(R5_Insync, &dev->flags);
3063		if (!rdev)
3064			/* Not in-sync */;
3065		else if (is_bad) {
3066			/* also not in-sync */
3067			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3068				/* treat as in-sync, but with a read error
3069				 * which we can now try to correct
3070				 */
3071				set_bit(R5_Insync, &dev->flags);
3072				set_bit(R5_ReadError, &dev->flags);
3073			}
3074		} else if (test_bit(In_sync, &rdev->flags))
3075			set_bit(R5_Insync, &dev->flags);
3076		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3077			/* in sync if before recovery_offset */
3078			set_bit(R5_Insync, &dev->flags);
3079		else if (test_bit(R5_UPTODATE, &dev->flags) &&
3080			 test_bit(R5_Expanded, &dev->flags))
3081			/* If we've reshaped into here, we assume it is Insync.
3082			 * We will shortly update recovery_offset to make
3083			 * it official.
3084			 */
3085			set_bit(R5_Insync, &dev->flags);
3086
3087		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3088			clear_bit(R5_Insync, &dev->flags);
3089			if (!test_bit(Faulty, &rdev->flags)) {
3090				s->handle_bad_blocks = 1;
3091				atomic_inc(&rdev->nr_pending);
3092			} else
3093				clear_bit(R5_WriteError, &dev->flags);
3094		}
3095		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3096			if (!test_bit(Faulty, &rdev->flags)) {
3097				s->handle_bad_blocks = 1;
3098				atomic_inc(&rdev->nr_pending);
3099			} else
3100				clear_bit(R5_MadeGood, &dev->flags);
3101		}
3102		if (!test_bit(R5_Insync, &dev->flags)) {
3103			/* The ReadError flag will just be confusing now */
3104			clear_bit(R5_ReadError, &dev->flags);
3105			clear_bit(R5_ReWrite, &dev->flags);
3106		}
3107		if (test_bit(R5_ReadError, &dev->flags))
3108			clear_bit(R5_Insync, &dev->flags);
3109		if (!test_bit(R5_Insync, &dev->flags)) {
3110			if (s->failed < 2)
3111				s->failed_num[s->failed] = i;
3112			s->failed++;
3113		}
3114	}
3115	spin_unlock_irq(&conf->device_lock);
3116	rcu_read_unlock();
3117}
3118
3119static void handle_stripe(struct stripe_head *sh)
3120{
3121	struct stripe_head_state s;
3122	struct r5conf *conf = sh->raid_conf;
3123	int i;
3124	int prexor;
3125	int disks = sh->disks;
3126	struct r5dev *pdev, *qdev;
3127
3128	clear_bit(STRIPE_HANDLE, &sh->state);
3129	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
3130		/* already being handled, ensure it gets handled
3131		 * again when current action finishes */
3132		set_bit(STRIPE_HANDLE, &sh->state);
3133		return;
3134	}
3135
3136	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3137		set_bit(STRIPE_SYNCING, &sh->state);
3138		clear_bit(STRIPE_INSYNC, &sh->state);
3139	}
3140	clear_bit(STRIPE_DELAYED, &sh->state);
3141
3142	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3143		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
3144	       (unsigned long long)sh->sector, sh->state,
3145	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3146	       sh->check_state, sh->reconstruct_state);
3147
3148	analyse_stripe(sh, &s);
3149
3150	if (s.handle_bad_blocks) {
3151		set_bit(STRIPE_HANDLE, &sh->state);
3152		goto finish;
3153	}
3154
3155	if (unlikely(s.blocked_rdev)) {
3156		if (s.syncing || s.expanding || s.expanded ||
3157		    s.to_write || s.written) {
3158			set_bit(STRIPE_HANDLE, &sh->state);
3159			goto finish;
3160		}
3161		/* There is nothing for the blocked_rdev to block */
3162		rdev_dec_pending(s.blocked_rdev, conf->mddev);
3163		s.blocked_rdev = NULL;
3164	}
3165
3166	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3167		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3168		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3169	}
3170
3171	pr_debug("locked=%d uptodate=%d to_read=%d"
3172	       " to_write=%d failed=%d failed_num=%d,%d\n",
3173	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3174	       s.failed_num[0], s.failed_num[1]);
3175	/* check if the array has lost more than max_degraded devices and,
3176	 * if so, some requests might need to be failed.
3177	 */
3178	if (s.failed > conf->max_degraded) {
3179		sh->check_state = 0;
3180		sh->reconstruct_state = 0;
3181		if (s.to_read+s.to_write+s.written)
3182			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3183		if (s.syncing)
3184			handle_failed_sync(conf, sh, &s);
3185	}
3186
3187	/*
3188	 * might be able to return some write requests if the parity blocks
3189	 * are safe, or on a failed drive
3190	 */
3191	pdev = &sh->dev[sh->pd_idx];
3192	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3193		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3194	qdev = &sh->dev[sh->qd_idx];
3195	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3196		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3197		|| conf->level < 6;
3198
3199	if (s.written &&
3200	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3201			     && !test_bit(R5_LOCKED, &pdev->flags)
3202			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3203	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3204			     && !test_bit(R5_LOCKED, &qdev->flags)
3205			     && test_bit(R5_UPTODATE, &qdev->flags)))))
3206		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3207
3208	/* Now we might consider reading some blocks, either to check/generate
3209	 * parity, or to satisfy requests
3210	 * or to load a block that is being partially written.
3211	 */
3212	if (s.to_read || s.non_overwrite
3213	    || (conf->level == 6 && s.to_write && s.failed)
3214	    || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3215		handle_stripe_fill(sh, &s, disks);
3216
3217	/* Now we check to see if any write operations have recently
3218	 * completed
3219	 */
3220	prexor = 0;
3221	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3222		prexor = 1;
3223	if (sh->reconstruct_state == reconstruct_state_drain_result ||
3224	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3225		sh->reconstruct_state = reconstruct_state_idle;
3226
3227		/* All the 'written' buffers and the parity block are ready to
3228		 * be written back to disk
3229		 */
3230		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3231		BUG_ON(sh->qd_idx >= 0 &&
3232		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3233		for (i = disks; i--; ) {
3234			struct r5dev *dev = &sh->dev[i];
3235			if (test_bit(R5_LOCKED, &dev->flags) &&
3236				(i == sh->pd_idx || i == sh->qd_idx ||
3237				 dev->written)) {
3238				pr_debug("Writing block %d\n", i);
3239				set_bit(R5_Wantwrite, &dev->flags);
3240				if (prexor)
3241					continue;
3242				if (!test_bit(R5_Insync, &dev->flags) ||
3243				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
3244				     s.failed == 0))
3245					set_bit(STRIPE_INSYNC, &sh->state);
3246			}
3247		}
3248		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3249			s.dec_preread_active = 1;
3250	}
3251
3252	/* Now to consider new write requests and what else, if anything,
3253	 * should be read.  We do not handle new writes when:
3254	 * 1/ A 'write' operation (copy+xor) is already in flight.
3255	 * 2/ A 'check' operation is in flight, as it may clobber the parity
3256	 *    block.
3257	 */
3258	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3259		handle_stripe_dirtying(conf, sh, &s, disks);
3260
3261	/* maybe we need to check and possibly fix the parity for this stripe
3262	 * Any reads will already have been scheduled, so we just see if enough
3263	 * data is available.  The parity check is held off while parity
3264	 * dependent operations are in flight.
3265	 */
3266	if (sh->check_state ||
3267	    (s.syncing && s.locked == 0 &&
3268	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3269	     !test_bit(STRIPE_INSYNC, &sh->state))) {
3270		if (conf->level == 6)
3271			handle_parity_checks6(conf, sh, &s, disks);
3272		else
3273			handle_parity_checks5(conf, sh, &s, disks);
3274	}
3275
3276	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3277		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3278		clear_bit(STRIPE_SYNCING, &sh->state);
3279	}
3280
3281	/* If a failed drive only has a ReadError, then we might need
3282	 * to progress the repair/check process
3283	 */
3284	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3285		for (i = 0; i < s.failed; i++) {
3286			struct r5dev *dev = &sh->dev[s.failed_num[i]];
3287			if (test_bit(R5_ReadError, &dev->flags)
3288			    && !test_bit(R5_LOCKED, &dev->flags)
3289			    && test_bit(R5_UPTODATE, &dev->flags)
3290				) {
3291				if (!test_bit(R5_ReWrite, &dev->flags)) {
3292					set_bit(R5_Wantwrite, &dev->flags);
3293					set_bit(R5_ReWrite, &dev->flags);
3294					set_bit(R5_LOCKED, &dev->flags);
3295					s.locked++;
3296				} else {
3297					/* let's read it back */
3298					set_bit(R5_Wantread, &dev->flags);
3299					set_bit(R5_LOCKED, &dev->flags);
3300					s.locked++;
3301				}
3302			}
3303		}
3304
3305
3306	/* Finish reconstruct operations initiated by the expansion process */
3307	if (sh->reconstruct_state == reconstruct_state_result) {
3308		struct stripe_head *sh_src
3309			= get_active_stripe(conf, sh->sector, 1, 1, 1);
3310		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3311			/* sh cannot be written until sh_src has been read,
3312			 * so arrange for sh to be delayed a little
3313			 */
3314			set_bit(STRIPE_DELAYED, &sh->state);
3315			set_bit(STRIPE_HANDLE, &sh->state);
3316			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3317					      &sh_src->state))
3318				atomic_inc(&conf->preread_active_stripes);
3319			release_stripe(sh_src);
3320			goto finish;
3321		}
3322		if (sh_src)
3323			release_stripe(sh_src);
3324
3325		sh->reconstruct_state = reconstruct_state_idle;
3326		clear_bit(STRIPE_EXPANDING, &sh->state);
3327		for (i = conf->raid_disks; i--; ) {
3328			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3329			set_bit(R5_LOCKED, &sh->dev[i].flags);
3330			s.locked++;
3331		}
3332	}
3333
3334	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3335	    !sh->reconstruct_state) {
3336		/* Need to write out all blocks after computing parity */
3337		sh->disks = conf->raid_disks;
3338		stripe_set_idx(sh->sector, conf, 0, sh);
3339		schedule_reconstruction(sh, &s, 1, 1);
3340	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3341		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3342		atomic_dec(&conf->reshape_stripes);
3343		wake_up(&conf->wait_for_overlap);
3344		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3345	}
3346
3347	if (s.expanding && s.locked == 0 &&
3348	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3349		handle_stripe_expansion(conf, sh);
3350
3351finish:
3352	/* wait for this device to become unblocked */
3353	if (conf->mddev->external && unlikely(s.blocked_rdev))
3354		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3355
3356	if (s.handle_bad_blocks)
3357		for (i = disks; i--; ) {
3358			struct md_rdev *rdev;
3359			struct r5dev *dev = &sh->dev[i];
3360			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3361				/* We own a safe reference to the rdev */
3362				rdev = conf->disks[i].rdev;
3363				if (!rdev_set_badblocks(rdev, sh->sector,
3364							STRIPE_SECTORS, 0))
3365					md_error(conf->mddev, rdev);
3366				rdev_dec_pending(rdev, conf->mddev);
3367			}
3368			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3369				rdev = conf->disks[i].rdev;
3370				rdev_clear_badblocks(rdev, sh->sector,
3371						     STRIPE_SECTORS);
3372				rdev_dec_pending(rdev, conf->mddev);
3373			}
3374		}
3375
3376	if (s.ops_request)
3377		raid_run_ops(sh, s.ops_request);
3378
3379	ops_run_io(sh, &s);
3380
3381	if (s.dec_preread_active) {
3382		/* We delay this until after ops_run_io so that if make_request
3383		 * is waiting on a flush, it won't continue until the writes
3384		 * have actually been submitted.
3385		 */
3386		atomic_dec(&conf->preread_active_stripes);
3387		if (atomic_read(&conf->preread_active_stripes) <
3388		    IO_THRESHOLD)
3389			md_wakeup_thread(conf->mddev->thread);
3390	}
3391
3392	return_io(s.return_bi);
3393
3394	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3395}
3396
3397static void raid5_activate_delayed(struct r5conf *conf)
3398{
3399	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3400		while (!list_empty(&conf->delayed_list)) {
3401			struct list_head *l = conf->delayed_list.next;
3402			struct stripe_head *sh;
3403			sh = list_entry(l, struct stripe_head, lru);
3404			list_del_init(l);
3405			clear_bit(STRIPE_DELAYED, &sh->state);
3406			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3407				atomic_inc(&conf->preread_active_stripes);
3408			list_add_tail(&sh->lru, &conf->hold_list);
3409		}
3410	}
3411}
3412
3413static void activate_bit_delay(struct r5conf *conf)
3414{
3415	/* device_lock is held */
3416	struct list_head head;
3417	list_add(&head, &conf->bitmap_list);
3418	list_del_init(&conf->bitmap_list);
3419	while (!list_empty(&head)) {
3420		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3421		list_del_init(&sh->lru);
3422		atomic_inc(&sh->count);
3423		__release_stripe(conf, sh);
3424	}
3425}
3426
3427int md_raid5_congested(struct mddev *mddev, int bits)
3428{
3429	struct r5conf *conf = mddev->private;
3430
3431	/* No difference between reads and writes.  Just check
3432	 * how busy the stripe_cache is
3433	 */
3434
3435	if (conf->inactive_blocked)
3436		return 1;
3437	if (conf->quiesce)
3438		return 1;
3439	if (list_empty_careful(&conf->inactive_list))
3440		return 1;
3441
3442	return 0;
3443}
3444EXPORT_SYMBOL_GPL(md_raid5_congested);
3445
3446static int raid5_congested(void *data, int bits)
3447{
3448	struct mddev *mddev = data;
3449
3450	return mddev_congested(mddev, bits) ||
3451		md_raid5_congested(mddev, bits);
3452}
3453
3454/* We want read requests to align with chunks where possible,
3455 * but write requests don't need to.
3456 */
3457static int raid5_mergeable_bvec(struct request_queue *q,
3458				struct bvec_merge_data *bvm,
3459				struct bio_vec *biovec)
3460{
3461	struct mddev *mddev = q->queuedata;
3462	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3463	int max;
3464	unsigned int chunk_sectors = mddev->chunk_sectors;
3465	unsigned int bio_sectors = bvm->bi_size >> 9;
3466
3467	if ((bvm->bi_rw & 1) == WRITE)
3468		return biovec->bv_len; /* always allow writes to be mergeable */
3469
3470	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3471		chunk_sectors = mddev->new_chunk_sectors;
3472	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3473	if (max < 0) max = 0;
3474	if (max <= biovec->bv_len && bio_sectors == 0)
3475		return biovec->bv_len;
3476	else
3477		return max;
3478}
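
/*
 * Worked example for the arithmetic above (illustrative numbers only):
 * with 64KiB chunks, chunk_sectors = 128.  If a read bio already holds
 * bio_sectors = 4 and starts 120 sectors into a chunk
 * (sector & 127 == 120), then
 *
 *   max = (128 - (120 + 4)) << 9 = 4 << 9 = 2048
 *
 * so at most 2048 further bytes may be added before the bio would cross
 * the chunk boundary.  An empty bio (bio_sectors == 0) is always allowed
 * its first bvec, even if that bvec alone exceeds max.
 */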
3479
3480
3481static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3482{
3483	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3484	unsigned int chunk_sectors = mddev->chunk_sectors;
3485	unsigned int bio_sectors = bio->bi_size >> 9;
3486
3487	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3488		chunk_sectors = mddev->new_chunk_sectors;
3489	return  chunk_sectors >=
3490		((sector & (chunk_sectors - 1)) + bio_sectors);
3491}
3492
3493/*
3494 *  add bio to the retry LIFO (in O(1), as we are in interrupt context);
3495 *  it is later sampled by raid5d.
3496 */
3497static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3498{
3499	unsigned long flags;
3500
3501	spin_lock_irqsave(&conf->device_lock, flags);
3502
3503	bi->bi_next = conf->retry_read_aligned_list;
3504	conf->retry_read_aligned_list = bi;
3505
3506	spin_unlock_irqrestore(&conf->device_lock, flags);
3507	md_wakeup_thread(conf->mddev->thread);
3508}
3509
3510
3511static struct bio *remove_bio_from_retry(struct r5conf *conf)
3512{
3513	struct bio *bi;
3514
3515	bi = conf->retry_read_aligned;
3516	if (bi) {
3517		conf->retry_read_aligned = NULL;
3518		return bi;
3519	}
3520	bi = conf->retry_read_aligned_list;
3521	if(bi) {
3522		conf->retry_read_aligned_list = bi->bi_next;
3523		bi->bi_next = NULL;
3524		/*
3525		 * this sets the active stripe count to 1 and the processed
3526		 * stripe count (kept in the upper bits) to zero
3527		 */
3528		bi->bi_phys_segments = 1; /* biased count of active stripes */
3529	}
3530
3531	return bi;
3532}
3533
3534
3535/*
3536 *  The "raid5_align_endio" should check if the read succeeded and if it
3537 *  did, call bio_endio on the original bio (having bio_put the new bio
3538 *  first).
3539 *  If the read failed, the original bio is handed to add_bio_to_retry().
3540 */
3541static void raid5_align_endio(struct bio *bi, int error)
3542{
3543	struct bio* raid_bi  = bi->bi_private;
3544	struct mddev *mddev;
3545	struct r5conf *conf;
3546	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3547	struct md_rdev *rdev;
3548
3549	bio_put(bi);
3550
3551	rdev = (void*)raid_bi->bi_next;
3552	raid_bi->bi_next = NULL;
3553	mddev = rdev->mddev;
3554	conf = mddev->private;
3555
3556	rdev_dec_pending(rdev, conf->mddev);
3557
3558	if (!error && uptodate) {
3559		bio_endio(raid_bi, 0);
3560		if (atomic_dec_and_test(&conf->active_aligned_reads))
3561			wake_up(&conf->wait_for_stripe);
3562		return;
3563	}
3564
3565
3566	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3567
3568	add_bio_to_retry(raid_bi, conf);
3569}
3570
3571static int bio_fits_rdev(struct bio *bi)
3572{
3573	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3574
3575	if ((bi->bi_size>>9) > queue_max_sectors(q))
3576		return 0;
3577	blk_recount_segments(q, bi);
3578	if (bi->bi_phys_segments > queue_max_segments(q))
3579		return 0;
3580
3581	if (q->merge_bvec_fn)
3582		/* it's too hard to apply the merge_bvec_fn at this stage,
3583		 * so just give up
3584		 */
3585		return 0;
3586
3587	return 1;
3588}
3589
3590
3591static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3592{
3593	struct r5conf *conf = mddev->private;
3594	int dd_idx;
3595	struct bio* align_bi;
3596	struct md_rdev *rdev;
3597	sector_t end_sector;
3598
3599	if (!in_chunk_boundary(mddev, raid_bio)) {
3600		pr_debug("chunk_aligned_read : non aligned\n");
3601		return 0;
3602	}
3603	/*
3604	 * use bio_clone_mddev to make a copy of the bio
3605	 */
3606	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3607	if (!align_bi)
3608		return 0;
3609	/*
3610	 *   set bi_end_io to a new function, and set bi_private to the
3611	 *     original bio.
3612	 */
3613	align_bi->bi_end_io  = raid5_align_endio;
3614	align_bi->bi_private = raid_bio;
3615	/*
3616	 *	compute position
3617	 */
3618	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
3619						    0,
3620						    &dd_idx, NULL);
3621
3622	end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3623	rcu_read_lock();
3624	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3625	if (!rdev || test_bit(Faulty, &rdev->flags) ||
3626	    rdev->recovery_offset < end_sector) {
3627		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3628		if (rdev &&
3629		    (test_bit(Faulty, &rdev->flags) ||
3630		    !(test_bit(In_sync, &rdev->flags) ||
3631		      rdev->recovery_offset >= end_sector)))
3632			rdev = NULL;
3633	}
3634	if (rdev) {
3635		sector_t first_bad;
3636		int bad_sectors;
3637
3638		atomic_inc(&rdev->nr_pending);
3639		rcu_read_unlock();
3640		raid_bio->bi_next = (void*)rdev;
3641		align_bi->bi_bdev =  rdev->bdev;
3642		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3643		align_bi->bi_sector += rdev->data_offset;
3644
3645		if (!bio_fits_rdev(align_bi) ||
3646		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3647				&first_bad, &bad_sectors)) {
3648			/* too big in some way, or has a known bad block */
3649			bio_put(align_bi);
3650			rdev_dec_pending(rdev, mddev);
3651			return 0;
3652		}
3653
3654		spin_lock_irq(&conf->device_lock);
3655		wait_event_lock_irq(conf->wait_for_stripe,
3656				    conf->quiesce == 0,
3657				    conf->device_lock, /* nothing */);
3658		atomic_inc(&conf->active_aligned_reads);
3659		spin_unlock_irq(&conf->device_lock);
3660
3661		generic_make_request(align_bi);
3662		return 1;
3663	} else {
3664		rcu_read_unlock();
3665		bio_put(align_bi);
3666		return 0;
3667	}
3668}
3669
3670/* __get_priority_stripe - get the next stripe to process
3671 *
3672 * Full stripe writes are allowed to pass preread active stripes up until
3673 * the bypass_threshold is exceeded.  In general the bypass_count
3674 * increments when the handle_list is handled before the hold_list; however, it
3675 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a
3676 * stripe with in-flight i/o.  The bypass_count will be reset when the
3677 * head of the hold_list has changed, i.e. the head was promoted to the
3678 * handle_list.
3679 */
3680static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
3681{
3682	struct stripe_head *sh;
3683
3684	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3685		  __func__,
3686		  list_empty(&conf->handle_list) ? "empty" : "busy",
3687		  list_empty(&conf->hold_list) ? "empty" : "busy",
3688		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
3689
3690	if (!list_empty(&conf->handle_list)) {
3691		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3692
3693		if (list_empty(&conf->hold_list))
3694			conf->bypass_count = 0;
3695		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3696			if (conf->hold_list.next == conf->last_hold)
3697				conf->bypass_count++;
3698			else {
3699				conf->last_hold = conf->hold_list.next;
3700				conf->bypass_count -= conf->bypass_threshold;
3701				if (conf->bypass_count < 0)
3702					conf->bypass_count = 0;
3703			}
3704		}
3705	} else if (!list_empty(&conf->hold_list) &&
3706		   ((conf->bypass_threshold &&
3707		     conf->bypass_count > conf->bypass_threshold) ||
3708		    atomic_read(&conf->pending_full_writes) == 0)) {
3709		sh = list_entry(conf->hold_list.next,
3710				typeof(*sh), lru);
3711		conf->bypass_count -= conf->bypass_threshold;
3712		if (conf->bypass_count < 0)
3713			conf->bypass_count = 0;
3714	} else
3715		return NULL;
3716
3717	list_del_init(&sh->lru);
3718	atomic_inc(&sh->count);
3719	BUG_ON(atomic_read(&sh->count) != 1);
3720	return sh;
3721}
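/*
 * Illustrative scenario (editor's note, hypothetical numbers): with
 * bypass_threshold = 1, each time a stripe with no i/o yet started is
 * taken from handle_list while the head of hold_list is unchanged,
 * bypass_count is incremented.  Once handle_list is empty, hold_list
 * is not, and bypass_count (say 2) exceeds the threshold, the
 * full-stripe write at the head of hold_list is taken instead and
 * bypass_count is reduced by bypass_threshold back to 1.
 */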
3722
3723static void make_request(struct mddev *mddev, struct bio * bi)
3724{
3725	struct r5conf *conf = mddev->private;
3726	int dd_idx;
3727	sector_t new_sector;
3728	sector_t logical_sector, last_sector;
3729	struct stripe_head *sh;
3730	const int rw = bio_data_dir(bi);
3731	int remaining;
3732	int plugged;
3733
3734	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3735		md_flush_request(mddev, bi);
3736		return;
3737	}
3738
3739	md_write_start(mddev, bi);
3740
3741	if (rw == READ &&
3742	     mddev->reshape_position == MaxSector &&
3743	     chunk_aligned_read(mddev,bi))
3744		return;
3745
3746	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3747	last_sector = bi->bi_sector + (bi->bi_size>>9);
3748	bi->bi_next = NULL;
3749	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
3750
3751	plugged = mddev_check_plugged(mddev);
3752	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3753		DEFINE_WAIT(w);
3754		int disks, data_disks;
3755		int previous;
3756
3757	retry:
3758		previous = 0;
3759		disks = conf->raid_disks;
3760		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3761		if (unlikely(conf->reshape_progress != MaxSector)) {
3762			/* spinlock is needed as reshape_progress may be
3763			 * 64bit on a 32bit platform, and so it might be
3764			 * possible to see a half-updated value.
3765			 * Of course reshape_progress could change after
3766			 * the lock is dropped, so once we get a reference
3767			 * to the stripe that we think we want, we will have
3768			 * to check again.
3769			 */
3770			spin_lock_irq(&conf->device_lock);
3771			if (mddev->delta_disks < 0
3772			    ? logical_sector < conf->reshape_progress
3773			    : logical_sector >= conf->reshape_progress) {
3774				disks = conf->previous_raid_disks;
3775				previous = 1;
3776			} else {
3777				if (mddev->delta_disks < 0
3778				    ? logical_sector < conf->reshape_safe
3779				    : logical_sector >= conf->reshape_safe) {
3780					spin_unlock_irq(&conf->device_lock);
3781					schedule();
3782					goto retry;
3783				}
3784			}
3785			spin_unlock_irq(&conf->device_lock);
3786		}
3787		data_disks = disks - conf->max_degraded;
3788
3789		new_sector = raid5_compute_sector(conf, logical_sector,
3790						  previous,
3791						  &dd_idx, NULL);
3792		pr_debug("raid456: make_request, sector %llu logical %llu\n",
3793			(unsigned long long)new_sector,
3794			(unsigned long long)logical_sector);
3795
3796		sh = get_active_stripe(conf, new_sector, previous,
3797				       (bi->bi_rw&RWA_MASK), 0);
3798		if (sh) {
3799			if (unlikely(previous)) {
3800				/* expansion might have moved on while waiting for a
3801				 * stripe, so we must do the range check again.
3802				 * Expansion could still move past after this
3803				 * test, but as we are holding a reference to
3804				 * 'sh', we know that if that happens,
3805				 *  STRIPE_EXPANDING will get set and the expansion
3806				 * won't proceed until we finish with the stripe.
3807				 */
3808				int must_retry = 0;
3809				spin_lock_irq(&conf->device_lock);
3810				if (mddev->delta_disks < 0
3811				    ? logical_sector >= conf->reshape_progress
3812				    : logical_sector < conf->reshape_progress)
3813					/* mismatch, need to try again */
3814					must_retry = 1;
3815				spin_unlock_irq(&conf->device_lock);
3816				if (must_retry) {
3817					release_stripe(sh);
3818					schedule();
3819					goto retry;
3820				}
3821			}
3822
3823			if (rw == WRITE &&
3824			    logical_sector >= mddev->suspend_lo &&
3825			    logical_sector < mddev->suspend_hi) {
3826				release_stripe(sh);
3827				/* As the suspend_* range is controlled by
3828				 * userspace, we want an interruptible
3829				 * wait.
3830				 */
3831				flush_signals(current);
3832				prepare_to_wait(&conf->wait_for_overlap,
3833						&w, TASK_INTERRUPTIBLE);
3834				if (logical_sector >= mddev->suspend_lo &&
3835				    logical_sector < mddev->suspend_hi)
3836					schedule();
3837				goto retry;
3838			}
3839
3840			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3841			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
3842				/* Stripe is busy expanding or
3843				 * add failed due to overlap.  Flush everything
3844				 * and wait a while
3845				 */
3846				md_wakeup_thread(mddev->thread);
3847				release_stripe(sh);
3848				schedule();
3849				goto retry;
3850			}
3851			finish_wait(&conf->wait_for_overlap, &w);
3852			set_bit(STRIPE_HANDLE, &sh->state);
3853			clear_bit(STRIPE_DELAYED, &sh->state);
3854			if ((bi->bi_rw & REQ_SYNC) &&
3855			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3856				atomic_inc(&conf->preread_active_stripes);
3857			release_stripe(sh);
3858		} else {
3859			/* cannot get stripe for read-ahead, just give up */
3860			clear_bit(BIO_UPTODATE, &bi->bi_flags);
3861			finish_wait(&conf->wait_for_overlap, &w);
3862			break;
3863		}
3864
3865	}
3866	if (!plugged)
3867		md_wakeup_thread(mddev->thread);
3868
3869	spin_lock_irq(&conf->device_lock);
3870	remaining = raid5_dec_bi_phys_segments(bi);
3871	spin_unlock_irq(&conf->device_lock);
3872	if (remaining == 0) {
3873
3874		if ( rw == WRITE )
3875			md_write_end(mddev);
3876
3877		bio_endio(bi, 0);
3878	}
3879}
3880
3881static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
3882
3883static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
3884{
3885	/* reshaping is quite different from recovery/resync, so it is
3886	 * handled quite separately ... here.
3887	 *
3888	 * On each call to sync_request, we gather one chunk worth of
3889	 * destination stripes and flag them as expanding.
3890	 * Then we find all the source stripes and request reads.
3891	 * As the reads complete, handle_stripe will copy the data
3892	 * into the destination stripe and release that stripe.
3893	 */
3894	struct r5conf *conf = mddev->private;
3895	struct stripe_head *sh;
3896	sector_t first_sector, last_sector;
3897	int raid_disks = conf->previous_raid_disks;
3898	int data_disks = raid_disks - conf->max_degraded;
3899	int new_data_disks = conf->raid_disks - conf->max_degraded;
3900	int i;
3901	int dd_idx;
3902	sector_t writepos, readpos, safepos;
3903	sector_t stripe_addr;
3904	int reshape_sectors;
3905	struct list_head stripes;
3906
3907	if (sector_nr == 0) {
3908		/* If restarting in the middle, skip the initial sectors */
3909		if (mddev->delta_disks < 0 &&
3910		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
3911			sector_nr = raid5_size(mddev, 0, 0)
3912				- conf->reshape_progress;
3913		} else if (mddev->delta_disks >= 0 &&
3914			   conf->reshape_progress > 0)
3915			sector_nr = conf->reshape_progress;
3916		sector_div(sector_nr, new_data_disks);
3917		if (sector_nr) {
3918			mddev->curr_resync_completed = sector_nr;
3919			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
3920			*skipped = 1;
3921			return sector_nr;
3922		}
3923	}
3924
3925	/* We need to process a full chunk at a time.
3926	 * If the old and new chunk sizes differ, we need to process the
3927	 * larger of the two.
3928	 */
3929	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
3930		reshape_sectors = mddev->new_chunk_sectors;
3931	else
3932		reshape_sectors = mddev->chunk_sectors;
3933
3934	/* We update the metadata when there is more than 3Meg
3935	 * in the block range (that is rather arbitrary, and should
3936	 * probably be time based) or when the data about to be
3937	 * copied would overwrite the source of the data at
3938	 * the front of the range.
3939	 * i.e. when one new_stripe along from reshape_progress new_maps
3940	 * to after where reshape_safe old_maps to.
3941	 */
3942	writepos = conf->reshape_progress;
3943	sector_div(writepos, new_data_disks);
3944	readpos = conf->reshape_progress;
3945	sector_div(readpos, data_disks);
3946	safepos = conf->reshape_safe;
3947	sector_div(safepos, data_disks);
3948	if (mddev->delta_disks < 0) {
3949		writepos -= min_t(sector_t, reshape_sectors, writepos);
3950		readpos += reshape_sectors;
3951		safepos += reshape_sectors;
3952	} else {
3953		writepos += reshape_sectors;
3954		readpos -= min_t(sector_t, reshape_sectors, readpos);
3955		safepos -= min_t(sector_t, reshape_sectors, safepos);
3956	}
3957
3958	/* 'writepos' is the most advanced device address we might write.
3959	 * 'readpos' is the least advanced device address we might read.
3960	 * 'safepos' is the least address recorded in the metadata as having
3961	 *     been reshaped.
3962	 * If 'readpos' is behind 'writepos', then there is no way that we can
3963	 * ensure safety in the face of a crash - that must be done by userspace
3964	 * making a backup of the data.  So in that case there is no particular
3965	 * rush to update metadata.
3966	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
3967	 * update the metadata to advance 'safepos' to match 'readpos' so that
3968	 * we can be safe in the event of a crash.
3969	 * So we insist on updating metadata if safepos is behind writepos and
3970	 * readpos is beyond writepos.
3971	 * In any case, update the metadata every 10 seconds.
3972	 * Maybe that number should be configurable, but I'm not sure it is
3973	 * worth it.... maybe it could be a multiple of safemode_delay???
3974	 */
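	/*
	 * Worked example (editor's note, purely illustrative numbers):
	 * growing from 3 to 4 data disks with reshape_sectors = 128
	 * (64K chunks), reshape_progress = 40960, reshape_safe = 12288:
	 *	writepos = 40960/4 + 128 = 10368
	 *	readpos  = 40960/3 - 128 = 13525
	 *	safepos  = 12288/3 - 128 =  3968
	 * safepos < writepos and readpos > writepos, so the superblock
	 * must be updated before these stripes can be scheduled.
	 */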
3975	if ((mddev->delta_disks < 0
3976	     ? (safepos > writepos && readpos < writepos)
3977	     : (safepos < writepos && readpos > writepos)) ||
3978	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
3979		/* Cannot proceed until we've updated the superblock... */
3980		wait_event(conf->wait_for_overlap,
3981			   atomic_read(&conf->reshape_stripes)==0);
3982		mddev->reshape_position = conf->reshape_progress;
3983		mddev->curr_resync_completed = sector_nr;
3984		conf->reshape_checkpoint = jiffies;
3985		set_bit(MD_CHANGE_DEVS, &mddev->flags);
3986		md_wakeup_thread(mddev->thread);
3987		wait_event(mddev->sb_wait, mddev->flags == 0 ||
3988			   kthread_should_stop());
3989		spin_lock_irq(&conf->device_lock);
3990		conf->reshape_safe = mddev->reshape_position;
3991		spin_unlock_irq(&conf->device_lock);
3992		wake_up(&conf->wait_for_overlap);
3993		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
3994	}
3995
3996	if (mddev->delta_disks < 0) {
3997		BUG_ON(conf->reshape_progress == 0);
3998		stripe_addr = writepos;
3999		BUG_ON((mddev->dev_sectors &
4000			~((sector_t)reshape_sectors - 1))
4001		       - reshape_sectors - stripe_addr
4002		       != sector_nr);
4003	} else {
4004		BUG_ON(writepos != sector_nr + reshape_sectors);
4005		stripe_addr = sector_nr;
4006	}
4007	INIT_LIST_HEAD(&stripes);
4008	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4009		int j;
4010		int skipped_disk = 0;
4011		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4012		set_bit(STRIPE_EXPANDING, &sh->state);
4013		atomic_inc(&conf->reshape_stripes);
4014		/* If any of this stripe is beyond the end of the old
4015		 * array, then we need to zero those blocks
4016		 */
4017		for (j=sh->disks; j--;) {
4018			sector_t s;
4019			if (j == sh->pd_idx)
4020				continue;
4021			if (conf->level == 6 &&
4022			    j == sh->qd_idx)
4023				continue;
4024			s = compute_blocknr(sh, j, 0);
4025			if (s < raid5_size(mddev, 0, 0)) {
4026				skipped_disk = 1;
4027				continue;
4028			}
4029			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4030			set_bit(R5_Expanded, &sh->dev[j].flags);
4031			set_bit(R5_UPTODATE, &sh->dev[j].flags);
4032		}
4033		if (!skipped_disk) {
4034			set_bit(STRIPE_EXPAND_READY, &sh->state);
4035			set_bit(STRIPE_HANDLE, &sh->state);
4036		}
4037		list_add(&sh->lru, &stripes);
4038	}
4039	spin_lock_irq(&conf->device_lock);
4040	if (mddev->delta_disks < 0)
4041		conf->reshape_progress -= reshape_sectors * new_data_disks;
4042	else
4043		conf->reshape_progress += reshape_sectors * new_data_disks;
4044	spin_unlock_irq(&conf->device_lock);
4045	/* Ok, those stripes are ready. We can start scheduling
4046	 * reads on the source stripes.
4047	 * The source stripes are determined by mapping the first and last
4048	 * block on the destination stripes.
4049	 */
4050	first_sector =
4051		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4052				     1, &dd_idx, NULL);
4053	last_sector =
4054		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4055					    * new_data_disks - 1),
4056				     1, &dd_idx, NULL);
4057	if (last_sector >= mddev->dev_sectors)
4058		last_sector = mddev->dev_sectors - 1;
4059	while (first_sector <= last_sector) {
4060		sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4061		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4062		set_bit(STRIPE_HANDLE, &sh->state);
4063		release_stripe(sh);
4064		first_sector += STRIPE_SECTORS;
4065	}
4066	/* Now that the sources are clearly marked, we can release
4067	 * the destination stripes
4068	 */
4069	while (!list_empty(&stripes)) {
4070		sh = list_entry(stripes.next, struct stripe_head, lru);
4071		list_del_init(&sh->lru);
4072		release_stripe(sh);
4073	}
4074	/* If this takes us to the resync_max point where we have to pause,
4075	 * then we need to write out the superblock.
4076	 */
4077	sector_nr += reshape_sectors;
4078	if ((sector_nr - mddev->curr_resync_completed) * 2
4079	    >= mddev->resync_max - mddev->curr_resync_completed) {
4080		/* Cannot proceed until we've updated the superblock... */
4081		wait_event(conf->wait_for_overlap,
4082			   atomic_read(&conf->reshape_stripes) == 0);
4083		mddev->reshape_position = conf->reshape_progress;
4084		mddev->curr_resync_completed = sector_nr;
4085		conf->reshape_checkpoint = jiffies;
4086		set_bit(MD_CHANGE_DEVS, &mddev->flags);
4087		md_wakeup_thread(mddev->thread);
4088		wait_event(mddev->sb_wait,
4089			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4090			   || kthread_should_stop());
4091		spin_lock_irq(&conf->device_lock);
4092		conf->reshape_safe = mddev->reshape_position;
4093		spin_unlock_irq(&conf->device_lock);
4094		wake_up(&conf->wait_for_overlap);
4095		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4096	}
4097	return reshape_sectors;
4098}
4099
4100/* FIXME go_faster isn't used */
4101static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
4102{
4103	struct r5conf *conf = mddev->private;
4104	struct stripe_head *sh;
4105	sector_t max_sector = mddev->dev_sectors;
4106	sector_t sync_blocks;
4107	int still_degraded = 0;
4108	int i;
4109
4110	if (sector_nr >= max_sector) {
4111		/* just being told to finish up .. nothing much to do */
4112
4113		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4114			end_reshape(conf);
4115			return 0;
4116		}
4117
4118		if (mddev->curr_resync < max_sector) /* aborted */
4119			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4120					&sync_blocks, 1);
4121		else /* completed sync */
4122			conf->fullsync = 0;
4123		bitmap_close_sync(mddev->bitmap);
4124
4125		return 0;
4126	}
4127
4128	/* Allow raid5_quiesce to complete */
4129	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4130
4131	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4132		return reshape_request(mddev, sector_nr, skipped);
4133
4134	/* No need to check resync_max as we never do more than one
4135	 * stripe, and as resync_max will always be on a chunk boundary,
4136	 * if the check in md_do_sync didn't fire, there is no chance
4137	 * of overstepping resync_max here
4138	 */
4139
4140	/* if there are too many failed drives and we are trying
4141	 * to resync, then assert that we are finished, because there is
4142	 * nothing we can do.
4143	 */
4144	if (mddev->degraded >= conf->max_degraded &&
4145	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4146		sector_t rv = mddev->dev_sectors - sector_nr;
4147		*skipped = 1;
4148		return rv;
4149	}
4150	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4151	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4152	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4153		/* we can skip this block, and probably more */
4154		sync_blocks /= STRIPE_SECTORS;
4155		*skipped = 1;
4156		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4157	}
4158
4159
4160	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4161
4162	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4163	if (sh == NULL) {
4164		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4165		/* make sure we don't swamp the stripe cache if someone else
4166		 * is trying to get access
4167		 */
4168		schedule_timeout_uninterruptible(1);
4169	}
4170	/* Need to check if array will still be degraded after recovery/resync
4171	 * We don't need to check the 'failed' flag as when that gets set,
4172	 * recovery aborts.
4173	 */
4174	for (i = 0; i < conf->raid_disks; i++)
4175		if (conf->disks[i].rdev == NULL)
4176			still_degraded = 1;
4177
4178	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4179
4180	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4181
4182	handle_stripe(sh);
4183	release_stripe(sh);
4184
4185	return STRIPE_SECTORS;
4186}
4187
4188static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4189{
4190	/* We may not be able to submit a whole bio at once as there
4191	 * may not be enough stripe_heads available.
4192	 * We cannot pre-allocate enough stripe_heads as we may need
4193	 * more than exist in the cache (if we ever allow large chunks).
4194	 * So we do one stripe head at a time and record in
4195	 * ->bi_hw_segments how many have been done.
4196	 *
4197	 * We *know* that this entire raid_bio is in one chunk, so
4198	 * there will be only one 'dd_idx' and only one call to raid5_compute_sector is needed.
4199	 */
4200	struct stripe_head *sh;
4201	int dd_idx;
4202	sector_t sector, logical_sector, last_sector;
4203	int scnt = 0;
4204	int remaining;
4205	int handled = 0;
4206
4207	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4208	sector = raid5_compute_sector(conf, logical_sector,
4209				      0, &dd_idx, NULL);
4210	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4211
4212	for (; logical_sector < last_sector;
4213	     logical_sector += STRIPE_SECTORS,
4214		     sector += STRIPE_SECTORS,
4215		     scnt++) {
4216
4217		if (scnt < raid5_bi_hw_segments(raid_bio))
4218			/* already done this stripe */
4219			continue;
4220
4221		sh = get_active_stripe(conf, sector, 0, 1, 0);
4222
4223		if (!sh) {
4224			/* failed to get a stripe - must wait */
4225			raid5_set_bi_hw_segments(raid_bio, scnt);
4226			conf->retry_read_aligned = raid_bio;
4227			return handled;
4228		}
4229
4230		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4231		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4232			release_stripe(sh);
4233			raid5_set_bi_hw_segments(raid_bio, scnt);
4234			conf->retry_read_aligned = raid_bio;
4235			return handled;
4236		}
4237
4238		handle_stripe(sh);
4239		release_stripe(sh);
4240		handled++;
4241	}
4242	spin_lock_irq(&conf->device_lock);
4243	remaining = raid5_dec_bi_phys_segments(raid_bio);
4244	spin_unlock_irq(&conf->device_lock);
4245	if (remaining == 0)
4246		bio_endio(raid_bio, 0);
4247	if (atomic_dec_and_test(&conf->active_aligned_reads))
4248		wake_up(&conf->wait_for_stripe);
4249	return handled;
4250}
4251
4252
4253/*
4254 * This is our raid5 kernel thread.
4255 *
4256 * We scan the hash table for stripes which can be handled now.
4257 * During the scan, completed stripes are saved for us by the interrupt
4258 * handler, so that they will not have to wait for our next wakeup.
4259 */
4260static void raid5d(struct mddev *mddev)
4261{
4262	struct stripe_head *sh;
4263	struct r5conf *conf = mddev->private;
4264	int handled;
4265	struct blk_plug plug;
4266
4267	pr_debug("+++ raid5d active\n");
4268
4269	md_check_recovery(mddev);
4270
4271	blk_start_plug(&plug);
4272	handled = 0;
4273	spin_lock_irq(&conf->device_lock);
4274	while (1) {
4275		struct bio *bio;
4276
4277		if (atomic_read(&mddev->plug_cnt) == 0 &&
4278		    !list_empty(&conf->bitmap_list)) {
4279			/* Now is a good time to flush some bitmap updates */
4280			conf->seq_flush++;
4281			spin_unlock_irq(&conf->device_lock);
4282			bitmap_unplug(mddev->bitmap);
4283			spin_lock_irq(&conf->device_lock);
4284			conf->seq_write = conf->seq_flush;
4285			activate_bit_delay(conf);
4286		}
4287		if (atomic_read(&mddev->plug_cnt) == 0)
4288			raid5_activate_delayed(conf);
4289
4290		while ((bio = remove_bio_from_retry(conf))) {
4291			int ok;
4292			spin_unlock_irq(&conf->device_lock);
4293			ok = retry_aligned_read(conf, bio);
4294			spin_lock_irq(&conf->device_lock);
4295			if (!ok)
4296				break;
4297			handled++;
4298		}
4299
4300		sh = __get_priority_stripe(conf);
4301
4302		if (!sh)
4303			break;
4304		spin_unlock_irq(&conf->device_lock);
4305
4306		handled++;
4307		handle_stripe(sh);
4308		release_stripe(sh);
4309		cond_resched();
4310
4311		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4312			md_check_recovery(mddev);
4313
4314		spin_lock_irq(&conf->device_lock);
4315	}
4316	pr_debug("%d stripes handled\n", handled);
4317
4318	spin_unlock_irq(&conf->device_lock);
4319
4320	async_tx_issue_pending_all();
4321	blk_finish_plug(&plug);
4322
4323	pr_debug("--- raid5d inactive\n");
4324}
4325
4326static ssize_t
4327raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
4328{
4329	struct r5conf *conf = mddev->private;
4330	if (conf)
4331		return sprintf(page, "%d\n", conf->max_nr_stripes);
4332	else
4333		return 0;
4334}
4335
4336int
4337raid5_set_cache_size(struct mddev *mddev, int size)
4338{
4339	struct r5conf *conf = mddev->private;
4340	int err;
4341
4342	if (size <= 16 || size > 32768)
4343		return -EINVAL;
4344	while (size < conf->max_nr_stripes) {
4345		if (drop_one_stripe(conf))
4346			conf->max_nr_stripes--;
4347		else
4348			break;
4349	}
4350	err = md_allow_write(mddev);
4351	if (err)
4352		return err;
4353	while (size > conf->max_nr_stripes) {
4354		if (grow_one_stripe(conf))
4355			conf->max_nr_stripes++;
4356		else break;
4357	}
4358	return 0;
4359}
4360EXPORT_SYMBOL(raid5_set_cache_size);
4361
4362static ssize_t
4363raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
4364{
4365	struct r5conf *conf = mddev->private;
4366	unsigned long new;
4367	int err;
4368
4369	if (len >= PAGE_SIZE)
4370		return -EINVAL;
4371	if (!conf)
4372		return -ENODEV;
4373
4374	if (strict_strtoul(page, 10, &new))
4375		return -EINVAL;
4376	err = raid5_set_cache_size(mddev, new);
4377	if (err)
4378		return err;
4379	return len;
4380}
4381
4382static struct md_sysfs_entry
4383raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4384				raid5_show_stripe_cache_size,
4385				raid5_store_stripe_cache_size);
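/*
 * Usage note (editor's note): raid5_set_cache_size() above accepts
 * values in the range 17..32768 and grows or shrinks the stripe cache
 * one stripe_head at a time.  The sysfs attribute defined here is
 * typically tuned from user-space with something like
 *
 *	echo 4096 > /sys/block/md0/md/stripe_cache_size
 *
 * (md0 is just an example device name).
 */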
4386
4387static ssize_t
4388raid5_show_preread_threshold(struct mddev *mddev, char *page)
4389{
4390	struct r5conf *conf = mddev->private;
4391	if (conf)
4392		return sprintf(page, "%d\n", conf->bypass_threshold);
4393	else
4394		return 0;
4395}
4396
4397static ssize_t
4398raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
4399{
4400	struct r5conf *conf = mddev->private;
4401	unsigned long new;
4402	if (len >= PAGE_SIZE)
4403		return -EINVAL;
4404	if (!conf)
4405		return -ENODEV;
4406
4407	if (strict_strtoul(page, 10, &new))
4408		return -EINVAL;
4409	if (new > conf->max_nr_stripes)
4410		return -EINVAL;
4411	conf->bypass_threshold = new;
4412	return len;
4413}
4414
4415static struct md_sysfs_entry
4416raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4417					S_IRUGO | S_IWUSR,
4418					raid5_show_preread_threshold,
4419					raid5_store_preread_threshold);
4420
4421static ssize_t
4422stripe_cache_active_show(struct mddev *mddev, char *page)
4423{
4424	struct r5conf *conf = mddev->private;
4425	if (conf)
4426		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4427	else
4428		return 0;
4429}
4430
4431static struct md_sysfs_entry
4432raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4433
4434static struct attribute *raid5_attrs[] =  {
4435	&raid5_stripecache_size.attr,
4436	&raid5_stripecache_active.attr,
4437	&raid5_preread_bypass_threshold.attr,
4438	NULL,
4439};
4440static struct attribute_group raid5_attrs_group = {
4441	.name = NULL,
4442	.attrs = raid5_attrs,
4443};
4444
4445static sector_t
4446raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
4447{
4448	struct r5conf *conf = mddev->private;
4449
4450	if (!sectors)
4451		sectors = mddev->dev_sectors;
4452	if (!raid_disks)
4453		/* size is defined by the smaller of the previous and new sizes */
4454		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4455
4456	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4457	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4458	return sectors * (raid_disks - conf->max_degraded);
4459}
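/*
 * Worked example (editor's note, illustrative numbers): a 5-device
 * RAID5 (max_degraded = 1) rounds the per-device size down to a
 * multiple of the chunk size and then exports
 *	sectors * (5 - 1)
 * i.e. four devices' worth of data capacity.
 */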
4460
4461static void raid5_free_percpu(struct r5conf *conf)
4462{
4463	struct raid5_percpu *percpu;
4464	unsigned long cpu;
4465
4466	if (!conf->percpu)
4467		return;
4468
4469	get_online_cpus();
4470	for_each_possible_cpu(cpu) {
4471		percpu = per_cpu_ptr(conf->percpu, cpu);
4472		safe_put_page(percpu->spare_page);
4473		kfree(percpu->scribble);
4474	}
4475#ifdef CONFIG_HOTPLUG_CPU
4476	unregister_cpu_notifier(&conf->cpu_notify);
4477#endif
4478	put_online_cpus();
4479
4480	free_percpu(conf->percpu);
4481}
4482
4483static void free_conf(struct r5conf *conf)
4484{
4485	shrink_stripes(conf);
4486	raid5_free_percpu(conf);
4487	kfree(conf->disks);
4488	kfree(conf->stripe_hashtbl);
4489	kfree(conf);
4490}
4491
4492#ifdef CONFIG_HOTPLUG_CPU
4493static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4494			      void *hcpu)
4495{
4496	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
4497	long cpu = (long)hcpu;
4498	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4499
4500	switch (action) {
4501	case CPU_UP_PREPARE:
4502	case CPU_UP_PREPARE_FROZEN:
4503		if (conf->level == 6 && !percpu->spare_page)
4504			percpu->spare_page = alloc_page(GFP_KERNEL);
4505		if (!percpu->scribble)
4506			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4507
4508		if (!percpu->scribble ||
4509		    (conf->level == 6 && !percpu->spare_page)) {
4510			safe_put_page(percpu->spare_page);
4511			kfree(percpu->scribble);
4512			pr_err("%s: failed memory allocation for cpu%ld\n",
4513			       __func__, cpu);
4514			return notifier_from_errno(-ENOMEM);
4515		}
4516		break;
4517	case CPU_DEAD:
4518	case CPU_DEAD_FROZEN:
4519		safe_put_page(percpu->spare_page);
4520		kfree(percpu->scribble);
4521		percpu->spare_page = NULL;
4522		percpu->scribble = NULL;
4523		break;
4524	default:
4525		break;
4526	}
4527	return NOTIFY_OK;
4528}
4529#endif
4530
4531static int raid5_alloc_percpu(struct r5conf *conf)
4532{
4533	unsigned long cpu;
4534	struct page *spare_page;
4535	struct raid5_percpu __percpu *allcpus;
4536	void *scribble;
4537	int err;
4538
4539	allcpus = alloc_percpu(struct raid5_percpu);
4540	if (!allcpus)
4541		return -ENOMEM;
4542	conf->percpu = allcpus;
4543
4544	get_online_cpus();
4545	err = 0;
4546	for_each_present_cpu(cpu) {
4547		if (conf->level == 6) {
4548			spare_page = alloc_page(GFP_KERNEL);
4549			if (!spare_page) {
4550				err = -ENOMEM;
4551				break;
4552			}
4553			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4554		}
4555		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4556		if (!scribble) {
4557			err = -ENOMEM;
4558			break;
4559		}
4560		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4561	}
4562#ifdef CONFIG_HOTPLUG_CPU
4563	conf->cpu_notify.notifier_call = raid456_cpu_notify;
4564	conf->cpu_notify.priority = 0;
4565	if (err == 0)
4566		err = register_cpu_notifier(&conf->cpu_notify);
4567#endif
4568	put_online_cpus();
4569
4570	return err;
4571}
4572
4573static struct r5conf *setup_conf(struct mddev *mddev)
4574{
4575	struct r5conf *conf;
4576	int raid_disk, memory, max_disks;
4577	struct md_rdev *rdev;
4578	struct disk_info *disk;
4579
4580	if (mddev->new_level != 5
4581	    && mddev->new_level != 4
4582	    && mddev->new_level != 6) {
4583		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
4584		       mdname(mddev), mddev->new_level);
4585		return ERR_PTR(-EIO);
4586	}
4587	if ((mddev->new_level == 5
4588	     && !algorithm_valid_raid5(mddev->new_layout)) ||
4589	    (mddev->new_level == 6
4590	     && !algorithm_valid_raid6(mddev->new_layout))) {
4591		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
4592		       mdname(mddev), mddev->new_layout);
4593		return ERR_PTR(-EIO);
4594	}
4595	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4596		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
4597		       mdname(mddev), mddev->raid_disks);
4598		return ERR_PTR(-EINVAL);
4599	}
4600
4601	if (!mddev->new_chunk_sectors ||
4602	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4603	    !is_power_of_2(mddev->new_chunk_sectors)) {
4604		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
4605		       mdname(mddev), mddev->new_chunk_sectors << 9);
4606		return ERR_PTR(-EINVAL);
4607	}
4608
4609	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
4610	if (conf == NULL)
4611		goto abort;
4612	spin_lock_init(&conf->device_lock);
4613	init_waitqueue_head(&conf->wait_for_stripe);
4614	init_waitqueue_head(&conf->wait_for_overlap);
4615	INIT_LIST_HEAD(&conf->handle_list);
4616	INIT_LIST_HEAD(&conf->hold_list);
4617	INIT_LIST_HEAD(&conf->delayed_list);
4618	INIT_LIST_HEAD(&conf->bitmap_list);
4619	INIT_LIST_HEAD(&conf->inactive_list);
4620	atomic_set(&conf->active_stripes, 0);
4621	atomic_set(&conf->preread_active_stripes, 0);
4622	atomic_set(&conf->active_aligned_reads, 0);
4623	conf->bypass_threshold = BYPASS_THRESHOLD;
4624	conf->recovery_disabled = mddev->recovery_disabled - 1;
4625
4626	conf->raid_disks = mddev->raid_disks;
4627	if (mddev->reshape_position == MaxSector)
4628		conf->previous_raid_disks = mddev->raid_disks;
4629	else
4630		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4631	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
4632	conf->scribble_len = scribble_len(max_disks);
4633
4634	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
4635			      GFP_KERNEL);
4636	if (!conf->disks)
4637		goto abort;
4638
4639	conf->mddev = mddev;
4640
4641	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4642		goto abort;
4643
4644	conf->level = mddev->new_level;
4645	if (raid5_alloc_percpu(conf) != 0)
4646		goto abort;
4647
4648	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4649
4650	list_for_each_entry(rdev, &mddev->disks, same_set) {
4651		raid_disk = rdev->raid_disk;
4652		if (raid_disk >= max_disks
4653		    || raid_disk < 0)
4654			continue;
4655		disk = conf->disks + raid_disk;
4656
4657		disk->rdev = rdev;
4658
4659		if (test_bit(In_sync, &rdev->flags)) {
4660			char b[BDEVNAME_SIZE];
4661			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
4662			       " disk %d\n",
4663			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
4664		} else if (rdev->saved_raid_disk != raid_disk)
4665			/* Cannot rely on bitmap to complete recovery */
4666			conf->fullsync = 1;
4667	}
4668
4669	conf->chunk_sectors = mddev->new_chunk_sectors;
4670	conf->level = mddev->new_level;
4671	if (conf->level == 6)
4672		conf->max_degraded = 2;
4673	else
4674		conf->max_degraded = 1;
4675	conf->algorithm = mddev->new_layout;
4676	conf->max_nr_stripes = NR_STRIPES;
4677	conf->reshape_progress = mddev->reshape_position;
4678	if (conf->reshape_progress != MaxSector) {
4679		conf->prev_chunk_sectors = mddev->chunk_sectors;
4680		conf->prev_algo = mddev->layout;
4681	}
4682
4683	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4684		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4685	if (grow_stripes(conf, conf->max_nr_stripes)) {
4686		printk(KERN_ERR
4687		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
4688		       mdname(mddev), memory);
4689		goto abort;
4690	} else
4691		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
4692		       mdname(mddev), memory);
4693
4694	conf->thread = md_register_thread(raid5d, mddev, NULL);
4695	if (!conf->thread) {
4696		printk(KERN_ERR
4697		       "md/raid:%s: couldn't allocate thread.\n",
4698		       mdname(mddev));
4699		goto abort;
4700	}
4701
4702	return conf;
4703
4704 abort:
4705	if (conf) {
4706		free_conf(conf);
4707		return ERR_PTR(-EIO);
4708	} else
4709		return ERR_PTR(-ENOMEM);
4710}
4711
4712
4713static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
4714{
4715	switch (algo) {
4716	case ALGORITHM_PARITY_0:
4717		if (raid_disk < max_degraded)
4718			return 1;
4719		break;
4720	case ALGORITHM_PARITY_N:
4721		if (raid_disk >= raid_disks - max_degraded)
4722			return 1;
4723		break;
4724	case ALGORITHM_PARITY_0_6:
4725		if (raid_disk == 0 ||
4726		    raid_disk == raid_disks - 1)
4727			return 1;
4728		break;
4729	case ALGORITHM_LEFT_ASYMMETRIC_6:
4730	case ALGORITHM_RIGHT_ASYMMETRIC_6:
4731	case ALGORITHM_LEFT_SYMMETRIC_6:
4732	case ALGORITHM_RIGHT_SYMMETRIC_6:
4733		if (raid_disk == raid_disks - 1)
4734			return 1;
4735	}
4736	return 0;
4737}
4738
4739static int run(struct mddev *mddev)
4740{
4741	struct r5conf *conf;
4742	int working_disks = 0;
4743	int dirty_parity_disks = 0;
4744	struct md_rdev *rdev;
4745	sector_t reshape_offset = 0;
4746
4747	if (mddev->recovery_cp != MaxSector)
4748		printk(KERN_NOTICE "md/raid:%s: not clean"
4749		       " -- starting background reconstruction\n",
4750		       mdname(mddev));
4751	if (mddev->reshape_position != MaxSector) {
4752		/* Check that we can continue the reshape.
4753		 * Currently only disks can change, it must
4754		 * increase, and we must be past the point where
4755		 * a stripe over-writes itself
4756		 */
4757		sector_t here_new, here_old;
4758		int old_disks;
4759		int max_degraded = (mddev->level == 6 ? 2 : 1);
4760
4761		if (mddev->new_level != mddev->level) {
4762			printk(KERN_ERR "md/raid:%s: unsupported reshape "
4763			       "required - aborting.\n",
4764			       mdname(mddev));
4765			return -EINVAL;
4766		}
4767		old_disks = mddev->raid_disks - mddev->delta_disks;
4768		/* reshape_position must be on a new-stripe boundary, and one
4769		 * further up in new geometry must map after here in old
4770		 * geometry.
4771		 */
4772		here_new = mddev->reshape_position;
4773		if (sector_div(here_new, mddev->new_chunk_sectors *
4774			       (mddev->raid_disks - max_degraded))) {
4775			printk(KERN_ERR "md/raid:%s: reshape_position not "
4776			       "on a stripe boundary\n", mdname(mddev));
4777			return -EINVAL;
4778		}
4779		reshape_offset = here_new * mddev->new_chunk_sectors;
4780		/* here_new is the stripe we will write to */
4781		here_old = mddev->reshape_position;
4782		sector_div(here_old, mddev->chunk_sectors *
4783			   (old_disks-max_degraded));
4784		/* here_old is the first stripe that we might need to read
4785		 * from */
4786		if (mddev->delta_disks == 0) {
4787			/* We cannot be sure it is safe to start an in-place
4788			 * reshape.  It is only safe if user-space is monitoring
4789			 * and taking constant backups.
4790			 * mdadm always starts a situation like this in
4791			 * readonly mode so it can take control before
4792			 * allowing any writes.  So just check for that.
4793			 */
4794			if ((here_new * mddev->new_chunk_sectors !=
4795			     here_old * mddev->chunk_sectors) ||
4796			    mddev->ro == 0) {
4797				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
4798				       " in read-only mode - aborting\n",
4799				       mdname(mddev));
4800				return -EINVAL;
4801			}
4802		} else if (mddev->delta_disks < 0
4803		    ? (here_new * mddev->new_chunk_sectors <=
4804		       here_old * mddev->chunk_sectors)
4805		    : (here_new * mddev->new_chunk_sectors >=
4806		       here_old * mddev->chunk_sectors)) {
4807			/* Reading from the same stripe as writing to - bad */
4808			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
4809			       "auto-recovery - aborting.\n",
4810			       mdname(mddev));
4811			return -EINVAL;
4812		}
4813		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
4814		       mdname(mddev));
4815		/* OK, we should be able to continue; */
4816	} else {
4817		BUG_ON(mddev->level != mddev->new_level);
4818		BUG_ON(mddev->layout != mddev->new_layout);
4819		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
4820		BUG_ON(mddev->delta_disks != 0);
4821	}
4822
4823	if (mddev->private == NULL)
4824		conf = setup_conf(mddev);
4825	else
4826		conf = mddev->private;
4827
4828	if (IS_ERR(conf))
4829		return PTR_ERR(conf);
4830
4831	mddev->thread = conf->thread;
4832	conf->thread = NULL;
4833	mddev->private = conf;
4834
4835	/*
4836	 * 0 for a fully functional array, 1 or 2 for a degraded array.
4837	 */
4838	list_for_each_entry(rdev, &mddev->disks, same_set) {
4839		if (rdev->raid_disk < 0)
4840			continue;
4841		if (test_bit(In_sync, &rdev->flags)) {
4842			working_disks++;
4843			continue;
4844		}
4845		/* This disk is not fully in-sync.  However if it
4846		 * just stored parity (beyond the recovery_offset),
4847		 * then we don't need to be concerned about the
4848		 * array being dirty.
4849		 * When reshape goes 'backwards', we never have
4850		 * partially completed devices, so we only need
4851		 * to worry about reshape going forwards.
4852		 */
4853		/* Hack because v0.91 doesn't store recovery_offset properly. */
4854		if (mddev->major_version == 0 &&
4855		    mddev->minor_version > 90)
4856			rdev->recovery_offset = reshape_offset;
4857
4858		if (rdev->recovery_offset < reshape_offset) {
4859			/* We need to check old and new layout */
4860			if (!only_parity(rdev->raid_disk,
4861					 conf->algorithm,
4862					 conf->raid_disks,
4863					 conf->max_degraded))
4864				continue;
4865		}
4866		if (!only_parity(rdev->raid_disk,
4867				 conf->prev_algo,
4868				 conf->previous_raid_disks,
4869				 conf->max_degraded))
4870			continue;
4871		dirty_parity_disks++;
4872	}
4873
4874	mddev->degraded = calc_degraded(conf);
4875
4876	if (has_failed(conf)) {
4877		printk(KERN_ERR "md/raid:%s: not enough operational devices"
4878			" (%d/%d failed)\n",
4879			mdname(mddev), mddev->degraded, conf->raid_disks);
4880		goto abort;
4881	}
4882
4883	/* device size must be a multiple of chunk size */
4884	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
4885	mddev->resync_max_sectors = mddev->dev_sectors;
4886
4887	if (mddev->degraded > dirty_parity_disks &&
4888	    mddev->recovery_cp != MaxSector) {
4889		if (mddev->ok_start_degraded)
4890			printk(KERN_WARNING
4891			       "md/raid:%s: starting dirty degraded array"
4892			       " - data corruption possible.\n",
4893			       mdname(mddev));
4894		else {
4895			printk(KERN_ERR
4896			       "md/raid:%s: cannot start dirty degraded array.\n",
4897			       mdname(mddev));
4898			goto abort;
4899		}
4900	}
4901
4902	if (mddev->degraded == 0)
4903		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
4904		       " devices, algorithm %d\n", mdname(mddev), conf->level,
4905		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4906		       mddev->new_layout);
4907	else
4908		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
4909		       " out of %d devices, algorithm %d\n",
4910		       mdname(mddev), conf->level,
4911		       mddev->raid_disks - mddev->degraded,
4912		       mddev->raid_disks, mddev->new_layout);
4913
4914	print_raid5_conf(conf);
4915
4916	if (conf->reshape_progress != MaxSector) {
4917		conf->reshape_safe = conf->reshape_progress;
4918		atomic_set(&conf->reshape_stripes, 0);
4919		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4920		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4921		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4922		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4923		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4924							"reshape");
4925	}
4926
4927
4928	/* Ok, everything is just fine now */
4929	if (mddev->to_remove == &raid5_attrs_group)
4930		mddev->to_remove = NULL;
4931	else if (mddev->kobj.sd &&
4932	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4933		printk(KERN_WARNING
4934		       "raid5: failed to create sysfs attributes for %s\n",
4935		       mdname(mddev));
4936	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
4937
4938	if (mddev->queue) {
4939		int chunk_size;
4940		/* read-ahead size must cover two whole stripes, which
4941		 * is 2 * (datadisks) * chunksize, where 'datadisks' is
4942		 * the number of non-parity devices
4943		 */
4944		int data_disks = conf->previous_raid_disks - conf->max_degraded;
4945		int stripe = data_disks *
4946			((mddev->chunk_sectors << 9) / PAGE_SIZE);
4947		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4948			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4949
4950		blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4951
4952		mddev->queue->backing_dev_info.congested_data = mddev;
4953		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4954
4955		chunk_size = mddev->chunk_sectors << 9;
4956		blk_queue_io_min(mddev->queue, chunk_size);
4957		blk_queue_io_opt(mddev->queue, chunk_size *
4958				 (conf->raid_disks - conf->max_degraded));
4959
4960		list_for_each_entry(rdev, &mddev->disks, same_set)
4961			disk_stack_limits(mddev->gendisk, rdev->bdev,
4962					  rdev->data_offset << 9);
4963	}
4964
4965	return 0;
4966abort:
4967	md_unregister_thread(&mddev->thread);
4968	print_raid5_conf(conf);
4969	free_conf(conf);
4970	mddev->private = NULL;
4971	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
4972	return -EIO;
4973}
4974
4975static int stop(struct mddev *mddev)
4976{
4977	struct r5conf *conf = mddev->private;
4978
4979	md_unregister_thread(&mddev->thread);
4980	if (mddev->queue)
4981		mddev->queue->backing_dev_info.congested_fn = NULL;
4982	free_conf(conf);
4983	mddev->private = NULL;
4984	mddev->to_remove = &raid5_attrs_group;
4985	return 0;
4986}
4987
4988static void status(struct seq_file *seq, struct mddev *mddev)
4989{
4990	struct r5conf *conf = mddev->private;
4991	int i;
4992
4993	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
4994		mddev->chunk_sectors / 2, mddev->layout);
4995	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
4996	for (i = 0; i < conf->raid_disks; i++)
4997		seq_printf (seq, "%s",
4998			       conf->disks[i].rdev &&
4999			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5000	seq_printf (seq, "]");
5001}
5002
5003static void print_raid5_conf (struct r5conf *conf)
5004{
5005	int i;
5006	struct disk_info *tmp;
5007
5008	printk(KERN_DEBUG "RAID conf printout:\n");
5009	if (!conf) {
5010		printk("(conf==NULL)\n");
5011		return;
5012	}
5013	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
5014	       conf->raid_disks,
5015	       conf->raid_disks - conf->mddev->degraded);
5016
5017	for (i = 0; i < conf->raid_disks; i++) {
5018		char b[BDEVNAME_SIZE];
5019		tmp = conf->disks + i;
5020		if (tmp->rdev)
5021			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
5022			       i, !test_bit(Faulty, &tmp->rdev->flags),
5023			       bdevname(tmp->rdev->bdev, b));
5024	}
5025}
5026
5027static int raid5_spare_active(struct mddev *mddev)
5028{
5029	int i;
5030	struct r5conf *conf = mddev->private;
5031	struct disk_info *tmp;
5032	int count = 0;
5033	unsigned long flags;
5034
5035	for (i = 0; i < conf->raid_disks; i++) {
5036		tmp = conf->disks + i;
5037		if (tmp->rdev
5038		    && tmp->rdev->recovery_offset == MaxSector
5039		    && !test_bit(Faulty, &tmp->rdev->flags)
5040		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5041			count++;
5042			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
5043		}
5044	}
5045	spin_lock_irqsave(&conf->device_lock, flags);
5046	mddev->degraded = calc_degraded(conf);
5047	spin_unlock_irqrestore(&conf->device_lock, flags);
5048	print_raid5_conf(conf);
5049	return count;
5050}
5051
5052static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5053{
5054	struct r5conf *conf = mddev->private;
5055	int err = 0;
5056	int number = rdev->raid_disk;
5057	struct disk_info *p = conf->disks + number;
5058
5059	print_raid5_conf(conf);
5060	if (rdev == p->rdev) {
5061		if (number >= conf->raid_disks &&
5062		    conf->reshape_progress == MaxSector)
5063			clear_bit(In_sync, &rdev->flags);
5064
5065		if (test_bit(In_sync, &rdev->flags) ||
5066		    atomic_read(&rdev->nr_pending)) {
5067			err = -EBUSY;
5068			goto abort;
5069		}
5070		/* Only remove non-faulty devices if recovery
5071		 * isn't possible.
5072		 */
5073		if (!test_bit(Faulty, &rdev->flags) &&
5074		    mddev->recovery_disabled != conf->recovery_disabled &&
5075		    !has_failed(conf) &&
5076		    number < conf->raid_disks) {
5077			err = -EBUSY;
5078			goto abort;
5079		}
5080		p->rdev = NULL;
5081		synchronize_rcu();
5082		if (atomic_read(&rdev->nr_pending)) {
5083			/* lost the race, try later */
5084			err = -EBUSY;
5085			p->rdev = rdev;
5086		}
5087	}
5088abort:
5089
5090	print_raid5_conf(conf);
5091	return err;
5092}
5093
5094static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5095{
5096	struct r5conf *conf = mddev->private;
5097	int err = -EEXIST;
5098	int disk;
5099	struct disk_info *p;
5100	int first = 0;
5101	int last = conf->raid_disks - 1;
5102
5103	if (mddev->recovery_disabled == conf->recovery_disabled)
5104		return -EBUSY;
5105
5106	if (has_failed(conf))
5107		/* no point adding a device */
5108		return -EINVAL;
5109
5110	if (rdev->raid_disk >= 0)
5111		first = last = rdev->raid_disk;
5112
5113	/*
5114	 * find the disk ... but prefer rdev->saved_raid_disk
5115	 * if possible.
5116	 */
5117	if (rdev->saved_raid_disk >= 0 &&
5118	    rdev->saved_raid_disk >= first &&
5119	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
5120		disk = rdev->saved_raid_disk;
5121	else
5122		disk = first;
5123	for ( ; disk <= last ; disk++)
5124		if ((p=conf->disks + disk)->rdev == NULL) {
5125			clear_bit(In_sync, &rdev->flags);
5126			rdev->raid_disk = disk;
5127			err = 0;
5128			if (rdev->saved_raid_disk != disk)
5129				conf->fullsync = 1;
5130			rcu_assign_pointer(p->rdev, rdev);
5131			break;
5132		}
5133	print_raid5_conf(conf);
5134	return err;
5135}
5136
5137static int raid5_resize(struct mddev *mddev, sector_t sectors)
5138{
5139	/* no resync is happening, and there is enough space
5140	 * on all devices, so we can resize.
5141	 * We need to make sure resync covers any new space.
5142	 * If the array is shrinking we should possibly wait until
5143	 * any io in the removed space completes, but it hardly seems
5144	 * worth it.
5145	 */
5146	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5147	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5148					       mddev->raid_disks));
5149	if (mddev->array_sectors >
5150	    raid5_size(mddev, sectors, mddev->raid_disks))
5151		return -EINVAL;
5152	set_capacity(mddev->gendisk, mddev->array_sectors);
5153	revalidate_disk(mddev->gendisk);
5154	if (sectors > mddev->dev_sectors &&
5155	    mddev->recovery_cp > mddev->dev_sectors) {
5156		mddev->recovery_cp = mddev->dev_sectors;
5157		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5158	}
5159	mddev->dev_sectors = sectors;
5160	mddev->resync_max_sectors = sectors;
5161	return 0;
5162}
5163
5164static int check_stripe_cache(struct mddev *mddev)
5165{
5166	/* Can only proceed if there are plenty of stripe_heads.
5167	 * We need a minimum of one full stripe, and for sensible progress
5168	 * it is best to have about 4 times that.
5169	 * If we require 4 times, then the default 256 4K stripe_heads will
5170	 * allow for chunk sizes up to 256K, which is probably OK.
5171	 * If the chunk size is greater, user-space should request more
5172	 * stripe_heads first.
5173	 */
5174	struct r5conf *conf = mddev->private;
5175	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5176	    > conf->max_nr_stripes ||
5177	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5178	    > conf->max_nr_stripes) {
5179		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
5180		       mdname(mddev),
5181		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5182			/ STRIPE_SIZE)*4);
5183		return 0;
5184	}
5185	return 1;
5186}
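/*
 * Worked example (editor's note): with a 512K chunk, one chunk spans
 * 524288 / 4096 = 128 stripe_heads, so the check above wants at least
 * 128 * 4 = 512 of them -- more than the default of 256 -- and
 * stripe_cache_size would have to be raised before reshaping.
 */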
5187
5188static int check_reshape(struct mddev *mddev)
5189{
5190	struct r5conf *conf = mddev->private;
5191
5192	if (mddev->delta_disks == 0 &&
5193	    mddev->new_layout == mddev->layout &&
5194	    mddev->new_chunk_sectors == mddev->chunk_sectors)
5195		return 0; /* nothing to do */
5196	if (mddev->bitmap)
5197		/* Cannot grow a bitmap yet */
5198		return -EBUSY;
5199	if (has_failed(conf))
5200		return -EINVAL;
5201	if (mddev->delta_disks < 0) {
5202		/* We might be able to shrink, but the devices must
5203		 * be made bigger first.
5204		 * For raid6, 4 devices is the minimum size.
5205		 * Otherwise 2 is the minimum.
5206		 */
5207		int min = 2;
5208		if (mddev->level == 6)
5209			min = 4;
5210		if (mddev->raid_disks + mddev->delta_disks < min)
5211			return -EINVAL;
5212	}
5213
5214	if (!check_stripe_cache(mddev))
5215		return -ENOSPC;
5216
5217	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5218}
5219
5220static int raid5_start_reshape(struct mddev *mddev)
5221{
5222	struct r5conf *conf = mddev->private;
5223	struct md_rdev *rdev;
5224	int spares = 0;
5225	unsigned long flags;
5226
5227	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5228		return -EBUSY;
5229
5230	if (!check_stripe_cache(mddev))
5231		return -ENOSPC;
5232
5233	list_for_each_entry(rdev, &mddev->disks, same_set)
5234		if (!test_bit(In_sync, &rdev->flags)
5235		    && !test_bit(Faulty, &rdev->flags))
5236			spares++;
5237
5238	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5239		/* Not enough devices even to make a degraded array
5240		 * of that size
5241		 */
5242		return -EINVAL;
5243
5244	/* Refuse to reduce size of the array.  Any reductions in
5245	 * array size must be through explicit setting of array_size
5246	 * attribute.
5247	 */
5248	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5249	    < mddev->array_sectors) {
5250		printk(KERN_ERR "md/raid:%s: array size must be reduced "
5251		       "before number of disks\n", mdname(mddev));
5252		return -EINVAL;
5253	}
5254
5255	atomic_set(&conf->reshape_stripes, 0);
5256	spin_lock_irq(&conf->device_lock);
5257	conf->previous_raid_disks = conf->raid_disks;
5258	conf->raid_disks += mddev->delta_disks;
5259	conf->prev_chunk_sectors = conf->chunk_sectors;
5260	conf->chunk_sectors = mddev->new_chunk_sectors;
5261	conf->prev_algo = conf->algorithm;
5262	conf->algorithm = mddev->new_layout;
5263	if (mddev->delta_disks < 0)
5264		conf->reshape_progress = raid5_size(mddev, 0, 0);
5265	else
5266		conf->reshape_progress = 0;
5267	conf->reshape_safe = conf->reshape_progress;
5268	conf->generation++;
5269	spin_unlock_irq(&conf->device_lock);
5270
5271	/* Add some new drives, as many as will fit.
5272	 * We know there are enough to make the newly sized array work.
5273	 * Don't add devices if we are reducing the number of
5274	 * devices in the array.  This is because it is not possible
5275	 * to correctly record the "partially reconstructed" state of
5276	 * such devices during the reshape and confusion could result.
5277	 */
5278	if (mddev->delta_disks >= 0) {
5279		int added_devices = 0;
5280		list_for_each_entry(rdev, &mddev->disks, same_set)
5281			if (rdev->raid_disk < 0 &&
5282			    !test_bit(Faulty, &rdev->flags)) {
5283				if (raid5_add_disk(mddev, rdev) == 0) {
5284					if (rdev->raid_disk
5285					    >= conf->previous_raid_disks) {
5286						set_bit(In_sync, &rdev->flags);
5287						added_devices++;
5288					} else
5289						rdev->recovery_offset = 0;
5290
5291					if (sysfs_link_rdev(mddev, rdev))
5292						/* Failure here is OK */;
5293				}
5294			} else if (rdev->raid_disk >= conf->previous_raid_disks
5295				   && !test_bit(Faulty, &rdev->flags)) {
5296				/* This is a spare that was manually added */
5297				set_bit(In_sync, &rdev->flags);
5298				added_devices++;
5299			}
5300
5301		/* When a reshape changes the number of devices,
5302		 * ->degraded is measured against the larger of the
5303		 * pre and post number of devices.
5304		 */
5305		spin_lock_irqsave(&conf->device_lock, flags);
5306		mddev->degraded = calc_degraded(conf);
5307		spin_unlock_irqrestore(&conf->device_lock, flags);
5308	}
5309	mddev->raid_disks = conf->raid_disks;
5310	mddev->reshape_position = conf->reshape_progress;
5311	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5312
5313	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5314	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5315	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5316	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5317	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5318						"reshape");
5319	if (!mddev->sync_thread) {
5320		mddev->recovery = 0;
5321		spin_lock_irq(&conf->device_lock);
5322		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5323		conf->reshape_progress = MaxSector;
5324		spin_unlock_irq(&conf->device_lock);
5325		return -EAGAIN;
5326	}
5327	conf->reshape_checkpoint = jiffies;
5328	md_wakeup_thread(mddev->sync_thread);
5329	md_new_event(mddev);
5330	return 0;
5331}
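
/*
 * Illustrative sketch, not part of the driver build: the spare-count test
 * made in raid5_start_reshape() above, with a worked example.  Growing a
 * 4-disk raid5 (max_degraded == 1) by two disks (delta_disks == 2) while
 * one member is already failed (degraded == 1) requires
 * spares - 1 >= 2 - 1, i.e. at least two spare devices must be present.
 */
#if 0
static int enough_spares_for_reshape(int spares, int degraded,
				     int delta_disks, int max_degraded)
{
	/* mirrors the "-EINVAL if too few" check above, inverted */
	return spares - degraded >= delta_disks - max_degraded;
}
#endif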
5332
5333/* This is called from the reshape thread and should make any
5334 * changes needed in 'conf'
5335 */
5336static void end_reshape(struct r5conf *conf)
5337{
5338
5339	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
5340
5341		spin_lock_irq(&conf->device_lock);
5342		conf->previous_raid_disks = conf->raid_disks;
5343		conf->reshape_progress = MaxSector;
5344		spin_unlock_irq(&conf->device_lock);
5345		wake_up(&conf->wait_for_overlap);
5346
5347		/* read-ahead size must cover two whole stripes, which is
5348		 * 2 * (datadisks) * chunksize, where 'datadisks' is the number of data disks
5349		 */
5350		if (conf->mddev->queue) {
5351			int data_disks = conf->raid_disks - conf->max_degraded;
5352			int stripe = data_disks * ((conf->chunk_sectors << 9)
5353						   / PAGE_SIZE);
5354			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5355				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
5356		}
5357	}
5358}
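
/*
 * Illustrative sketch, not part of the driver build: the read-ahead rule
 * applied in end_reshape() above.  Example with 4KiB pages: a 6-disk
 * raid5 (5 data disks) using 512KiB chunks (chunk_sectors == 1024) gives
 * stripe = 5 * (1024 << 9) / 4096 = 640 pages, so ra_pages is raised to
 * at least 1280 pages (5MiB) - two full stripes of read-ahead.
 */
#if 0
static unsigned long reshape_min_ra_pages(int data_disks, int chunk_sectors)
{
	unsigned long stripe = data_disks *
		((chunk_sectors << 9) / PAGE_SIZE);

	return 2 * stripe;
}
#endif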
5359
5360/* This is called from the raid5d thread with mddev_lock held.
5361 * It makes config changes to the device.
5362 */
5363static void raid5_finish_reshape(struct mddev *mddev)
5364{
5365	struct r5conf *conf = mddev->private;
5366
5367	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5368
5369		if (mddev->delta_disks > 0) {
5370			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5371			set_capacity(mddev->gendisk, mddev->array_sectors);
5372			revalidate_disk(mddev->gendisk);
5373		} else {
5374			int d;
5375			spin_lock_irq(&conf->device_lock);
5376			mddev->degraded = calc_degraded(conf);
5377			spin_unlock_irq(&conf->device_lock);
5378			for (d = conf->raid_disks ;
5379			     d < conf->raid_disks - mddev->delta_disks;
5380			     d++) {
5381				struct md_rdev *rdev = conf->disks[d].rdev;
5382				if (rdev &&
5383				    raid5_remove_disk(mddev, rdev) == 0) {
5384					sysfs_unlink_rdev(mddev, rdev);
5385					rdev->raid_disk = -1;
5386				}
5387			}
5388		}
5389		mddev->layout = conf->algorithm;
5390		mddev->chunk_sectors = conf->chunk_sectors;
5391		mddev->reshape_position = MaxSector;
5392		mddev->delta_disks = 0;
5393	}
5394}
5395
5396static void raid5_quiesce(struct mddev *mddev, int state)
5397{
5398	struct r5conf *conf = mddev->private;
5399
5400	switch(state) {
5401	case 2: /* resume for a suspend */
5402		wake_up(&conf->wait_for_overlap);
5403		break;
5404
5405	case 1: /* stop all writes */
5406		spin_lock_irq(&conf->device_lock);
5407		/* '2' tells resync/reshape to pause so that all
5408		 * active stripes can drain
5409		 */
5410		conf->quiesce = 2;
5411		wait_event_lock_irq(conf->wait_for_stripe,
5412				    atomic_read(&conf->active_stripes) == 0 &&
5413				    atomic_read(&conf->active_aligned_reads) == 0,
5414				    conf->device_lock, /* nothing */);
5415		conf->quiesce = 1;
5416		spin_unlock_irq(&conf->device_lock);
5417		/* allow reshape to continue */
5418		wake_up(&conf->wait_for_overlap);
5419		break;
5420
5421	case 0: /* re-enable writes */
5422		spin_lock_irq(&conf->device_lock);
5423		conf->quiesce = 0;
5424		wake_up(&conf->wait_for_stripe);
5425		wake_up(&conf->wait_for_overlap);
5426		spin_unlock_irq(&conf->device_lock);
5427		break;
5428	}
5429}
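
/*
 * Illustrative sketch, not part of the driver build: how a caller is
 * expected to bracket work with the quiesce states handled above.  In
 * the real kernel the transitions come from md's suspend/resume paths
 * through ->quiesce(); do_quiet_work() below is a hypothetical stand-in.
 */
#if 0
extern void do_quiet_work(struct mddev *mddev);	/* hypothetical */

static void quiesced_operation(struct mddev *mddev)
{
	raid5_quiesce(mddev, 1);	/* stop writes, drain active stripes */
	do_quiet_work(mddev);		/* array is quiet here */
	raid5_quiesce(mddev, 0);	/* re-enable writes, wake waiters */
}
#endif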
5430
5431
5432static void *raid45_takeover_raid0(struct mddev *mddev, int level)
5433{
5434	struct r0conf *raid0_conf = mddev->private;
5435	sector_t sectors;
5436
5437	/* for raid0 takeover only one zone is supported */
5438	if (raid0_conf->nr_strip_zones > 1) {
5439		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
5440		       mdname(mddev));
5441		return ERR_PTR(-EINVAL);
5442	}
5443
5444	sectors = raid0_conf->strip_zone[0].zone_end;
5445	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
5446	mddev->dev_sectors = sectors;
5447	mddev->new_level = level;
5448	mddev->new_layout = ALGORITHM_PARITY_N;
5449	mddev->new_chunk_sectors = mddev->chunk_sectors;
5450	mddev->raid_disks += 1;
5451	mddev->delta_disks = 1;
5452	/* make sure it will not be marked as dirty */
5453	mddev->recovery_cp = MaxSector;
5454
5455	return setup_conf(mddev);
5456}
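
/*
 * Illustrative sketch, not part of the driver build: the per-device size
 * derived above from the single raid0 zone.  Example: a zone ending at
 * 4194304 sectors (2GiB) built from 4 devices means each member holds
 * 4194304 / 4 = 1048576 sectors; the takeover then adds one extra device
 * (delta_disks = 1) to carry the parity.
 */
#if 0
static sector_t raid0_member_sectors(struct r0conf *raid0_conf)
{
	sector_t sectors = raid0_conf->strip_zone[0].zone_end;

	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	return sectors;
}
#endif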
5457
5458
5459static void *raid5_takeover_raid1(struct mddev *mddev)
5460{
5461	int chunksect;
5462
5463	if (mddev->raid_disks != 2 ||
5464	    mddev->degraded > 1)
5465		return ERR_PTR(-EINVAL);
5466
5467	/* Should we check whether there are write-behind devices? */
5468
5469	chunksect = 64*2; /* 64K by default */
5470
5471	/* The array size must be an exact multiple of the chunk size */
5472	while (chunksect && (mddev->array_sectors & (chunksect-1)))
5473		chunksect >>= 1;
5474
5475	if ((chunksect<<9) < STRIPE_SIZE)
5476		/* array size does not allow a suitable chunk size */
5477		return ERR_PTR(-EINVAL);
5478
5479	mddev->new_level = 5;
5480	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5481	mddev->new_chunk_sectors = chunksect;
5482
5483	return setup_conf(mddev);
5484}
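
/*
 * Illustrative sketch, not part of the driver build: the chunk-size
 * search performed above.  Example: an array of 1048584 sectors is not a
 * multiple of 128 sectors (64KiB), so the loop halves chunksect until it
 * divides the size; 1048584 = 8 * 131073, so it settles on 8 sectors
 * (4KiB), which still satisfies (8 << 9) >= STRIPE_SIZE on 4KiB-page
 * builds and the takeover is allowed.
 */
#if 0
static int raid1_takeover_chunksect(sector_t array_sectors)
{
	int chunksect = 64 * 2;		/* start at 64KiB, as above */

	while (chunksect && (array_sectors & (chunksect - 1)))
		chunksect >>= 1;	/* largest power of two dividing the size */
	return chunksect;		/* caller still checks against STRIPE_SIZE */
}
#endif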
5485
5486static void *raid5_takeover_raid6(struct mddev *mddev)
5487{
5488	int new_layout;
5489
5490	switch (mddev->layout) {
5491	case ALGORITHM_LEFT_ASYMMETRIC_6:
5492		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5493		break;
5494	case ALGORITHM_RIGHT_ASYMMETRIC_6:
5495		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5496		break;
5497	case ALGORITHM_LEFT_SYMMETRIC_6:
5498		new_layout = ALGORITHM_LEFT_SYMMETRIC;
5499		break;
5500	case ALGORITHM_RIGHT_SYMMETRIC_6:
5501		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5502		break;
5503	case ALGORITHM_PARITY_0_6:
5504		new_layout = ALGORITHM_PARITY_0;
5505		break;
5506	case ALGORITHM_PARITY_N:
5507		new_layout = ALGORITHM_PARITY_N;
5508		break;
5509	default:
5510		return ERR_PTR(-EINVAL);
5511	}
5512	mddev->new_level = 5;
5513	mddev->new_layout = new_layout;
5514	mddev->delta_disks = -1;
5515	mddev->raid_disks -= 1;
5516	return setup_conf(mddev);
5517}
5518
5519
5520static int raid5_check_reshape(struct mddev *mddev)
5521{
5522	/* For a 2-drive array, the layout and chunk size can be changed
5523	 * immediately, as no restriping is needed.
5524	 * For larger arrays we record the new value - after validation
5525	 * to be used by a reshape pass.
5526	 */
5527	struct r5conf *conf = mddev->private;
5528	int new_chunk = mddev->new_chunk_sectors;
5529
5530	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
5531		return -EINVAL;
5532	if (new_chunk > 0) {
5533		if (!is_power_of_2(new_chunk))
5534			return -EINVAL;
5535		if (new_chunk < (PAGE_SIZE>>9))
5536			return -EINVAL;
5537		if (mddev->array_sectors & (new_chunk-1))
5538			/* not a factor of the array size */
5539			return -EINVAL;
5540	}
5541
5542	/* They look valid */
5543
5544	if (mddev->raid_disks == 2) {
5545		/* can make the change immediately */
5546		if (mddev->new_layout >= 0) {
5547			conf->algorithm = mddev->new_layout;
5548			mddev->layout = mddev->new_layout;
5549		}
5550		if (new_chunk > 0) {
5551			conf->chunk_sectors = new_chunk ;
5552			mddev->chunk_sectors = new_chunk;
5553		}
5554		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5555		md_wakeup_thread(mddev->thread);
5556	}
5557	return check_reshape(mddev);
5558}
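
/*
 * Illustrative sketch, not part of the driver build: the chunk validation
 * shared by raid5_check_reshape() above and raid6_check_reshape() below,
 * expressed as a hypothetical predicate.  The real code open-codes these
 * tests.
 */
#if 0
static int new_chunk_is_valid(struct mddev *mddev, int new_chunk)
{
	if (!is_power_of_2(new_chunk))
		return 0;
	if (new_chunk < (PAGE_SIZE >> 9))
		return 0;			/* at least one page per chunk */
	if (mddev->array_sectors & (new_chunk - 1))
		return 0;			/* must divide the array size */
	return 1;
}
#endif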
5559
5560static int raid6_check_reshape(struct mddev *mddev)
5561{
5562	int new_chunk = mddev->new_chunk_sectors;
5563
5564	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
5565		return -EINVAL;
5566	if (new_chunk > 0) {
5567		if (!is_power_of_2(new_chunk))
5568			return -EINVAL;
5569		if (new_chunk < (PAGE_SIZE >> 9))
5570			return -EINVAL;
5571		if (mddev->array_sectors & (new_chunk-1))
5572			/* not factor of array size */
5573			/* not a factor of the array size */
5574	}
5575
5576	/* They look valid */
5577	return check_reshape(mddev);
5578}
5579
5580static void *raid5_takeover(struct mddev *mddev)
5581{
5582	/* raid5 can take over:
5583	 *  raid0 - if there is only one strip zone - make it a raid4 layout
5584	 *  raid1 - if there are two drives.  We need to know the chunk size
5585	 *  raid4 - trivial - just use a raid4 layout.
5586	 *  raid6 - Providing it is a *_6 layout
5587	 */
5588	if (mddev->level == 0)
5589		return raid45_takeover_raid0(mddev, 5);
5590	if (mddev->level == 1)
5591		return raid5_takeover_raid1(mddev);
5592	if (mddev->level == 4) {
5593		mddev->new_layout = ALGORITHM_PARITY_N;
5594		mddev->new_level = 5;
5595		return setup_conf(mddev);
5596	}
5597	if (mddev->level == 6)
5598		return raid5_takeover_raid6(mddev);
5599
5600	return ERR_PTR(-EINVAL);
5601}
5602
5603static void *raid4_takeover(struct mddev *mddev)
5604{
5605	/* raid4 can take over:
5606	 *  raid0 - if there is only one strip zone
5607	 *  raid5 - if layout is right
5608	 */
5609	if (mddev->level == 0)
5610		return raid45_takeover_raid0(mddev, 4);
5611	if (mddev->level == 5 &&
5612	    mddev->layout == ALGORITHM_PARITY_N) {
5613		mddev->new_layout = 0;
5614		mddev->new_level = 4;
5615		return setup_conf(mddev);
5616	}
5617	return ERR_PTR(-EINVAL);
5618}
5619
5620static struct md_personality raid5_personality;
5621
5622static void *raid6_takeover(struct mddev *mddev)
5623{
5624	/* Currently can only take over a raid5.  We map the
5625	 * personality to an equivalent raid6 personality
5626	 * with the Q block at the end.
5627	 */
5628	int new_layout;
5629
5630	if (mddev->pers != &raid5_personality)
5631		return ERR_PTR(-EINVAL);
5632	if (mddev->degraded > 1)
5633		return ERR_PTR(-EINVAL);
5634	if (mddev->raid_disks > 253)
5635		return ERR_PTR(-EINVAL);
5636	if (mddev->raid_disks < 3)
5637		return ERR_PTR(-EINVAL);
5638
5639	switch (mddev->layout) {
5640	case ALGORITHM_LEFT_ASYMMETRIC:
5641		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5642		break;
5643	case ALGORITHM_RIGHT_ASYMMETRIC:
5644		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5645		break;
5646	case ALGORITHM_LEFT_SYMMETRIC:
5647		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5648		break;
5649	case ALGORITHM_RIGHT_SYMMETRIC:
5650		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5651		break;
5652	case ALGORITHM_PARITY_0:
5653		new_layout = ALGORITHM_PARITY_0_6;
5654		break;
5655	case ALGORITHM_PARITY_N:
5656		new_layout = ALGORITHM_PARITY_N;
5657		break;
5658	default:
5659		return ERR_PTR(-EINVAL);
5660	}
5661	mddev->new_level = 6;
5662	mddev->new_layout = new_layout;
5663	mddev->delta_disks = 1;
5664	mddev->raid_disks += 1;
5665	return setup_conf(mddev);
5666}
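
/*
 * Illustrative sketch, not part of the driver build: the layout mapping
 * in raid6_takeover() above is the inverse of raid5_takeover_raid6(), so
 * a raid5 array taken over to raid6 and back keeps its layout, e.g.
 * ALGORITHM_LEFT_SYMMETRIC -> ALGORITHM_LEFT_SYMMETRIC_6 ->
 * ALGORITHM_LEFT_SYMMETRIC.  Only ALGORITHM_PARITY_N maps to itself.
 * The hypothetical table below restates the same pairs.
 */
#if 0
static const int raid5_to_raid6_layout[][2] = {
	{ ALGORITHM_LEFT_ASYMMETRIC,	ALGORITHM_LEFT_ASYMMETRIC_6 },
	{ ALGORITHM_RIGHT_ASYMMETRIC,	ALGORITHM_RIGHT_ASYMMETRIC_6 },
	{ ALGORITHM_LEFT_SYMMETRIC,	ALGORITHM_LEFT_SYMMETRIC_6 },
	{ ALGORITHM_RIGHT_SYMMETRIC,	ALGORITHM_RIGHT_SYMMETRIC_6 },
	{ ALGORITHM_PARITY_0,		ALGORITHM_PARITY_0_6 },
	{ ALGORITHM_PARITY_N,		ALGORITHM_PARITY_N },
};
#endif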
5667
5668
5669static struct md_personality raid6_personality =
5670{
5671	.name		= "raid6",
5672	.level		= 6,
5673	.owner		= THIS_MODULE,
5674	.make_request	= make_request,
5675	.run		= run,
5676	.stop		= stop,
5677	.status		= status,
5678	.error_handler	= error,
5679	.hot_add_disk	= raid5_add_disk,
5680	.hot_remove_disk= raid5_remove_disk,
5681	.spare_active	= raid5_spare_active,
5682	.sync_request	= sync_request,
5683	.resize		= raid5_resize,
5684	.size		= raid5_size,
5685	.check_reshape	= raid6_check_reshape,
5686	.start_reshape  = raid5_start_reshape,
5687	.finish_reshape = raid5_finish_reshape,
5688	.quiesce	= raid5_quiesce,
5689	.takeover	= raid6_takeover,
5690};
5691static struct md_personality raid5_personality =
5692{
5693	.name		= "raid5",
5694	.level		= 5,
5695	.owner		= THIS_MODULE,
5696	.make_request	= make_request,
5697	.run		= run,
5698	.stop		= stop,
5699	.status		= status,
5700	.error_handler	= error,
5701	.hot_add_disk	= raid5_add_disk,
5702	.hot_remove_disk= raid5_remove_disk,
5703	.spare_active	= raid5_spare_active,
5704	.sync_request	= sync_request,
5705	.resize		= raid5_resize,
5706	.size		= raid5_size,
5707	.check_reshape	= raid5_check_reshape,
5708	.start_reshape  = raid5_start_reshape,
5709	.finish_reshape = raid5_finish_reshape,
5710	.quiesce	= raid5_quiesce,
5711	.takeover	= raid5_takeover,
5712};
5713
5714static struct md_personality raid4_personality =
5715{
5716	.name		= "raid4",
5717	.level		= 4,
5718	.owner		= THIS_MODULE,
5719	.make_request	= make_request,
5720	.run		= run,
5721	.stop		= stop,
5722	.status		= status,
5723	.error_handler	= error,
5724	.hot_add_disk	= raid5_add_disk,
5725	.hot_remove_disk= raid5_remove_disk,
5726	.spare_active	= raid5_spare_active,
5727	.sync_request	= sync_request,
5728	.resize		= raid5_resize,
5729	.size		= raid5_size,
5730	.check_reshape	= raid5_check_reshape,
5731	.start_reshape  = raid5_start_reshape,
5732	.finish_reshape = raid5_finish_reshape,
5733	.quiesce	= raid5_quiesce,
5734	.takeover	= raid4_takeover,
5735};
5736
5737static int __init raid5_init(void)
5738{
5739	register_md_personality(&raid6_personality);
5740	register_md_personality(&raid5_personality);
5741	register_md_personality(&raid4_personality);
5742	return 0;
5743}
5744
5745static void raid5_exit(void)
5746{
5747	unregister_md_personality(&raid6_personality);
5748	unregister_md_personality(&raid5_personality);
5749	unregister_md_personality(&raid4_personality);
5750}
5751
5752module_init(raid5_init);
5753module_exit(raid5_exit);
5754MODULE_LICENSE("GPL");
5755MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
5756MODULE_ALIAS("md-personality-4"); /* RAID5 */
5757MODULE_ALIAS("md-raid5");
5758MODULE_ALIAS("md-raid4");
5759MODULE_ALIAS("md-level-5");
5760MODULE_ALIAS("md-level-4");
5761MODULE_ALIAS("md-personality-8"); /* RAID6 */
5762MODULE_ALIAS("md-raid6");
5763MODULE_ALIAS("md-level-6");
5764
5765/* This used to be two separate modules; they were: */
5766MODULE_ALIAS("raid5");
5767MODULE_ALIAS("raid6");
5768