raid5.c revision dd054fce88d33da1aa81d018db75b91b102a6959
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 *	   Copyright (C) 1999, 2000 Ingo Molnar
5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches.  Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
32 *    new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 *   we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment seq_flush, thus closing the current
39 *   batch.
40 * When we notice that seq_flush > seq_write, we write out all pending updates
41 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
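/*
 * Illustrative walk-through of the batching above (example numbers are
 * assumed, not taken from the code): suppose seq_flush == seq_write == 7.
 * A new write arrives, so add_stripe_bio dirties the in-memory bitmap and
 * records sh->bm_seq = 8 (seq_flush + 1).  Because batch 8 has not been
 * written yet, the stripe is parked on the bitmap_list.  An unplug then
 * bumps seq_flush to 8, closing the batch; once we notice seq_flush(8) >
 * seq_write(7) we write out the pending bitmap updates and advance
 * seq_write to 8, after which the parked stripe may safely be written.
 */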
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h>
52#include <linux/seq_file.h>
53#include <linux/cpu.h>
54#include <linux/slab.h>
55#include <linux/ratelimit.h>
56#include "md.h"
57#include "raid5.h"
58#include "raid0.h"
59#include "bitmap.h"
60
61/*
62 * Stripe cache
63 */
64
65#define NR_STRIPES		256
66#define STRIPE_SIZE		PAGE_SIZE
67#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
68#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
69#define	IO_THRESHOLD		1
70#define BYPASS_THRESHOLD	1
71#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
72#define HASH_MASK		(NR_HASH - 1)
73
74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
75{
76	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
77	return &conf->stripe_hashtbl[hash];
78}
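/*
 * For illustration, assuming 4K pages on a 64-bit build: NR_HASH is
 * 4096 / sizeof(struct hlist_head) = 512 buckets and HASH_MASK is 511.
 * 'sect >> STRIPE_SHIFT' converts a 512-byte sector number into a
 * STRIPE_SIZE-sized stripe number, so consecutive stripes fall into
 * consecutive hash buckets.
 */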
79
80/* bio's attached to a stripe+device for I/O are linked together in bi_sector
81 * order without overlap.  There may be several bio's per stripe+device, and
82 * a bio could span several devices.
83 * When walking this list for a particular stripe+device, we must never proceed
84 * beyond a bio that extends past this device, as the next bio might no longer
85 * be valid.
86 * This function is used to determine the 'next' bio in the list, given the sector
87 * of the current stripe+device
88 */
89static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
90{
91	int sectors = bio->bi_size >> 9;
92	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
93		return bio->bi_next;
94	else
95		return NULL;
96}
97
98/*
99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */
102static inline int raid5_bi_phys_segments(struct bio *bio)
103{
104	return bio->bi_phys_segments & 0xffff;
105}
106
107static inline int raid5_bi_hw_segments(struct bio *bio)
108{
109	return (bio->bi_phys_segments >> 16) & 0xffff;
110}
111
112static inline int raid5_dec_bi_phys_segments(struct bio *bio)
113{
114	--bio->bi_phys_segments;
115	return raid5_bi_phys_segments(bio);
116}
117
118static inline int raid5_dec_bi_hw_segments(struct bio *bio)
119{
120	unsigned short val = raid5_bi_hw_segments(bio);
121
122	--val;
123	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
124	return val;
125}
126
127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
128{
129	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
130}
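/*
 * Layout of bi_phys_segments as (ab)used by the helpers above,
 * for illustration:
 *
 *    31             16 15               0
 *   +-----------------+------------------+
 *   |  "hw" segments  | "phys" segments  |
 *   |   (processed)   | (active stripes) |
 *   +-----------------+------------------+
 *
 * e.g. a value of 0x00020003 means three stripes still reference the
 * bio and two have already been processed.
 */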
131
132/* Find first data disk in a raid6 stripe */
133static inline int raid6_d0(struct stripe_head *sh)
134{
135	if (sh->ddf_layout)
136		/* ddf always starts from the first device */
137		return 0;
138	/* md starts just after Q block */
139	if (sh->qd_idx == sh->disks - 1)
140		return 0;
141	else
142		return sh->qd_idx + 1;
143}
144static inline int raid6_next_disk(int disk, int raid_disks)
145{
146	disk++;
147	return (disk < raid_disks) ? disk : 0;
148}
149
150/* When walking through the disks in a raid6, starting at raid6_d0,
151 * we need to map each disk to a 'slot', where the data disks are slot
152 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
153 * is raid_disks-1.  This helper does that mapping.
154 */
155static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
156			     int *count, int syndrome_disks)
157{
158	int slot = *count;
159
160	if (sh->ddf_layout)
161		(*count)++;
162	if (idx == sh->pd_idx)
163		return syndrome_disks;
164	if (idx == sh->qd_idx)
165		return syndrome_disks + 1;
166	if (!sh->ddf_layout)
167		(*count)++;
168	return slot;
169}
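/*
 * Worked example (md layout, 6 devices, pd_idx == 4, qd_idx == 5):
 * raid6_d0() returns 0, and walking i = 0,1,2,3,4,5 yields slots
 * 0,1,2,3 for the data devices, slot 4 (== syndrome_disks) for P and
 * slot 5 (== syndrome_disks + 1) for Q.  (For the ddf layout *count is
 * advanced even at the P and Q positions; see the note above
 * scribble_len() about computing over all devices.)
 */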
170
171static void return_io(struct bio *return_bi)
172{
173	struct bio *bi = return_bi;
174	while (bi) {
175
176		return_bi = bi->bi_next;
177		bi->bi_next = NULL;
178		bi->bi_size = 0;
179		bio_endio(bi, 0);
180		bi = return_bi;
181	}
182}
183
184static void print_raid5_conf (struct r5conf *conf);
185
186static int stripe_operations_active(struct stripe_head *sh)
187{
188	return sh->check_state || sh->reconstruct_state ||
189	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
190	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191}
192
193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
194{
195	if (atomic_dec_and_test(&sh->count)) {
196		BUG_ON(!list_empty(&sh->lru));
197		BUG_ON(atomic_read(&conf->active_stripes)==0);
198		if (test_bit(STRIPE_HANDLE, &sh->state)) {
199			if (test_bit(STRIPE_DELAYED, &sh->state))
200				list_add_tail(&sh->lru, &conf->delayed_list);
201			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
202				   sh->bm_seq - conf->seq_write > 0)
203				list_add_tail(&sh->lru, &conf->bitmap_list);
204			else {
205				clear_bit(STRIPE_BIT_DELAY, &sh->state);
206				list_add_tail(&sh->lru, &conf->handle_list);
207			}
208			md_wakeup_thread(conf->mddev->thread);
209		} else {
210			BUG_ON(stripe_operations_active(sh));
211			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
212				atomic_dec(&conf->preread_active_stripes);
213				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
214					md_wakeup_thread(conf->mddev->thread);
215			}
216			atomic_dec(&conf->active_stripes);
217			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218				list_add_tail(&sh->lru, &conf->inactive_list);
219				wake_up(&conf->wait_for_stripe);
220				if (conf->retry_read_aligned)
221					md_wakeup_thread(conf->mddev->thread);
222			}
223		}
224	}
225}
226
227static void release_stripe(struct stripe_head *sh)
228{
229	struct r5conf *conf = sh->raid_conf;
230	unsigned long flags;
231
232	spin_lock_irqsave(&conf->device_lock, flags);
233	__release_stripe(conf, sh);
234	spin_unlock_irqrestore(&conf->device_lock, flags);
235}
236
237static inline void remove_hash(struct stripe_head *sh)
238{
239	pr_debug("remove_hash(), stripe %llu\n",
240		(unsigned long long)sh->sector);
241
242	hlist_del_init(&sh->hash);
243}
244
245static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
246{
247	struct hlist_head *hp = stripe_hash(conf, sh->sector);
248
249	pr_debug("insert_hash(), stripe %llu\n",
250		(unsigned long long)sh->sector);
251
252	hlist_add_head(&sh->hash, hp);
253}
254
255
256/* find an idle stripe, make sure it is unhashed, and return it. */
257static struct stripe_head *get_free_stripe(struct r5conf *conf)
258{
259	struct stripe_head *sh = NULL;
260	struct list_head *first;
261
262	if (list_empty(&conf->inactive_list))
263		goto out;
264	first = conf->inactive_list.next;
265	sh = list_entry(first, struct stripe_head, lru);
266	list_del_init(first);
267	remove_hash(sh);
268	atomic_inc(&conf->active_stripes);
269out:
270	return sh;
271}
272
273static void shrink_buffers(struct stripe_head *sh)
274{
275	struct page *p;
276	int i;
277	int num = sh->raid_conf->pool_size;
278
279	for (i = 0; i < num ; i++) {
280		p = sh->dev[i].page;
281		if (!p)
282			continue;
283		sh->dev[i].page = NULL;
284		put_page(p);
285	}
286}
287
288static int grow_buffers(struct stripe_head *sh)
289{
290	int i;
291	int num = sh->raid_conf->pool_size;
292
293	for (i = 0; i < num; i++) {
294		struct page *page;
295
296		if (!(page = alloc_page(GFP_KERNEL))) {
297			return 1;
298		}
299		sh->dev[i].page = page;
300	}
301	return 0;
302}
303
304static void raid5_build_block(struct stripe_head *sh, int i, int previous);
305static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
306			    struct stripe_head *sh);
307
308static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
309{
310	struct r5conf *conf = sh->raid_conf;
311	int i;
312
313	BUG_ON(atomic_read(&sh->count) != 0);
314	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
315	BUG_ON(stripe_operations_active(sh));
316
317	pr_debug("init_stripe called, stripe %llu\n",
318		(unsigned long long)sh->sector);
319
320	remove_hash(sh);
321
322	sh->generation = conf->generation - previous;
323	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
324	sh->sector = sector;
325	stripe_set_idx(sector, conf, previous, sh);
326	sh->state = 0;
327
328
329	for (i = sh->disks; i--; ) {
330		struct r5dev *dev = &sh->dev[i];
331
332		if (dev->toread || dev->read || dev->towrite || dev->written ||
333		    test_bit(R5_LOCKED, &dev->flags)) {
334			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
335			       (unsigned long long)sh->sector, i, dev->toread,
336			       dev->read, dev->towrite, dev->written,
337			       test_bit(R5_LOCKED, &dev->flags));
338			WARN_ON(1);
339		}
340		dev->flags = 0;
341		raid5_build_block(sh, i, previous);
342	}
343	insert_hash(conf, sh);
344}
345
346static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
347					 short generation)
348{
349	struct stripe_head *sh;
350	struct hlist_node *hn;
351
352	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
353	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
354		if (sh->sector == sector && sh->generation == generation)
355			return sh;
356	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
357	return NULL;
358}
359
360/*
361 * Need to check if array has failed when deciding whether to:
362 *  - start an array
363 *  - remove non-faulty devices
364 *  - add a spare
365 *  - allow a reshape
366 * This determination is simple when no reshape is happening.
367 * However if there is a reshape, we need to carefully check
368 * both the before and after sections.
369 * This is because some failed devices may only affect one
370 * of the two sections, and some non-in_sync devices may
371 * be in_sync in the section most affected by failed devices.
372 */
373static int calc_degraded(struct r5conf *conf)
374{
375	int degraded, degraded2;
376	int i;
377
378	rcu_read_lock();
379	degraded = 0;
380	for (i = 0; i < conf->previous_raid_disks; i++) {
381		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
382		if (!rdev || test_bit(Faulty, &rdev->flags))
383			degraded++;
384		else if (test_bit(In_sync, &rdev->flags))
385			;
386		else
387			/* not in-sync or faulty.
388			 * If the reshape increases the number of devices,
389			 * this is being recovered by the reshape, so
390			 * this 'previous' section is not in_sync.
391			 * If the number of devices is being reduced however,
392			 * the device can only be part of the array if
393			 * we are reverting a reshape, so this section will
394			 * be in-sync.
395			 */
396			if (conf->raid_disks >= conf->previous_raid_disks)
397				degraded++;
398	}
399	rcu_read_unlock();
400	if (conf->raid_disks == conf->previous_raid_disks)
401		return degraded;
402	rcu_read_lock();
403	degraded2 = 0;
404	for (i = 0; i < conf->raid_disks; i++) {
405		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
406		if (!rdev || test_bit(Faulty, &rdev->flags))
407			degraded2++;
408		else if (test_bit(In_sync, &rdev->flags))
409			;
410		else
411			/* not in-sync or faulty.
412			 * If reshape increases the number of devices, this
413			 * section has already been recovered, else it
414			 * almost certainly hasn't.
415			 */
416			if (conf->raid_disks <= conf->previous_raid_disks)
417				degraded2++;
418	}
419	rcu_read_unlock();
420	if (degraded2 > degraded)
421		return degraded2;
422	return degraded;
423}
424
425static int has_failed(struct r5conf *conf)
426{
427	int degraded;
428
429	if (conf->mddev->reshape_position == MaxSector)
430		return conf->mddev->degraded > conf->max_degraded;
431
432	degraded = calc_degraded(conf);
433	if (degraded > conf->max_degraded)
434		return 1;
435	return 0;
436}
437
438static struct stripe_head *
439get_active_stripe(struct r5conf *conf, sector_t sector,
440		  int previous, int noblock, int noquiesce)
441{
442	struct stripe_head *sh;
443
444	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
445
446	spin_lock_irq(&conf->device_lock);
447
448	do {
449		wait_event_lock_irq(conf->wait_for_stripe,
450				    conf->quiesce == 0 || noquiesce,
451				    conf->device_lock, /* nothing */);
452		sh = __find_stripe(conf, sector, conf->generation - previous);
453		if (!sh) {
454			if (!conf->inactive_blocked)
455				sh = get_free_stripe(conf);
456			if (noblock && sh == NULL)
457				break;
458			if (!sh) {
459				conf->inactive_blocked = 1;
460				wait_event_lock_irq(conf->wait_for_stripe,
461						    !list_empty(&conf->inactive_list) &&
462						    (atomic_read(&conf->active_stripes)
463						     < (conf->max_nr_stripes *3/4)
464						     || !conf->inactive_blocked),
465						    conf->device_lock,
466						    );
467				conf->inactive_blocked = 0;
468			} else
469				init_stripe(sh, sector, previous);
470		} else {
471			if (atomic_read(&sh->count)) {
472				BUG_ON(!list_empty(&sh->lru)
473				    && !test_bit(STRIPE_EXPANDING, &sh->state));
474			} else {
475				if (!test_bit(STRIPE_HANDLE, &sh->state))
476					atomic_inc(&conf->active_stripes);
477				if (list_empty(&sh->lru) &&
478				    !test_bit(STRIPE_EXPANDING, &sh->state))
479					BUG();
480				list_del_init(&sh->lru);
481			}
482		}
483	} while (sh == NULL);
484
485	if (sh)
486		atomic_inc(&sh->count);
487
488	spin_unlock_irq(&conf->device_lock);
489	return sh;
490}
491
492static void
493raid5_end_read_request(struct bio *bi, int error);
494static void
495raid5_end_write_request(struct bio *bi, int error);
496
497static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
498{
499	struct r5conf *conf = sh->raid_conf;
500	int i, disks = sh->disks;
501
502	might_sleep();
503
504	for (i = disks; i--; ) {
505		int rw;
506		int replace_only = 0;
507		struct bio *bi, *rbi;
508		struct md_rdev *rdev, *rrdev = NULL;
509		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511				rw = WRITE_FUA;
512			else
513				rw = WRITE;
514		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
515			rw = READ;
516		else if (test_and_clear_bit(R5_WantReplace,
517					    &sh->dev[i].flags)) {
518			rw = WRITE;
519			replace_only = 1;
520		} else
521			continue;
522
523		bi = &sh->dev[i].req;
524		rbi = &sh->dev[i].rreq; /* For writing to replacement */
525
526		bi->bi_rw = rw;
527		rbi->bi_rw = rw;
528		if (rw & WRITE) {
529			bi->bi_end_io = raid5_end_write_request;
530			rbi->bi_end_io = raid5_end_write_request;
531		} else
532			bi->bi_end_io = raid5_end_read_request;
533
534		rcu_read_lock();
535		rrdev = rcu_dereference(conf->disks[i].replacement);
536		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
537		rdev = rcu_dereference(conf->disks[i].rdev);
538		if (!rdev) {
539			rdev = rrdev;
540			rrdev = NULL;
541		}
542		if (rw & WRITE) {
543			if (replace_only)
544				rdev = NULL;
545			if (rdev == rrdev)
546				/* We raced and saw duplicates */
547				rrdev = NULL;
548		} else {
549			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
550				rdev = rrdev;
551			rrdev = NULL;
552		}
553
554		if (rdev && test_bit(Faulty, &rdev->flags))
555			rdev = NULL;
556		if (rdev)
557			atomic_inc(&rdev->nr_pending);
558		if (rrdev && test_bit(Faulty, &rrdev->flags))
559			rrdev = NULL;
560		if (rrdev)
561			atomic_inc(&rrdev->nr_pending);
562		rcu_read_unlock();
563
564		/* We have already checked bad blocks for reads.  Now we
565		 * need to check for writes.  We never accept write errors
566		 * on the replacement, so we don't need to check rrdev.
567		 */
568		while ((rw & WRITE) && rdev &&
569		       test_bit(WriteErrorSeen, &rdev->flags)) {
570			sector_t first_bad;
571			int bad_sectors;
572			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
573					      &first_bad, &bad_sectors);
574			if (!bad)
575				break;
576
577			if (bad < 0) {
578				set_bit(BlockedBadBlocks, &rdev->flags);
579				if (!conf->mddev->external &&
580				    conf->mddev->flags) {
581					/* It is very unlikely, but we might
582					 * still need to write out the
583					 * bad block log - better give it
584					 * a chance */
585					md_check_recovery(conf->mddev);
586				}
587				md_wait_for_blocked_rdev(rdev, conf->mddev);
588			} else {
589				/* Acknowledged bad block - skip the write */
590				rdev_dec_pending(rdev, conf->mddev);
591				rdev = NULL;
592			}
593		}
594
595		if (rdev) {
596			if (s->syncing || s->expanding || s->expanded
597			    || s->replacing)
598				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
599
600			set_bit(STRIPE_IO_STARTED, &sh->state);
601
602			bi->bi_bdev = rdev->bdev;
603			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
604				__func__, (unsigned long long)sh->sector,
605				bi->bi_rw, i);
606			atomic_inc(&sh->count);
607			bi->bi_sector = sh->sector + rdev->data_offset;
608			bi->bi_flags = 1 << BIO_UPTODATE;
609			bi->bi_idx = 0;
610			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
611			bi->bi_io_vec[0].bv_offset = 0;
612			bi->bi_size = STRIPE_SIZE;
613			bi->bi_next = NULL;
614			if (rrdev)
615				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
616			generic_make_request(bi);
617		}
618		if (rrdev) {
619			if (s->syncing || s->expanding || s->expanded
620			    || s->replacing)
621				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
622
623			set_bit(STRIPE_IO_STARTED, &sh->state);
624
625			rbi->bi_bdev = rrdev->bdev;
626			pr_debug("%s: for %llu schedule op %ld on "
627				 "replacement disc %d\n",
628				__func__, (unsigned long long)sh->sector,
629				rbi->bi_rw, i);
630			atomic_inc(&sh->count);
631			rbi->bi_sector = sh->sector + rrdev->data_offset;
632			rbi->bi_flags = 1 << BIO_UPTODATE;
633			rbi->bi_idx = 0;
634			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
635			rbi->bi_io_vec[0].bv_offset = 0;
636			rbi->bi_size = STRIPE_SIZE;
637			rbi->bi_next = NULL;
638			generic_make_request(rbi);
639		}
640		if (!rdev && !rrdev) {
641			if (rw & WRITE)
642				set_bit(STRIPE_DEGRADED, &sh->state);
643			pr_debug("skip op %ld on disc %d for sector %llu\n",
644				bi->bi_rw, i, (unsigned long long)sh->sector);
645			clear_bit(R5_LOCKED, &sh->dev[i].flags);
646			set_bit(STRIPE_HANDLE, &sh->state);
647		}
648	}
649}
650
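/*
 * async_copy_data() below copies data between 'bio' and the given stripe
 * cache page: into the page when 'frombio' is set, out of it otherwise.
 * Each bio_vec is clipped to the STRIPE_SIZE window starting at 'sector',
 * and the copies are chained onto 'tx' so they run in order behind any
 * prior asynchronous operations.
 */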
651static struct dma_async_tx_descriptor *
652async_copy_data(int frombio, struct bio *bio, struct page *page,
653	sector_t sector, struct dma_async_tx_descriptor *tx)
654{
655	struct bio_vec *bvl;
656	struct page *bio_page;
657	int i;
658	int page_offset;
659	struct async_submit_ctl submit;
660	enum async_tx_flags flags = 0;
661
662	if (bio->bi_sector >= sector)
663		page_offset = (signed)(bio->bi_sector - sector) * 512;
664	else
665		page_offset = (signed)(sector - bio->bi_sector) * -512;
666
667	if (frombio)
668		flags |= ASYNC_TX_FENCE;
669	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
670
671	bio_for_each_segment(bvl, bio, i) {
672		int len = bvl->bv_len;
673		int clen;
674		int b_offset = 0;
675
676		if (page_offset < 0) {
677			b_offset = -page_offset;
678			page_offset += b_offset;
679			len -= b_offset;
680		}
681
682		if (len > 0 && page_offset + len > STRIPE_SIZE)
683			clen = STRIPE_SIZE - page_offset;
684		else
685			clen = len;
686
687		if (clen > 0) {
688			b_offset += bvl->bv_offset;
689			bio_page = bvl->bv_page;
690			if (frombio)
691				tx = async_memcpy(page, bio_page, page_offset,
692						  b_offset, clen, &submit);
693			else
694				tx = async_memcpy(bio_page, page, b_offset,
695						  page_offset, clen, &submit);
696		}
697		/* chain the operations */
698		submit.depend_tx = tx;
699
700		if (clen < len) /* hit end of page */
701			break;
702		page_offset +=  len;
703	}
704
705	return tx;
706}
707
708static void ops_complete_biofill(void *stripe_head_ref)
709{
710	struct stripe_head *sh = stripe_head_ref;
711	struct bio *return_bi = NULL;
712	struct r5conf *conf = sh->raid_conf;
713	int i;
714
715	pr_debug("%s: stripe %llu\n", __func__,
716		(unsigned long long)sh->sector);
717
718	/* clear completed biofills */
719	spin_lock_irq(&conf->device_lock);
720	for (i = sh->disks; i--; ) {
721		struct r5dev *dev = &sh->dev[i];
722
723		/* acknowledge completion of a biofill operation */
724		/* and check if we need to reply to a read request,
725		 * new R5_Wantfill requests are held off until
726		 * !STRIPE_BIOFILL_RUN
727		 */
728		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
729			struct bio *rbi, *rbi2;
730
731			BUG_ON(!dev->read);
732			rbi = dev->read;
733			dev->read = NULL;
734			while (rbi && rbi->bi_sector <
735				dev->sector + STRIPE_SECTORS) {
736				rbi2 = r5_next_bio(rbi, dev->sector);
737				if (!raid5_dec_bi_phys_segments(rbi)) {
738					rbi->bi_next = return_bi;
739					return_bi = rbi;
740				}
741				rbi = rbi2;
742			}
743		}
744	}
745	spin_unlock_irq(&conf->device_lock);
746	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
747
748	return_io(return_bi);
749
750	set_bit(STRIPE_HANDLE, &sh->state);
751	release_stripe(sh);
752}
753
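/*
 * ops_run_biofill() services queued read requests: for every device
 * flagged R5_Wantfill it moves dev->toread to dev->read and schedules
 * asynchronous copies from the stripe cache pages into the waiting
 * bios, completing in ops_complete_biofill() above.
 */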
754static void ops_run_biofill(struct stripe_head *sh)
755{
756	struct dma_async_tx_descriptor *tx = NULL;
757	struct r5conf *conf = sh->raid_conf;
758	struct async_submit_ctl submit;
759	int i;
760
761	pr_debug("%s: stripe %llu\n", __func__,
762		(unsigned long long)sh->sector);
763
764	for (i = sh->disks; i--; ) {
765		struct r5dev *dev = &sh->dev[i];
766		if (test_bit(R5_Wantfill, &dev->flags)) {
767			struct bio *rbi;
768			spin_lock_irq(&conf->device_lock);
769			dev->read = rbi = dev->toread;
770			dev->toread = NULL;
771			spin_unlock_irq(&conf->device_lock);
772			while (rbi && rbi->bi_sector <
773				dev->sector + STRIPE_SECTORS) {
774				tx = async_copy_data(0, rbi, dev->page,
775					dev->sector, tx);
776				rbi = r5_next_bio(rbi, dev->sector);
777			}
778		}
779	}
780
781	atomic_inc(&sh->count);
782	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
783	async_trigger_callback(&submit);
784}
785
786static void mark_target_uptodate(struct stripe_head *sh, int target)
787{
788	struct r5dev *tgt;
789
790	if (target < 0)
791		return;
792
793	tgt = &sh->dev[target];
794	set_bit(R5_UPTODATE, &tgt->flags);
795	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
796	clear_bit(R5_Wantcompute, &tgt->flags);
797}
798
799static void ops_complete_compute(void *stripe_head_ref)
800{
801	struct stripe_head *sh = stripe_head_ref;
802
803	pr_debug("%s: stripe %llu\n", __func__,
804		(unsigned long long)sh->sector);
805
806	/* mark the computed target(s) as uptodate */
807	mark_target_uptodate(sh, sh->ops.target);
808	mark_target_uptodate(sh, sh->ops.target2);
809
810	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
811	if (sh->check_state == check_state_compute_run)
812		sh->check_state = check_state_compute_result;
813	set_bit(STRIPE_HANDLE, &sh->state);
814	release_stripe(sh);
815}
816
817/* return a pointer to the address conversion region of the scribble buffer */
818static addr_conv_t *to_addr_conv(struct stripe_head *sh,
819				 struct raid5_percpu *percpu)
820{
821	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
822}
823
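/*
 * ops_run_compute5() rebuilds the single block named by sh->ops.target
 * for raid4/5: it xors every other device's page into the target page
 * (or falls back to a plain copy when there is only one source) and
 * signals completion through ops_complete_compute().
 */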
824static struct dma_async_tx_descriptor *
825ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
826{
827	int disks = sh->disks;
828	struct page **xor_srcs = percpu->scribble;
829	int target = sh->ops.target;
830	struct r5dev *tgt = &sh->dev[target];
831	struct page *xor_dest = tgt->page;
832	int count = 0;
833	struct dma_async_tx_descriptor *tx;
834	struct async_submit_ctl submit;
835	int i;
836
837	pr_debug("%s: stripe %llu block: %d\n",
838		__func__, (unsigned long long)sh->sector, target);
839	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
840
841	for (i = disks; i--; )
842		if (i != target)
843			xor_srcs[count++] = sh->dev[i].page;
844
845	atomic_inc(&sh->count);
846
847	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
848			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
849	if (unlikely(count == 1))
850		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
851	else
852		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
853
854	return tx;
855}
856
857/* set_syndrome_sources - populate source buffers for gen_syndrome
858 * @srcs - (struct page *) array of size sh->disks
859 * @sh - stripe_head to parse
860 *
861 * Populates srcs in proper layout order for the stripe and returns the
862 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
863 * destination buffer is recorded in srcs[count] and the Q destination
864 * is recorded in srcs[count+1].
865 */
866static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
867{
868	int disks = sh->disks;
869	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
870	int d0_idx = raid6_d0(sh);
871	int count;
872	int i;
873
874	for (i = 0; i < disks; i++)
875		srcs[i] = NULL;
876
877	count = 0;
878	i = d0_idx;
879	do {
880		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
881
882		srcs[slot] = sh->dev[i].page;
883		i = raid6_next_disk(i, disks);
884	} while (i != d0_idx);
885
886	return syndrome_disks;
887}
888
889static struct dma_async_tx_descriptor *
890ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
891{
892	int disks = sh->disks;
893	struct page **blocks = percpu->scribble;
894	int target;
895	int qd_idx = sh->qd_idx;
896	struct dma_async_tx_descriptor *tx;
897	struct async_submit_ctl submit;
898	struct r5dev *tgt;
899	struct page *dest;
900	int i;
901	int count;
902
903	if (sh->ops.target < 0)
904		target = sh->ops.target2;
905	else if (sh->ops.target2 < 0)
906		target = sh->ops.target;
907	else
908		/* we should only have one valid target */
909		BUG();
910	BUG_ON(target < 0);
911	pr_debug("%s: stripe %llu block: %d\n",
912		__func__, (unsigned long long)sh->sector, target);
913
914	tgt = &sh->dev[target];
915	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
916	dest = tgt->page;
917
918	atomic_inc(&sh->count);
919
920	if (target == qd_idx) {
921		count = set_syndrome_sources(blocks, sh);
922		blocks[count] = NULL; /* regenerating p is not necessary */
923		BUG_ON(blocks[count+1] != dest); /* q should already be set */
924		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
925				  ops_complete_compute, sh,
926				  to_addr_conv(sh, percpu));
927		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
928	} else {
929		/* Compute any data- or p-drive using XOR */
930		count = 0;
931		for (i = disks; i-- ; ) {
932			if (i == target || i == qd_idx)
933				continue;
934			blocks[count++] = sh->dev[i].page;
935		}
936
937		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
938				  NULL, ops_complete_compute, sh,
939				  to_addr_conv(sh, percpu));
940		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
941	}
942
943	return tx;
944}
945
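/*
 * ops_run_compute6_2() handles the raid6 double-failure cases: missing
 * P+Q is regenerated with a plain syndrome computation, missing D+Q
 * recomputes the data block by xor and then regenerates Q, while D+P
 * and D+D use the async_raid6 recovery helpers.  The slot numbers of
 * the failed devices are derived by open-coding set_syndrome_sources().
 */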
946static struct dma_async_tx_descriptor *
947ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
948{
949	int i, count, disks = sh->disks;
950	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
951	int d0_idx = raid6_d0(sh);
952	int faila = -1, failb = -1;
953	int target = sh->ops.target;
954	int target2 = sh->ops.target2;
955	struct r5dev *tgt = &sh->dev[target];
956	struct r5dev *tgt2 = &sh->dev[target2];
957	struct dma_async_tx_descriptor *tx;
958	struct page **blocks = percpu->scribble;
959	struct async_submit_ctl submit;
960
961	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
962		 __func__, (unsigned long long)sh->sector, target, target2);
963	BUG_ON(target < 0 || target2 < 0);
964	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
965	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
966
967	/* we need to open-code set_syndrome_sources to handle the
968	 * slot number conversion for 'faila' and 'failb'
969	 */
970	for (i = 0; i < disks ; i++)
971		blocks[i] = NULL;
972	count = 0;
973	i = d0_idx;
974	do {
975		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
976
977		blocks[slot] = sh->dev[i].page;
978
979		if (i == target)
980			faila = slot;
981		if (i == target2)
982			failb = slot;
983		i = raid6_next_disk(i, disks);
984	} while (i != d0_idx);
985
986	BUG_ON(faila == failb);
987	if (failb < faila)
988		swap(faila, failb);
989	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
990		 __func__, (unsigned long long)sh->sector, faila, failb);
991
992	atomic_inc(&sh->count);
993
994	if (failb == syndrome_disks+1) {
995		/* Q disk is one of the missing disks */
996		if (faila == syndrome_disks) {
997			/* Missing P+Q, just recompute */
998			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
999					  ops_complete_compute, sh,
1000					  to_addr_conv(sh, percpu));
1001			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1002						  STRIPE_SIZE, &submit);
1003		} else {
1004			struct page *dest;
1005			int data_target;
1006			int qd_idx = sh->qd_idx;
1007
1008			/* Missing D+Q: recompute D from P, then recompute Q */
1009			if (target == qd_idx)
1010				data_target = target2;
1011			else
1012				data_target = target;
1013
1014			count = 0;
1015			for (i = disks; i-- ; ) {
1016				if (i == data_target || i == qd_idx)
1017					continue;
1018				blocks[count++] = sh->dev[i].page;
1019			}
1020			dest = sh->dev[data_target].page;
1021			init_async_submit(&submit,
1022					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1023					  NULL, NULL, NULL,
1024					  to_addr_conv(sh, percpu));
1025			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1026				       &submit);
1027
1028			count = set_syndrome_sources(blocks, sh);
1029			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1030					  ops_complete_compute, sh,
1031					  to_addr_conv(sh, percpu));
1032			return async_gen_syndrome(blocks, 0, count+2,
1033						  STRIPE_SIZE, &submit);
1034		}
1035	} else {
1036		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1037				  ops_complete_compute, sh,
1038				  to_addr_conv(sh, percpu));
1039		if (failb == syndrome_disks) {
1040			/* We're missing D+P. */
1041			return async_raid6_datap_recov(syndrome_disks+2,
1042						       STRIPE_SIZE, faila,
1043						       blocks, &submit);
1044		} else {
1045			/* We're missing D+D. */
1046			return async_raid6_2data_recov(syndrome_disks+2,
1047						       STRIPE_SIZE, faila, failb,
1048						       blocks, &submit);
1049		}
1050	}
1051}
1052
1053
1054static void ops_complete_prexor(void *stripe_head_ref)
1055{
1056	struct stripe_head *sh = stripe_head_ref;
1057
1058	pr_debug("%s: stripe %llu\n", __func__,
1059		(unsigned long long)sh->sector);
1060}
1061
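/*
 * ops_run_prexor() starts a read-modify-write by xor-ing the old
 * contents of the blocks about to be overwritten (those flagged
 * R5_Wantdrain) back into the existing parity block, effectively
 * subtracting them so that the new data can be xor-ed in later by
 * ops_run_reconstruct5().
 */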
1062static struct dma_async_tx_descriptor *
1063ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1064	       struct dma_async_tx_descriptor *tx)
1065{
1066	int disks = sh->disks;
1067	struct page **xor_srcs = percpu->scribble;
1068	int count = 0, pd_idx = sh->pd_idx, i;
1069	struct async_submit_ctl submit;
1070
1071	/* existing parity data subtracted */
1072	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1073
1074	pr_debug("%s: stripe %llu\n", __func__,
1075		(unsigned long long)sh->sector);
1076
1077	for (i = disks; i--; ) {
1078		struct r5dev *dev = &sh->dev[i];
1079		/* Only process blocks that are known to be uptodate */
1080		if (test_bit(R5_Wantdrain, &dev->flags))
1081			xor_srcs[count++] = dev->page;
1082	}
1083
1084	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1085			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1086	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1087
1088	return tx;
1089}
1090
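/*
 * ops_run_biodrain() moves queued write bios into the stripe cache:
 * for each device flagged R5_Wantdrain it claims dev->towrite as
 * dev->written and schedules asynchronous copies of the bio data into
 * the corresponding cache page, propagating REQ_FUA to R5_WantFUA.
 */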
1091static struct dma_async_tx_descriptor *
1092ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1093{
1094	int disks = sh->disks;
1095	int i;
1096
1097	pr_debug("%s: stripe %llu\n", __func__,
1098		(unsigned long long)sh->sector);
1099
1100	for (i = disks; i--; ) {
1101		struct r5dev *dev = &sh->dev[i];
1102		struct bio *chosen;
1103
1104		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1105			struct bio *wbi;
1106
1107			spin_lock_irq(&sh->raid_conf->device_lock);
1108			chosen = dev->towrite;
1109			dev->towrite = NULL;
1110			BUG_ON(dev->written);
1111			wbi = dev->written = chosen;
1112			spin_unlock_irq(&sh->raid_conf->device_lock);
1113
1114			while (wbi && wbi->bi_sector <
1115				dev->sector + STRIPE_SECTORS) {
1116				if (wbi->bi_rw & REQ_FUA)
1117					set_bit(R5_WantFUA, &dev->flags);
1118				tx = async_copy_data(1, wbi, dev->page,
1119					dev->sector, tx);
1120				wbi = r5_next_bio(wbi, dev->sector);
1121			}
1122		}
1123	}
1124
1125	return tx;
1126}
1127
1128static void ops_complete_reconstruct(void *stripe_head_ref)
1129{
1130	struct stripe_head *sh = stripe_head_ref;
1131	int disks = sh->disks;
1132	int pd_idx = sh->pd_idx;
1133	int qd_idx = sh->qd_idx;
1134	int i;
1135	bool fua = false;
1136
1137	pr_debug("%s: stripe %llu\n", __func__,
1138		(unsigned long long)sh->sector);
1139
1140	for (i = disks; i--; )
1141		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1142
1143	for (i = disks; i--; ) {
1144		struct r5dev *dev = &sh->dev[i];
1145
1146		if (dev->written || i == pd_idx || i == qd_idx) {
1147			set_bit(R5_UPTODATE, &dev->flags);
1148			if (fua)
1149				set_bit(R5_WantFUA, &dev->flags);
1150		}
1151	}
1152
1153	if (sh->reconstruct_state == reconstruct_state_drain_run)
1154		sh->reconstruct_state = reconstruct_state_drain_result;
1155	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1156		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1157	else {
1158		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1159		sh->reconstruct_state = reconstruct_state_result;
1160	}
1161
1162	set_bit(STRIPE_HANDLE, &sh->state);
1163	release_stripe(sh);
1164}
1165
1166static void
1167ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1168		     struct dma_async_tx_descriptor *tx)
1169{
1170	int disks = sh->disks;
1171	struct page **xor_srcs = percpu->scribble;
1172	struct async_submit_ctl submit;
1173	int count = 0, pd_idx = sh->pd_idx, i;
1174	struct page *xor_dest;
1175	int prexor = 0;
1176	unsigned long flags;
1177
1178	pr_debug("%s: stripe %llu\n", __func__,
1179		(unsigned long long)sh->sector);
1180
1181	/* check if prexor is active, which means we only process blocks
1182	 * that are part of a read-modify-write (i.e. that have 'written' set)
1183	 */
1184	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1185		prexor = 1;
1186		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1187		for (i = disks; i--; ) {
1188			struct r5dev *dev = &sh->dev[i];
1189			if (dev->written)
1190				xor_srcs[count++] = dev->page;
1191		}
1192	} else {
1193		xor_dest = sh->dev[pd_idx].page;
1194		for (i = disks; i--; ) {
1195			struct r5dev *dev = &sh->dev[i];
1196			if (i != pd_idx)
1197				xor_srcs[count++] = dev->page;
1198		}
1199	}
1200
1201	/* 1/ if we prexor'd then the dest is reused as a source
1202	 * 2/ if we did not prexor then we are redoing the parity
1203	 * so set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST,
1204	 * respectively, for the synchronous xor case
1205	 */
1206	flags = ASYNC_TX_ACK |
1207		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1208
1209	atomic_inc(&sh->count);
1210
1211	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1212			  to_addr_conv(sh, percpu));
1213	if (unlikely(count == 1))
1214		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1215	else
1216		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1217}
1218
1219static void
1220ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1221		     struct dma_async_tx_descriptor *tx)
1222{
1223	struct async_submit_ctl submit;
1224	struct page **blocks = percpu->scribble;
1225	int count;
1226
1227	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1228
1229	count = set_syndrome_sources(blocks, sh);
1230
1231	atomic_inc(&sh->count);
1232
1233	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1234			  sh, to_addr_conv(sh, percpu));
1235	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1236}
1237
1238static void ops_complete_check(void *stripe_head_ref)
1239{
1240	struct stripe_head *sh = stripe_head_ref;
1241
1242	pr_debug("%s: stripe %llu\n", __func__,
1243		(unsigned long long)sh->sector);
1244
1245	sh->check_state = check_state_check_result;
1246	set_bit(STRIPE_HANDLE, &sh->state);
1247	release_stripe(sh);
1248}
1249
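/*
 * ops_run_check_p() verifies the P block: it xor-sums the data blocks
 * together with the current parity page, records whether the result is
 * zero in sh->ops.zero_sum_result, and reports back through
 * ops_complete_check().
 */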
1250static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1251{
1252	int disks = sh->disks;
1253	int pd_idx = sh->pd_idx;
1254	int qd_idx = sh->qd_idx;
1255	struct page *xor_dest;
1256	struct page **xor_srcs = percpu->scribble;
1257	struct dma_async_tx_descriptor *tx;
1258	struct async_submit_ctl submit;
1259	int count;
1260	int i;
1261
1262	pr_debug("%s: stripe %llu\n", __func__,
1263		(unsigned long long)sh->sector);
1264
1265	count = 0;
1266	xor_dest = sh->dev[pd_idx].page;
1267	xor_srcs[count++] = xor_dest;
1268	for (i = disks; i--; ) {
1269		if (i == pd_idx || i == qd_idx)
1270			continue;
1271		xor_srcs[count++] = sh->dev[i].page;
1272	}
1273
1274	init_async_submit(&submit, 0, NULL, NULL, NULL,
1275			  to_addr_conv(sh, percpu));
1276	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1277			   &sh->ops.zero_sum_result, &submit);
1278
1279	atomic_inc(&sh->count);
1280	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1281	tx = async_trigger_callback(&submit);
1282}
1283
1284static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1285{
1286	struct page **srcs = percpu->scribble;
1287	struct async_submit_ctl submit;
1288	int count;
1289
1290	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1291		(unsigned long long)sh->sector, checkp);
1292
1293	count = set_syndrome_sources(srcs, sh);
1294	if (!checkp)
1295		srcs[count] = NULL;
1296
1297	atomic_inc(&sh->count);
1298	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1299			  sh, to_addr_conv(sh, percpu));
1300	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1301			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1302}
1303
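/*
 * __raid_run_ops() dispatches whichever stripe operations are flagged
 * in 'ops_request' - biofill, compute, prexor, biodrain, reconstruct
 * and check - on this cpu's scribble buffer, chaining them into a
 * single async_tx dependency chain, and finally wakes any waiters
 * whose bios were blocked on R5_Overlap.
 */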
1304static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1305{
1306	int overlap_clear = 0, i, disks = sh->disks;
1307	struct dma_async_tx_descriptor *tx = NULL;
1308	struct r5conf *conf = sh->raid_conf;
1309	int level = conf->level;
1310	struct raid5_percpu *percpu;
1311	unsigned long cpu;
1312
1313	cpu = get_cpu();
1314	percpu = per_cpu_ptr(conf->percpu, cpu);
1315	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1316		ops_run_biofill(sh);
1317		overlap_clear++;
1318	}
1319
1320	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1321		if (level < 6)
1322			tx = ops_run_compute5(sh, percpu);
1323		else {
1324			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1325				tx = ops_run_compute6_1(sh, percpu);
1326			else
1327				tx = ops_run_compute6_2(sh, percpu);
1328		}
1329		/* terminate the chain if reconstruct is not set to be run */
1330		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1331			async_tx_ack(tx);
1332	}
1333
1334	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1335		tx = ops_run_prexor(sh, percpu, tx);
1336
1337	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1338		tx = ops_run_biodrain(sh, tx);
1339		overlap_clear++;
1340	}
1341
1342	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1343		if (level < 6)
1344			ops_run_reconstruct5(sh, percpu, tx);
1345		else
1346			ops_run_reconstruct6(sh, percpu, tx);
1347	}
1348
1349	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1350		if (sh->check_state == check_state_run)
1351			ops_run_check_p(sh, percpu);
1352		else if (sh->check_state == check_state_run_q)
1353			ops_run_check_pq(sh, percpu, 0);
1354		else if (sh->check_state == check_state_run_pq)
1355			ops_run_check_pq(sh, percpu, 1);
1356		else
1357			BUG();
1358	}
1359
1360	if (overlap_clear)
1361		for (i = disks; i--; ) {
1362			struct r5dev *dev = &sh->dev[i];
1363			if (test_and_clear_bit(R5_Overlap, &dev->flags))
1364				wake_up(&sh->raid_conf->wait_for_overlap);
1365		}
1366	put_cpu();
1367}
1368
1369#ifdef CONFIG_MULTICORE_RAID456
1370static void async_run_ops(void *param, async_cookie_t cookie)
1371{
1372	struct stripe_head *sh = param;
1373	unsigned long ops_request = sh->ops.request;
1374
1375	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1376	wake_up(&sh->ops.wait_for_ops);
1377
1378	__raid_run_ops(sh, ops_request);
1379	release_stripe(sh);
1380}
1381
1382static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1383{
1384	/* since handle_stripe can be called outside of raid5d context
1385	 * we need to ensure sh->ops.request is de-staged before another
1386	 * request arrives
1387	 */
1388	wait_event(sh->ops.wait_for_ops,
1389		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1390	sh->ops.request = ops_request;
1391
1392	atomic_inc(&sh->count);
1393	async_schedule(async_run_ops, sh);
1394}
1395#else
1396#define raid_run_ops __raid_run_ops
1397#endif
1398
1399static int grow_one_stripe(struct r5conf *conf)
1400{
1401	struct stripe_head *sh;
1402	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1403	if (!sh)
1404		return 0;
1405
1406	sh->raid_conf = conf;
1407	#ifdef CONFIG_MULTICORE_RAID456
1408	init_waitqueue_head(&sh->ops.wait_for_ops);
1409	#endif
1410
1411	if (grow_buffers(sh)) {
1412		shrink_buffers(sh);
1413		kmem_cache_free(conf->slab_cache, sh);
1414		return 0;
1415	}
1416	/* we just created an active stripe so... */
1417	atomic_set(&sh->count, 1);
1418	atomic_inc(&conf->active_stripes);
1419	INIT_LIST_HEAD(&sh->lru);
1420	release_stripe(sh);
1421	return 1;
1422}
1423
1424static int grow_stripes(struct r5conf *conf, int num)
1425{
1426	struct kmem_cache *sc;
1427	int devs = max(conf->raid_disks, conf->previous_raid_disks);
1428
1429	if (conf->mddev->gendisk)
1430		sprintf(conf->cache_name[0],
1431			"raid%d-%s", conf->level, mdname(conf->mddev));
1432	else
1433		sprintf(conf->cache_name[0],
1434			"raid%d-%p", conf->level, conf->mddev);
1435	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1436
1437	conf->active_name = 0;
1438	sc = kmem_cache_create(conf->cache_name[conf->active_name],
1439			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1440			       0, 0, NULL);
1441	if (!sc)
1442		return 1;
1443	conf->slab_cache = sc;
1444	conf->pool_size = devs;
1445	while (num--)
1446		if (!grow_one_stripe(conf))
1447			return 1;
1448	return 0;
1449}
1450
1451/**
1452 * scribble_len - return the required size of the scribble region
1453 * @num - total number of disks in the array
1454 *
1455 * The size must be enough to contain:
1456 * 1/ a struct page pointer for each device in the array +2
1457 * 2/ room to convert each entry in (1) to its corresponding dma
1458 *    (dma_map_page()) or page (page_address()) address.
1459 *
1460 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1461 * calculate over all devices (not just the data blocks), using zeros in place
1462 * of the P and Q blocks.
1463 */
1464static size_t scribble_len(int num)
1465{
1466	size_t len;
1467
1468	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1469
1470	return len;
1471}
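/*
 * For example (illustrative): with a 6-device array this reserves room
 * for 8 page pointers plus 8 addr_conv_t entries - every device plus
 * the two extra ddf/raid6 destination slots described above.
 */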
1472
1473static int resize_stripes(struct r5conf *conf, int newsize)
1474{
1475	/* Make all the stripes able to hold 'newsize' devices.
1476	 * New slots in each stripe get 'page' set to a new page.
1477	 *
1478	 * This happens in stages:
1479	 * 1/ create a new kmem_cache and allocate the required number of
1480	 *    stripe_heads.
1481 * 2/ gather all the old stripe_heads and transfer the pages across
1482	 *    to the new stripe_heads.  This will have the side effect of
1483	 *    freezing the array as once all stripe_heads have been collected,
1484	 *    no IO will be possible.  Old stripe heads are freed once their
1485	 *    pages have been transferred over, and the old kmem_cache is
1486	 *    freed when all stripes are done.
1487 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
1488 *    we simply return a failure status - no need to clean anything up.
1489 * 4/ allocate new pages for the new slots in the new stripe_heads.
1490 *    If this fails, we don't bother trying to shrink the
1491	 *    stripe_heads down again, we just leave them as they are.
1492	 *    As each stripe_head is processed the new one is released into
1493	 *    active service.
1494	 *
1495	 * Once step2 is started, we cannot afford to wait for a write,
1496	 * so we use GFP_NOIO allocations.
1497	 */
1498	struct stripe_head *osh, *nsh;
1499	LIST_HEAD(newstripes);
1500	struct disk_info *ndisks;
1501	unsigned long cpu;
1502	int err;
1503	struct kmem_cache *sc;
1504	int i;
1505
1506	if (newsize <= conf->pool_size)
1507		return 0; /* never bother to shrink */
1508
1509	err = md_allow_write(conf->mddev);
1510	if (err)
1511		return err;
1512
1513	/* Step 1 */
1514	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1515			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1516			       0, 0, NULL);
1517	if (!sc)
1518		return -ENOMEM;
1519
1520	for (i = conf->max_nr_stripes; i; i--) {
1521		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1522		if (!nsh)
1523			break;
1524
1525		nsh->raid_conf = conf;
1526		#ifdef CONFIG_MULTICORE_RAID456
1527		init_waitqueue_head(&nsh->ops.wait_for_ops);
1528		#endif
1529
1530		list_add(&nsh->lru, &newstripes);
1531	}
1532	if (i) {
1533		/* didn't get enough, give up */
1534		while (!list_empty(&newstripes)) {
1535			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1536			list_del(&nsh->lru);
1537			kmem_cache_free(sc, nsh);
1538		}
1539		kmem_cache_destroy(sc);
1540		return -ENOMEM;
1541	}
1542	/* Step 2 - Must use GFP_NOIO now.
1543	 * OK, we have enough stripes, start collecting inactive
1544	 * stripes and copying them over
1545	 */
1546	list_for_each_entry(nsh, &newstripes, lru) {
1547		spin_lock_irq(&conf->device_lock);
1548		wait_event_lock_irq(conf->wait_for_stripe,
1549				    !list_empty(&conf->inactive_list),
1550				    conf->device_lock,
1551				    );
1552		osh = get_free_stripe(conf);
1553		spin_unlock_irq(&conf->device_lock);
1554		atomic_set(&nsh->count, 1);
1555		for(i=0; i<conf->pool_size; i++)
1556			nsh->dev[i].page = osh->dev[i].page;
1557		for( ; i<newsize; i++)
1558			nsh->dev[i].page = NULL;
1559		kmem_cache_free(conf->slab_cache, osh);
1560	}
1561	kmem_cache_destroy(conf->slab_cache);
1562
1563	/* Step 3.
1564	 * At this point, we are holding all the stripes, so the array
1565	 * is completely stalled; now is a good time to resize
1566	 * conf->disks and the scribble region.
1567	 */
1568	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1569	if (ndisks) {
1570		for (i=0; i<conf->raid_disks; i++)
1571			ndisks[i] = conf->disks[i];
1572		kfree(conf->disks);
1573		conf->disks = ndisks;
1574	} else
1575		err = -ENOMEM;
1576
1577	get_online_cpus();
1578	conf->scribble_len = scribble_len(newsize);
1579	for_each_present_cpu(cpu) {
1580		struct raid5_percpu *percpu;
1581		void *scribble;
1582
1583		percpu = per_cpu_ptr(conf->percpu, cpu);
1584		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1585
1586		if (scribble) {
1587			kfree(percpu->scribble);
1588			percpu->scribble = scribble;
1589		} else {
1590			err = -ENOMEM;
1591			break;
1592		}
1593	}
1594	put_online_cpus();
1595
1596	/* Step 4, return new stripes to service */
1597	while(!list_empty(&newstripes)) {
1598		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1599		list_del_init(&nsh->lru);
1600
1601		for (i=conf->raid_disks; i < newsize; i++)
1602			if (nsh->dev[i].page == NULL) {
1603				struct page *p = alloc_page(GFP_NOIO);
1604				nsh->dev[i].page = p;
1605				if (!p)
1606					err = -ENOMEM;
1607			}
1608		release_stripe(nsh);
1609	}
1610	/* critical section passed, GFP_NOIO no longer needed */
1611
1612	conf->slab_cache = sc;
1613	conf->active_name = 1-conf->active_name;
1614	conf->pool_size = newsize;
1615	return err;
1616}
1617
1618static int drop_one_stripe(struct r5conf *conf)
1619{
1620	struct stripe_head *sh;
1621
1622	spin_lock_irq(&conf->device_lock);
1623	sh = get_free_stripe(conf);
1624	spin_unlock_irq(&conf->device_lock);
1625	if (!sh)
1626		return 0;
1627	BUG_ON(atomic_read(&sh->count));
1628	shrink_buffers(sh);
1629	kmem_cache_free(conf->slab_cache, sh);
1630	atomic_dec(&conf->active_stripes);
1631	return 1;
1632}
1633
1634static void shrink_stripes(struct r5conf *conf)
1635{
1636	while (drop_one_stripe(conf))
1637		;
1638
1639	if (conf->slab_cache)
1640		kmem_cache_destroy(conf->slab_cache);
1641	conf->slab_cache = NULL;
1642}
1643
1644static void raid5_end_read_request(struct bio * bi, int error)
1645{
1646	struct stripe_head *sh = bi->bi_private;
1647	struct r5conf *conf = sh->raid_conf;
1648	int disks = sh->disks, i;
1649	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1650	char b[BDEVNAME_SIZE];
1651	struct md_rdev *rdev = NULL;
1652
1653
1654	for (i=0 ; i<disks; i++)
1655		if (bi == &sh->dev[i].req)
1656			break;
1657
1658	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1659		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1660		uptodate);
1661	if (i == disks) {
1662		BUG();
1663		return;
1664	}
1665	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1666		/* If replacement finished while this request was outstanding,
1667		 * 'replacement' might be NULL already.
1668		 * In that case it moved down to 'rdev'.
1669		 * rdev is not removed until all requests are finished.
1670		 */
1671		rdev = conf->disks[i].replacement;
1672	if (!rdev)
1673		rdev = conf->disks[i].rdev;
1674
1675	if (uptodate) {
1676		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1677		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1678			/* Note that this cannot happen on a
1679			 * replacement device.  We just fail those on
1680			 * any error
1681			 */
1682			printk_ratelimited(
1683				KERN_INFO
1684				"md/raid:%s: read error corrected"
1685				" (%lu sectors at %llu on %s)\n",
1686				mdname(conf->mddev), STRIPE_SECTORS,
1687				(unsigned long long)(sh->sector
1688						     + rdev->data_offset),
1689				bdevname(rdev->bdev, b));
1690			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1691			clear_bit(R5_ReadError, &sh->dev[i].flags);
1692			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1693		}
1694		if (atomic_read(&rdev->read_errors))
1695			atomic_set(&rdev->read_errors, 0);
1696	} else {
1697		const char *bdn = bdevname(rdev->bdev, b);
1698		int retry = 0;
1699
1700		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1701		atomic_inc(&rdev->read_errors);
1702		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1703			printk_ratelimited(
1704				KERN_WARNING
1705				"md/raid:%s: read error on replacement device "
1706				"(sector %llu on %s).\n",
1707				mdname(conf->mddev),
1708				(unsigned long long)(sh->sector
1709						     + rdev->data_offset),
1710				bdn);
1711		else if (conf->mddev->degraded >= conf->max_degraded)
1712			printk_ratelimited(
1713				KERN_WARNING
1714				"md/raid:%s: read error not correctable "
1715				"(sector %llu on %s).\n",
1716				mdname(conf->mddev),
1717				(unsigned long long)(sh->sector
1718						     + rdev->data_offset),
1719				bdn);
1720		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1721			/* Oh, no!!! */
1722			printk_ratelimited(
1723				KERN_WARNING
1724				"md/raid:%s: read error NOT corrected!! "
1725				"(sector %llu on %s).\n",
1726				mdname(conf->mddev),
1727				(unsigned long long)(sh->sector
1728						     + rdev->data_offset),
1729				bdn);
1730		else if (atomic_read(&rdev->read_errors)
1731			 > conf->max_nr_stripes)
1732			printk(KERN_WARNING
1733			       "md/raid:%s: Too many read errors, failing device %s.\n",
1734			       mdname(conf->mddev), bdn);
1735		else
1736			retry = 1;
1737		if (retry)
1738			set_bit(R5_ReadError, &sh->dev[i].flags);
1739		else {
1740			clear_bit(R5_ReadError, &sh->dev[i].flags);
1741			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1742			md_error(conf->mddev, rdev);
1743		}
1744	}
1745	rdev_dec_pending(rdev, conf->mddev);
1746	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1747	set_bit(STRIPE_HANDLE, &sh->state);
1748	release_stripe(sh);
1749}
1750
1751static void raid5_end_write_request(struct bio *bi, int error)
1752{
1753	struct stripe_head *sh = bi->bi_private;
1754	struct r5conf *conf = sh->raid_conf;
1755	int disks = sh->disks, i;
1756	struct md_rdev *uninitialized_var(rdev);
1757	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1758	sector_t first_bad;
1759	int bad_sectors;
1760	int replacement = 0;
1761
1762	for (i = 0 ; i < disks; i++) {
1763		if (bi == &sh->dev[i].req) {
1764			rdev = conf->disks[i].rdev;
1765			break;
1766		}
1767		if (bi == &sh->dev[i].rreq) {
1768			rdev = conf->disks[i].replacement;
1769			if (rdev)
1770				replacement = 1;
1771			else
1772				/* rdev was removed and 'replacement'
1773				 * replaced it.  rdev is not removed
1774				 * until all requests are finished.
1775				 */
1776				rdev = conf->disks[i].rdev;
1777			break;
1778		}
1779	}
1780	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1781		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1782		uptodate);
1783	if (i == disks) {
1784		BUG();
1785		return;
1786	}
1787
1788	if (replacement) {
1789		if (!uptodate)
1790			md_error(conf->mddev, rdev);
1791		else if (is_badblock(rdev, sh->sector,
1792				     STRIPE_SECTORS,
1793				     &first_bad, &bad_sectors))
1794			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1795	} else {
1796		if (!uptodate) {
1797			set_bit(WriteErrorSeen, &rdev->flags);
1798			set_bit(R5_WriteError, &sh->dev[i].flags);
1799		} else if (is_badblock(rdev, sh->sector,
1800				       STRIPE_SECTORS,
1801				       &first_bad, &bad_sectors))
1802			set_bit(R5_MadeGood, &sh->dev[i].flags);
1803	}
1804	rdev_dec_pending(rdev, conf->mddev);
1805
1806	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1807		clear_bit(R5_LOCKED, &sh->dev[i].flags);
1808	set_bit(STRIPE_HANDLE, &sh->state);
1809	release_stripe(sh);
1810}
1811
1812static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1813
1814static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1815{
1816	struct r5dev *dev = &sh->dev[i];
1817
1818	bio_init(&dev->req);
1819	dev->req.bi_io_vec = &dev->vec;
1820	dev->req.bi_vcnt++;
1821	dev->req.bi_max_vecs++;
1822	dev->req.bi_private = sh;
1823	dev->vec.bv_page = dev->page;
1824
1825	bio_init(&dev->rreq);
1826	dev->rreq.bi_io_vec = &dev->rvec;
1827	dev->rreq.bi_vcnt++;
1828	dev->rreq.bi_max_vecs++;
1829	dev->rreq.bi_private = sh;
1830	dev->rvec.bv_page = dev->page;
1831
1832	dev->flags = 0;
1833	dev->sector = compute_blocknr(sh, i, previous);
1834}
1835
1836static void error(struct mddev *mddev, struct md_rdev *rdev)
1837{
1838	char b[BDEVNAME_SIZE];
1839	struct r5conf *conf = mddev->private;
1840	unsigned long flags;
1841	pr_debug("raid456: error called\n");
1842
1843	spin_lock_irqsave(&conf->device_lock, flags);
1844	clear_bit(In_sync, &rdev->flags);
1845	mddev->degraded = calc_degraded(conf);
1846	spin_unlock_irqrestore(&conf->device_lock, flags);
1847	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1848
1849	set_bit(Blocked, &rdev->flags);
1850	set_bit(Faulty, &rdev->flags);
1851	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1852	printk(KERN_ALERT
1853	       "md/raid:%s: Disk failure on %s, disabling device.\n"
1854	       "md/raid:%s: Operation continuing on %d devices.\n",
1855	       mdname(mddev),
1856	       bdevname(rdev->bdev, b),
1857	       mdname(mddev),
1858	       conf->raid_disks - mddev->degraded);
1859}
1860
1861/*
1862 * Input: a 'big' sector number,
1863 * Output: index of the data and parity disk, and the sector # in them.
1864 */
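/*
 * Hand-worked sketch (all numbers below are hypothetical, not from a real
 * array): RAID5, ALGORITHM_LEFT_SYMMETRIC, raid_disks = 4 so data_disks = 3,
 * with sectors_per_chunk = 64.  For r_sector = 200:
 *
 *   chunk_offset = 200 % 64 = 8,   chunk_number = 200 / 64 = 3
 *   *dd_idx      = 3 % 3 = 0,      stripe       = 3 / 3 = 1
 *   pd_idx       = data_disks - (stripe % raid_disks) = 3 - 1 = 2
 *   *dd_idx      = (pd_idx + 1 + *dd_idx) % raid_disks = 3
 *   new_sector   = stripe * sectors_per_chunk + chunk_offset = 72
 *
 * i.e. the data lands on device 3 at sector 72 and the parity for that
 * stripe lives on device 2.  compute_blocknr() below reverses this mapping.
 */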
1865static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
1866				     int previous, int *dd_idx,
1867				     struct stripe_head *sh)
1868{
1869	sector_t stripe, stripe2;
1870	sector_t chunk_number;
1871	unsigned int chunk_offset;
1872	int pd_idx, qd_idx;
1873	int ddf_layout = 0;
1874	sector_t new_sector;
1875	int algorithm = previous ? conf->prev_algo
1876				 : conf->algorithm;
1877	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1878					 : conf->chunk_sectors;
1879	int raid_disks = previous ? conf->previous_raid_disks
1880				  : conf->raid_disks;
1881	int data_disks = raid_disks - conf->max_degraded;
1882
1883	/* First compute the information on this sector */
1884
1885	/*
1886	 * Compute the chunk number and the sector offset inside the chunk
1887	 */
1888	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1889	chunk_number = r_sector;
1890
1891	/*
1892	 * Compute the stripe number
1893	 */
1894	stripe = chunk_number;
1895	*dd_idx = sector_div(stripe, data_disks);
1896	stripe2 = stripe;
1897	/*
1898	 * Select the parity disk based on the user selected algorithm.
1899	 */
1900	pd_idx = qd_idx = -1;
1901	switch(conf->level) {
1902	case 4:
1903		pd_idx = data_disks;
1904		break;
1905	case 5:
1906		switch (algorithm) {
1907		case ALGORITHM_LEFT_ASYMMETRIC:
1908			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1909			if (*dd_idx >= pd_idx)
1910				(*dd_idx)++;
1911			break;
1912		case ALGORITHM_RIGHT_ASYMMETRIC:
1913			pd_idx = sector_div(stripe2, raid_disks);
1914			if (*dd_idx >= pd_idx)
1915				(*dd_idx)++;
1916			break;
1917		case ALGORITHM_LEFT_SYMMETRIC:
1918			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1919			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1920			break;
1921		case ALGORITHM_RIGHT_SYMMETRIC:
1922			pd_idx = sector_div(stripe2, raid_disks);
1923			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1924			break;
1925		case ALGORITHM_PARITY_0:
1926			pd_idx = 0;
1927			(*dd_idx)++;
1928			break;
1929		case ALGORITHM_PARITY_N:
1930			pd_idx = data_disks;
1931			break;
1932		default:
1933			BUG();
1934		}
1935		break;
1936	case 6:
1937
1938		switch (algorithm) {
1939		case ALGORITHM_LEFT_ASYMMETRIC:
1940			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1941			qd_idx = pd_idx + 1;
1942			if (pd_idx == raid_disks-1) {
1943				(*dd_idx)++;	/* Q D D D P */
1944				qd_idx = 0;
1945			} else if (*dd_idx >= pd_idx)
1946				(*dd_idx) += 2; /* D D P Q D */
1947			break;
1948		case ALGORITHM_RIGHT_ASYMMETRIC:
1949			pd_idx = sector_div(stripe2, raid_disks);
1950			qd_idx = pd_idx + 1;
1951			if (pd_idx == raid_disks-1) {
1952				(*dd_idx)++;	/* Q D D D P */
1953				qd_idx = 0;
1954			} else if (*dd_idx >= pd_idx)
1955				(*dd_idx) += 2; /* D D P Q D */
1956			break;
1957		case ALGORITHM_LEFT_SYMMETRIC:
1958			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1959			qd_idx = (pd_idx + 1) % raid_disks;
1960			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1961			break;
1962		case ALGORITHM_RIGHT_SYMMETRIC:
1963			pd_idx = sector_div(stripe2, raid_disks);
1964			qd_idx = (pd_idx + 1) % raid_disks;
1965			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1966			break;
1967
1968		case ALGORITHM_PARITY_0:
1969			pd_idx = 0;
1970			qd_idx = 1;
1971			(*dd_idx) += 2;
1972			break;
1973		case ALGORITHM_PARITY_N:
1974			pd_idx = data_disks;
1975			qd_idx = data_disks + 1;
1976			break;
1977
1978		case ALGORITHM_ROTATING_ZERO_RESTART:
1979			/* Exactly the same as RIGHT_ASYMMETRIC, but the
1980			 * order of blocks for computing Q is different.
1981			 */
1982			pd_idx = sector_div(stripe2, raid_disks);
1983			qd_idx = pd_idx + 1;
1984			if (pd_idx == raid_disks-1) {
1985				(*dd_idx)++;	/* Q D D D P */
1986				qd_idx = 0;
1987			} else if (*dd_idx >= pd_idx)
1988				(*dd_idx) += 2; /* D D P Q D */
1989			ddf_layout = 1;
1990			break;
1991
1992		case ALGORITHM_ROTATING_N_RESTART:
1993			/* Same as left_asymmetric, but the first stripe is
1994			 * D D D P Q  rather than
1995			 * Q D D D P
1996			 */
1997			stripe2 += 1;
1998			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1999			qd_idx = pd_idx + 1;
2000			if (pd_idx == raid_disks-1) {
2001				(*dd_idx)++;	/* Q D D D P */
2002				qd_idx = 0;
2003			} else if (*dd_idx >= pd_idx)
2004				(*dd_idx) += 2; /* D D P Q D */
2005			ddf_layout = 1;
2006			break;
2007
2008		case ALGORITHM_ROTATING_N_CONTINUE:
2009			/* Same as left_symmetric but Q is before P */
2010			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2011			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2012			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2013			ddf_layout = 1;
2014			break;
2015
2016		case ALGORITHM_LEFT_ASYMMETRIC_6:
2017			/* RAID5 left_asymmetric, with Q on last device */
2018			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2019			if (*dd_idx >= pd_idx)
2020				(*dd_idx)++;
2021			qd_idx = raid_disks - 1;
2022			break;
2023
2024		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2025			pd_idx = sector_div(stripe2, raid_disks-1);
2026			if (*dd_idx >= pd_idx)
2027				(*dd_idx)++;
2028			qd_idx = raid_disks - 1;
2029			break;
2030
2031		case ALGORITHM_LEFT_SYMMETRIC_6:
2032			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2033			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2034			qd_idx = raid_disks - 1;
2035			break;
2036
2037		case ALGORITHM_RIGHT_SYMMETRIC_6:
2038			pd_idx = sector_div(stripe2, raid_disks-1);
2039			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2040			qd_idx = raid_disks - 1;
2041			break;
2042
2043		case ALGORITHM_PARITY_0_6:
2044			pd_idx = 0;
2045			(*dd_idx)++;
2046			qd_idx = raid_disks - 1;
2047			break;
2048
2049		default:
2050			BUG();
2051		}
2052		break;
2053	}
2054
2055	if (sh) {
2056		sh->pd_idx = pd_idx;
2057		sh->qd_idx = qd_idx;
2058		sh->ddf_layout = ddf_layout;
2059	}
2060	/*
2061	 * Finally, compute the new sector number
2062	 */
2063	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2064	return new_sector;
2065}
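
/*
 * A minimal, compiled-out sketch of the same LEFT_SYMMETRIC arithmetic as
 * above, using plain 64-bit division instead of sector_div() and assuming a
 * non-degraded RAID5 geometry.  It exists only to illustrate the mapping;
 * the function name and its direct use of '/' and '%' are illustrative and
 * not part of this driver.
 */
#if 0
static unsigned long long demo_left_symmetric_map(unsigned long long r_sector,
						  int raid_disks,
						  int sectors_per_chunk,
						  int *dd_idx, int *pd_idx)
{
	int data_disks = raid_disks - 1;	/* RAID5: one parity device */
	unsigned long long chunk_offset = r_sector % sectors_per_chunk;
	unsigned long long chunk_number = r_sector / sectors_per_chunk;
	unsigned long long stripe = chunk_number / data_disks;

	*dd_idx = chunk_number % data_disks;
	*pd_idx = data_disks - (int)(stripe % raid_disks);
	*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;

	/* sector offset within the chosen data device */
	return stripe * sectors_per_chunk + chunk_offset;
}
#endif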
2066
2067
2068static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2069{
2070	struct r5conf *conf = sh->raid_conf;
2071	int raid_disks = sh->disks;
2072	int data_disks = raid_disks - conf->max_degraded;
2073	sector_t new_sector = sh->sector, check;
2074	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2075					 : conf->chunk_sectors;
2076	int algorithm = previous ? conf->prev_algo
2077				 : conf->algorithm;
2078	sector_t stripe;
2079	int chunk_offset;
2080	sector_t chunk_number;
2081	int dummy1, dd_idx = i;
2082	sector_t r_sector;
2083	struct stripe_head sh2;
2084
2085
2086	chunk_offset = sector_div(new_sector, sectors_per_chunk);
2087	stripe = new_sector;
2088
2089	if (i == sh->pd_idx)
2090		return 0;
2091	switch(conf->level) {
2092	case 4: break;
2093	case 5:
2094		switch (algorithm) {
2095		case ALGORITHM_LEFT_ASYMMETRIC:
2096		case ALGORITHM_RIGHT_ASYMMETRIC:
2097			if (i > sh->pd_idx)
2098				i--;
2099			break;
2100		case ALGORITHM_LEFT_SYMMETRIC:
2101		case ALGORITHM_RIGHT_SYMMETRIC:
2102			if (i < sh->pd_idx)
2103				i += raid_disks;
2104			i -= (sh->pd_idx + 1);
2105			break;
2106		case ALGORITHM_PARITY_0:
2107			i -= 1;
2108			break;
2109		case ALGORITHM_PARITY_N:
2110			break;
2111		default:
2112			BUG();
2113		}
2114		break;
2115	case 6:
2116		if (i == sh->qd_idx)
2117			return 0; /* It is the Q disk */
2118		switch (algorithm) {
2119		case ALGORITHM_LEFT_ASYMMETRIC:
2120		case ALGORITHM_RIGHT_ASYMMETRIC:
2121		case ALGORITHM_ROTATING_ZERO_RESTART:
2122		case ALGORITHM_ROTATING_N_RESTART:
2123			if (sh->pd_idx == raid_disks-1)
2124				i--;	/* Q D D D P */
2125			else if (i > sh->pd_idx)
2126				i -= 2; /* D D P Q D */
2127			break;
2128		case ALGORITHM_LEFT_SYMMETRIC:
2129		case ALGORITHM_RIGHT_SYMMETRIC:
2130			if (sh->pd_idx == raid_disks-1)
2131				i--; /* Q D D D P */
2132			else {
2133				/* D D P Q D */
2134				if (i < sh->pd_idx)
2135					i += raid_disks;
2136				i -= (sh->pd_idx + 2);
2137			}
2138			break;
2139		case ALGORITHM_PARITY_0:
2140			i -= 2;
2141			break;
2142		case ALGORITHM_PARITY_N:
2143			break;
2144		case ALGORITHM_ROTATING_N_CONTINUE:
2145			/* Like left_symmetric, but P is before Q */
2146			if (sh->pd_idx == 0)
2147				i--;	/* P D D D Q */
2148			else {
2149				/* D D Q P D */
2150				if (i < sh->pd_idx)
2151					i += raid_disks;
2152				i -= (sh->pd_idx + 1);
2153			}
2154			break;
2155		case ALGORITHM_LEFT_ASYMMETRIC_6:
2156		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2157			if (i > sh->pd_idx)
2158				i--;
2159			break;
2160		case ALGORITHM_LEFT_SYMMETRIC_6:
2161		case ALGORITHM_RIGHT_SYMMETRIC_6:
2162			if (i < sh->pd_idx)
2163				i += data_disks + 1;
2164			i -= (sh->pd_idx + 1);
2165			break;
2166		case ALGORITHM_PARITY_0_6:
2167			i -= 1;
2168			break;
2169		default:
2170			BUG();
2171		}
2172		break;
2173	}
2174
2175	chunk_number = stripe * data_disks + i;
2176	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2177
2178	check = raid5_compute_sector(conf, r_sector,
2179				     previous, &dummy1, &sh2);
2180	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2181		|| sh2.qd_idx != sh->qd_idx) {
2182		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2183		       mdname(conf->mddev));
2184		return 0;
2185	}
2186	return r_sector;
2187}
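
/*
 * Continuing the hand-worked sketch above raid5_compute_sector() (numbers
 * hypothetical): for the stripe at sh->sector = 72 with pd_idx = 2, calling
 * compute_blocknr(sh, 3, 0) gives
 *
 *   chunk_offset = 72 % 64 = 8,  stripe = 72 / 64 = 1
 *   LEFT_SYMMETRIC: i = 3 - (pd_idx + 1) = 0
 *   chunk_number = stripe * data_disks + i = 3
 *   r_sector     = 3 * 64 + 8 = 200
 *
 * which is exactly the array sector we started from; the re-check through
 * raid5_compute_sector() at the end of the function verifies this inverse.
 */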
2188
2189
2190static void
2191schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2192			 int rcw, int expand)
2193{
2194	int i, pd_idx = sh->pd_idx, disks = sh->disks;
2195	struct r5conf *conf = sh->raid_conf;
2196	int level = conf->level;
2197
2198	if (rcw) {
2199		/* if we are not expanding this is a proper write request, and
2200		 * there will be bios with new data to be drained into the
2201		 * stripe cache
2202		 */
2203		if (!expand) {
2204			sh->reconstruct_state = reconstruct_state_drain_run;
2205			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2206		} else
2207			sh->reconstruct_state = reconstruct_state_run;
2208
2209		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2210
2211		for (i = disks; i--; ) {
2212			struct r5dev *dev = &sh->dev[i];
2213
2214			if (dev->towrite) {
2215				set_bit(R5_LOCKED, &dev->flags);
2216				set_bit(R5_Wantdrain, &dev->flags);
2217				if (!expand)
2218					clear_bit(R5_UPTODATE, &dev->flags);
2219				s->locked++;
2220			}
2221		}
2222		if (s->locked + conf->max_degraded == disks)
2223			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2224				atomic_inc(&conf->pending_full_writes);
2225	} else {
2226		BUG_ON(level == 6);
2227		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2228			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2229
2230		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2231		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2232		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2233		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2234
2235		for (i = disks; i--; ) {
2236			struct r5dev *dev = &sh->dev[i];
2237			if (i == pd_idx)
2238				continue;
2239
2240			if (dev->towrite &&
2241			    (test_bit(R5_UPTODATE, &dev->flags) ||
2242			     test_bit(R5_Wantcompute, &dev->flags))) {
2243				set_bit(R5_Wantdrain, &dev->flags);
2244				set_bit(R5_LOCKED, &dev->flags);
2245				clear_bit(R5_UPTODATE, &dev->flags);
2246				s->locked++;
2247			}
2248		}
2249	}
2250
2251	/* keep the parity disk(s) locked while asynchronous operations
2252	 * are in flight
2253	 */
2254	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2255	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2256	s->locked++;
2257
2258	if (level == 6) {
2259		int qd_idx = sh->qd_idx;
2260		struct r5dev *dev = &sh->dev[qd_idx];
2261
2262		set_bit(R5_LOCKED, &dev->flags);
2263		clear_bit(R5_UPTODATE, &dev->flags);
2264		s->locked++;
2265	}
2266
2267	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2268		__func__, (unsigned long long)sh->sector,
2269		s->locked, s->ops_request);
2270}
2271
2272/*
2273 * Each stripe/dev can have one or more bion attached.
2274 * toread/towrite point to the first in a chain.
2275 * The bi_next chain must be in order.
2276 */
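/*
 * Illustrative layout (hypothetical sectors, not from a real request): with
 * STRIPE_SECTORS == 8 and sh->dev[dd_idx].sector == 64, a valid towrite
 * chain could be
 *
 *   towrite -> bio A [64 .. 67] -> bio B [68 .. 71] -> NULL
 *
 * A new bio covering [66 .. 69] would overlap bio A, so add_stripe_bio()
 * below returns 0 after setting R5_Overlap, and the caller must wait and
 * retry.  Once writes for the full range [64 .. 71] are queued (here bios
 * A and B together), the "page is covered" scan sets R5_OVERWRITE.
 */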
2277static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2278{
2279	struct bio **bip;
2280	struct r5conf *conf = sh->raid_conf;
2281	int firstwrite=0;
2282
2283	pr_debug("adding bi b#%llu to stripe s#%llu\n",
2284		(unsigned long long)bi->bi_sector,
2285		(unsigned long long)sh->sector);
2286
2287
2288	spin_lock_irq(&conf->device_lock);
2289	if (forwrite) {
2290		bip = &sh->dev[dd_idx].towrite;
2291		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2292			firstwrite = 1;
2293	} else
2294		bip = &sh->dev[dd_idx].toread;
2295	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2296		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2297			goto overlap;
2298		bip = & (*bip)->bi_next;
2299	}
2300	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2301		goto overlap;
2302
2303	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2304	if (*bip)
2305		bi->bi_next = *bip;
2306	*bip = bi;
2307	bi->bi_phys_segments++;
2308
2309	if (forwrite) {
2310		/* check if page is covered */
2311		sector_t sector = sh->dev[dd_idx].sector;
2312		for (bi=sh->dev[dd_idx].towrite;
2313		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2314			     bi && bi->bi_sector <= sector;
2315		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2316			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2317				sector = bi->bi_sector + (bi->bi_size>>9);
2318		}
2319		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2320			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2321	}
2322	spin_unlock_irq(&conf->device_lock);
2323
2324	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2325		(unsigned long long)(*bip)->bi_sector,
2326		(unsigned long long)sh->sector, dd_idx);
2327
2328	if (conf->mddev->bitmap && firstwrite) {
2329		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2330				  STRIPE_SECTORS, 0);
2331		sh->bm_seq = conf->seq_flush+1;
2332		set_bit(STRIPE_BIT_DELAY, &sh->state);
2333	}
2334	return 1;
2335
2336 overlap:
2337	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2338	spin_unlock_irq(&conf->device_lock);
2339	return 0;
2340}
2341
2342static void end_reshape(struct r5conf *conf);
2343
2344static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2345			    struct stripe_head *sh)
2346{
2347	int sectors_per_chunk =
2348		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2349	int dd_idx;
2350	int chunk_offset = sector_div(stripe, sectors_per_chunk);
2351	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2352
2353	raid5_compute_sector(conf,
2354			     stripe * (disks - conf->max_degraded)
2355			     *sectors_per_chunk + chunk_offset,
2356			     previous,
2357			     &dd_idx, sh);
2358}
2359
2360static void
2361handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2362				struct stripe_head_state *s, int disks,
2363				struct bio **return_bi)
2364{
2365	int i;
2366	for (i = disks; i--; ) {
2367		struct bio *bi;
2368		int bitmap_end = 0;
2369
2370		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2371			struct md_rdev *rdev;
2372			rcu_read_lock();
2373			rdev = rcu_dereference(conf->disks[i].rdev);
2374			if (rdev && test_bit(In_sync, &rdev->flags))
2375				atomic_inc(&rdev->nr_pending);
2376			else
2377				rdev = NULL;
2378			rcu_read_unlock();
2379			if (rdev) {
2380				if (!rdev_set_badblocks(
2381					    rdev,
2382					    sh->sector,
2383					    STRIPE_SECTORS, 0))
2384					md_error(conf->mddev, rdev);
2385				rdev_dec_pending(rdev, conf->mddev);
2386			}
2387		}
2388		spin_lock_irq(&conf->device_lock);
2389		/* fail all writes first */
2390		bi = sh->dev[i].towrite;
2391		sh->dev[i].towrite = NULL;
2392		if (bi) {
2393			s->to_write--;
2394			bitmap_end = 1;
2395		}
2396
2397		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2398			wake_up(&conf->wait_for_overlap);
2399
2400		while (bi && bi->bi_sector <
2401			sh->dev[i].sector + STRIPE_SECTORS) {
2402			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2403			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2404			if (!raid5_dec_bi_phys_segments(bi)) {
2405				md_write_end(conf->mddev);
2406				bi->bi_next = *return_bi;
2407				*return_bi = bi;
2408			}
2409			bi = nextbi;
2410		}
2411		/* and fail all 'written' */
2412		bi = sh->dev[i].written;
2413		sh->dev[i].written = NULL;
2414		if (bi) bitmap_end = 1;
2415		while (bi && bi->bi_sector <
2416		       sh->dev[i].sector + STRIPE_SECTORS) {
2417			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2418			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2419			if (!raid5_dec_bi_phys_segments(bi)) {
2420				md_write_end(conf->mddev);
2421				bi->bi_next = *return_bi;
2422				*return_bi = bi;
2423			}
2424			bi = bi2;
2425		}
2426
2427		/* fail any reads if this device is non-operational and
2428		 * the data has not reached the cache yet.
2429		 */
2430		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2431		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2432		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
2433			bi = sh->dev[i].toread;
2434			sh->dev[i].toread = NULL;
2435			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2436				wake_up(&conf->wait_for_overlap);
2437			if (bi) s->to_read--;
2438			while (bi && bi->bi_sector <
2439			       sh->dev[i].sector + STRIPE_SECTORS) {
2440				struct bio *nextbi =
2441					r5_next_bio(bi, sh->dev[i].sector);
2442				clear_bit(BIO_UPTODATE, &bi->bi_flags);
2443				if (!raid5_dec_bi_phys_segments(bi)) {
2444					bi->bi_next = *return_bi;
2445					*return_bi = bi;
2446				}
2447				bi = nextbi;
2448			}
2449		}
2450		spin_unlock_irq(&conf->device_lock);
2451		if (bitmap_end)
2452			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2453					STRIPE_SECTORS, 0, 0);
2454		/* If we were in the middle of a write the parity block might
2455		 * still be locked - so just clear all R5_LOCKED flags
2456		 */
2457		clear_bit(R5_LOCKED, &sh->dev[i].flags);
2458	}
2459
2460	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2461		if (atomic_dec_and_test(&conf->pending_full_writes))
2462			md_wakeup_thread(conf->mddev->thread);
2463}
2464
2465static void
2466handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2467		   struct stripe_head_state *s)
2468{
2469	int abort = 0;
2470	int i;
2471
2472	md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2473	clear_bit(STRIPE_SYNCING, &sh->state);
2474	s->syncing = 0;
2475	s->replacing = 0;
2476	/* There is nothing more to do for sync/check/repair.
2477	 * For recover/replace we need to record a bad block on all
2478	 * non-sync devices, or abort the recovery
2479	 */
2480	if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2481		return;
2482	/* During recovery devices cannot be removed, so locking and
2483	 * refcounting of rdevs is not needed
2484	 */
2485	for (i = 0; i < conf->raid_disks; i++) {
2486		struct md_rdev *rdev = conf->disks[i].rdev;
2487		if (rdev
2488		    && !test_bit(Faulty, &rdev->flags)
2489		    && !test_bit(In_sync, &rdev->flags)
2490		    && !rdev_set_badblocks(rdev, sh->sector,
2491					   STRIPE_SECTORS, 0))
2492			abort = 1;
2493		rdev = conf->disks[i].replacement;
2494		if (rdev
2495		    && !test_bit(Faulty, &rdev->flags)
2496		    && !test_bit(In_sync, &rdev->flags)
2497		    && !rdev_set_badblocks(rdev, sh->sector,
2498					   STRIPE_SECTORS, 0))
2499			abort = 1;
2500	}
2501	if (abort) {
2502		conf->recovery_disabled = conf->mddev->recovery_disabled;
2503		set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2504	}
2505}
2506
2507static int want_replace(struct stripe_head *sh, int disk_idx)
2508{
2509	struct md_rdev *rdev;
2510	int rv = 0;
2511	/* Doing recovery so rcu locking not required */
2512	rdev = sh->raid_conf->disks[disk_idx].replacement;
2513	if (rdev
2514	    && !test_bit(Faulty, &rdev->flags)
2515	    && !test_bit(In_sync, &rdev->flags)
2516	    && (rdev->recovery_offset <= sh->sector
2517		|| rdev->mddev->recovery_cp <= sh->sector))
2518		rv = 1;
2519
2520	return rv;
2521}
2522
2523/* fetch_block - checks the given member device to see if its data needs
2524 * to be read or computed to satisfy a request.
2525 *
2526 * Returns 1 when no more member devices need to be checked, otherwise returns
2527 * 0 to tell the loop in handle_stripe_fill to continue
2528 */
2529static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2530		       int disk_idx, int disks)
2531{
2532	struct r5dev *dev = &sh->dev[disk_idx];
2533	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2534				  &sh->dev[s->failed_num[1]] };
2535
2536	/* is the data in this block needed, and can we get it? */
2537	if (!test_bit(R5_LOCKED, &dev->flags) &&
2538	    !test_bit(R5_UPTODATE, &dev->flags) &&
2539	    (dev->toread ||
2540	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2541	     s->syncing || s->expanding ||
2542	     (s->replacing && want_replace(sh, disk_idx)) ||
2543	     (s->failed >= 1 && fdev[0]->toread) ||
2544	     (s->failed >= 2 && fdev[1]->toread) ||
2545	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2546	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2547	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2548		/* we would like to get this block, possibly by computing it,
2549		 * otherwise read it if the backing disk is insync
2550		 */
2551		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2552		BUG_ON(test_bit(R5_Wantread, &dev->flags));
2553		if ((s->uptodate == disks - 1) &&
2554		    (s->failed && (disk_idx == s->failed_num[0] ||
2555				   disk_idx == s->failed_num[1]))) {
2556			/* a disk has failed and we've been asked to fetch
2557			 * its data, so compute it instead
2558			 */
2559			pr_debug("Computing stripe %llu block %d\n",
2560			       (unsigned long long)sh->sector, disk_idx);
2561			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2562			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2563			set_bit(R5_Wantcompute, &dev->flags);
2564			sh->ops.target = disk_idx;
2565			sh->ops.target2 = -1; /* no 2nd target */
2566			s->req_compute = 1;
2567			/* Careful: from this point on 'uptodate' is in the eye
2568			 * of raid_run_ops which services 'compute' operations
2569			 * before writes. R5_Wantcompute flags a block that will
2570			 * be R5_UPTODATE by the time it is needed for a
2571			 * subsequent operation.
2572			 */
2573			s->uptodate++;
2574			return 1;
2575		} else if (s->uptodate == disks-2 && s->failed >= 2) {
2576			/* Computing 2-failure is *very* expensive; only
2577			 * do it if failed >= 2
2578			 */
2579			int other;
2580			for (other = disks; other--; ) {
2581				if (other == disk_idx)
2582					continue;
2583				if (!test_bit(R5_UPTODATE,
2584				      &sh->dev[other].flags))
2585					break;
2586			}
2587			BUG_ON(other < 0);
2588			pr_debug("Computing stripe %llu blocks %d,%d\n",
2589			       (unsigned long long)sh->sector,
2590			       disk_idx, other);
2591			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2592			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2593			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2594			set_bit(R5_Wantcompute, &sh->dev[other].flags);
2595			sh->ops.target = disk_idx;
2596			sh->ops.target2 = other;
2597			s->uptodate += 2;
2598			s->req_compute = 1;
2599			return 1;
2600		} else if (test_bit(R5_Insync, &dev->flags)) {
2601			set_bit(R5_LOCKED, &dev->flags);
2602			set_bit(R5_Wantread, &dev->flags);
2603			s->locked++;
2604			pr_debug("Reading block %d (sync=%d)\n",
2605				disk_idx, s->syncing);
2606		}
2607	}
2608
2609	return 0;
2610}
2611
2612/**
2613 * handle_stripe_fill - read or compute data to satisfy pending requests.
2614 */
2615static void handle_stripe_fill(struct stripe_head *sh,
2616			       struct stripe_head_state *s,
2617			       int disks)
2618{
2619	int i;
2620
2621	/* look for blocks to read/compute, skip this if a compute
2622	 * is already in flight, or if the stripe contents are in the
2623	 * midst of changing due to a write
2624	 */
2625	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2626	    !sh->reconstruct_state)
2627		for (i = disks; i--; )
2628			if (fetch_block(sh, s, i, disks))
2629				break;
2630	set_bit(STRIPE_HANDLE, &sh->state);
2631}
2632
2633
2634/* handle_stripe_clean_event
2635 * any written block on an uptodate or failed drive can be returned.
2636 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2637 * never LOCKED, so we don't need to test 'failed' directly.
2638 */
2639static void handle_stripe_clean_event(struct r5conf *conf,
2640	struct stripe_head *sh, int disks, struct bio **return_bi)
2641{
2642	int i;
2643	struct r5dev *dev;
2644
2645	for (i = disks; i--; )
2646		if (sh->dev[i].written) {
2647			dev = &sh->dev[i];
2648			if (!test_bit(R5_LOCKED, &dev->flags) &&
2649				test_bit(R5_UPTODATE, &dev->flags)) {
2650				/* We can return any write requests */
2651				struct bio *wbi, *wbi2;
2652				int bitmap_end = 0;
2653				pr_debug("Return write for disc %d\n", i);
2654				spin_lock_irq(&conf->device_lock);
2655				wbi = dev->written;
2656				dev->written = NULL;
2657				while (wbi && wbi->bi_sector <
2658					dev->sector + STRIPE_SECTORS) {
2659					wbi2 = r5_next_bio(wbi, dev->sector);
2660					if (!raid5_dec_bi_phys_segments(wbi)) {
2661						md_write_end(conf->mddev);
2662						wbi->bi_next = *return_bi;
2663						*return_bi = wbi;
2664					}
2665					wbi = wbi2;
2666				}
2667				if (dev->towrite == NULL)
2668					bitmap_end = 1;
2669				spin_unlock_irq(&conf->device_lock);
2670				if (bitmap_end)
2671					bitmap_endwrite(conf->mddev->bitmap,
2672							sh->sector,
2673							STRIPE_SECTORS,
2674					 !test_bit(STRIPE_DEGRADED, &sh->state),
2675							0);
2676			}
2677		}
2678
2679	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2680		if (atomic_dec_and_test(&conf->pending_full_writes))
2681			md_wakeup_thread(conf->mddev->thread);
2682}
2683
2684static void handle_stripe_dirtying(struct r5conf *conf,
2685				   struct stripe_head *sh,
2686				   struct stripe_head_state *s,
2687				   int disks)
2688{
2689	int rmw = 0, rcw = 0, i;
2690	if (conf->max_degraded == 2) {
2691		/* RAID6 requires 'rcw' in current implementation
2692		 * Calculate the real rcw later - for now fake it
2693		 * Calculate the real rcw later - for now make it
2694		 */
2695		rcw = 1; rmw = 2;
2696	} else for (i = disks; i--; ) {
2697		/* would I have to read this buffer for read_modify_write */
2698		struct r5dev *dev = &sh->dev[i];
2699		if ((dev->towrite || i == sh->pd_idx) &&
2700		    !test_bit(R5_LOCKED, &dev->flags) &&
2701		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2702		      test_bit(R5_Wantcompute, &dev->flags))) {
2703			if (test_bit(R5_Insync, &dev->flags))
2704				rmw++;
2705			else
2706				rmw += 2*disks;  /* cannot read it */
2707		}
2708		/* Would I have to read this buffer for reconstruct_write */
2709		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2710		    !test_bit(R5_LOCKED, &dev->flags) &&
2711		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2712		    test_bit(R5_Wantcompute, &dev->flags))) {
2713			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2714			else
2715				rcw += 2*disks;
2716		}
2717	}
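	/* Worked example (hypothetical geometry): RAID5 over 6 devices, so
	 * 5 data blocks plus parity per stripe, with nothing cached yet and
	 * every device In_sync.
	 *
	 *   - a small write touching part of one data block: rmw = 2 (old
	 *     data + old parity must be read), rcw = 5 (all five data blocks
	 *     are needed to reconstruct parity), so read-modify-write is
	 *     chosen below;
	 *   - full overwrites of four of the five data blocks: rmw = 5,
	 *     rcw = 1 (only the remaining data block), so reconstruct-write
	 *     wins instead.
	 */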
2718	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2719		(unsigned long long)sh->sector, rmw, rcw);
2720	set_bit(STRIPE_HANDLE, &sh->state);
2721	if (rmw < rcw && rmw > 0)
2722		/* prefer read-modify-write, but need to get some data */
2723		for (i = disks; i--; ) {
2724			struct r5dev *dev = &sh->dev[i];
2725			if ((dev->towrite || i == sh->pd_idx) &&
2726			    !test_bit(R5_LOCKED, &dev->flags) &&
2727			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2728			    test_bit(R5_Wantcompute, &dev->flags)) &&
2729			    test_bit(R5_Insync, &dev->flags)) {
2730				if (
2731				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2732					pr_debug("Read_old block "
2733						"%d for r-m-w\n", i);
2734					set_bit(R5_LOCKED, &dev->flags);
2735					set_bit(R5_Wantread, &dev->flags);
2736					s->locked++;
2737				} else {
2738					set_bit(STRIPE_DELAYED, &sh->state);
2739					set_bit(STRIPE_HANDLE, &sh->state);
2740				}
2741			}
2742		}
2743	if (rcw <= rmw && rcw > 0) {
2744		/* want reconstruct write, but need to get some data */
2745		rcw = 0;
2746		for (i = disks; i--; ) {
2747			struct r5dev *dev = &sh->dev[i];
2748			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2749			    i != sh->pd_idx && i != sh->qd_idx &&
2750			    !test_bit(R5_LOCKED, &dev->flags) &&
2751			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2752			      test_bit(R5_Wantcompute, &dev->flags))) {
2753				rcw++;
2754				if (!test_bit(R5_Insync, &dev->flags))
2755					continue; /* it's a failed drive */
2756				if (
2757				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2758					pr_debug("Read_old block "
2759						"%d for Reconstruct\n", i);
2760					set_bit(R5_LOCKED, &dev->flags);
2761					set_bit(R5_Wantread, &dev->flags);
2762					s->locked++;
2763				} else {
2764					set_bit(STRIPE_DELAYED, &sh->state);
2765					set_bit(STRIPE_HANDLE, &sh->state);
2766				}
2767			}
2768		}
2769	}
2770	/* now if nothing is locked, and if we have enough data,
2771	 * we can start a write request
2772	 */
2773	/* since handle_stripe can be called at any time we need to handle the
2774	 * case where a compute block operation has been submitted and then a
2775	 * subsequent call wants to start a write request.  raid_run_ops only
2776	 * handles the case where compute block and reconstruct are requested
2777	 * simultaneously.  If this is not the case then new writes need to be
2778	 * held off until the compute completes.
2779	 */
2780	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2781	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2782	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2783		schedule_reconstruction(sh, s, rcw == 0, 0);
2784}
2785
2786static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2787				struct stripe_head_state *s, int disks)
2788{
2789	struct r5dev *dev = NULL;
2790
2791	set_bit(STRIPE_HANDLE, &sh->state);
2792
2793	switch (sh->check_state) {
2794	case check_state_idle:
2795		/* start a new check operation if there are no failures */
2796		if (s->failed == 0) {
2797			BUG_ON(s->uptodate != disks);
2798			sh->check_state = check_state_run;
2799			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2800			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2801			s->uptodate--;
2802			break;
2803		}
2804		dev = &sh->dev[s->failed_num[0]];
2805		/* fall through */
2806	case check_state_compute_result:
2807		sh->check_state = check_state_idle;
2808		if (!dev)
2809			dev = &sh->dev[sh->pd_idx];
2810
2811		/* check that a write has not made the stripe insync */
2812		if (test_bit(STRIPE_INSYNC, &sh->state))
2813			break;
2814
2815		/* either failed parity check, or recovery is happening */
2816		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2817		BUG_ON(s->uptodate != disks);
2818
2819		set_bit(R5_LOCKED, &dev->flags);
2820		s->locked++;
2821		set_bit(R5_Wantwrite, &dev->flags);
2822
2823		clear_bit(STRIPE_DEGRADED, &sh->state);
2824		set_bit(STRIPE_INSYNC, &sh->state);
2825		break;
2826	case check_state_run:
2827		break; /* we will be called again upon completion */
2828	case check_state_check_result:
2829		sh->check_state = check_state_idle;
2830
2831		/* if a failure occurred during the check operation, leave
2832		 * STRIPE_INSYNC not set and let the stripe be handled again
2833		 */
2834		if (s->failed)
2835			break;
2836
2837		/* handle a successful check operation, if parity is correct
2838		 * we are done.  Otherwise update the mismatch count and repair
2839		 * parity if !MD_RECOVERY_CHECK
2840		 */
2841		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2842			/* parity is correct (on disc,
2843			 * not in buffer any more)
2844			 */
2845			set_bit(STRIPE_INSYNC, &sh->state);
2846		else {
2847			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2848			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2849				/* don't try to repair!! */
2850				set_bit(STRIPE_INSYNC, &sh->state);
2851			else {
2852				sh->check_state = check_state_compute_run;
2853				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2854				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2855				set_bit(R5_Wantcompute,
2856					&sh->dev[sh->pd_idx].flags);
2857				sh->ops.target = sh->pd_idx;
2858				sh->ops.target2 = -1;
2859				s->uptodate++;
2860			}
2861		}
2862		break;
2863	case check_state_compute_run:
2864		break;
2865	default:
2866		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2867		       __func__, sh->check_state,
2868		       (unsigned long long) sh->sector);
2869		BUG();
2870	}
2871}
2872
2873
2874static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
2875				  struct stripe_head_state *s,
2876				  int disks)
2877{
2878	int pd_idx = sh->pd_idx;
2879	int qd_idx = sh->qd_idx;
2880	struct r5dev *dev;
2881
2882	set_bit(STRIPE_HANDLE, &sh->state);
2883
2884	BUG_ON(s->failed > 2);
2885
2886	/* Want to check and possibly repair P and Q.
2887	 * However there could be one 'failed' device, in which
2888	 * case we can only check one of them, possibly using the
2889	 * other to generate missing data
2890	 */
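	/* A quick sketch of how the state below is chosen (informal, derived
	 * from the code that follows):
	 *
	 *   failed == 0                     -> check_state_run_pq (check P and Q)
	 *   failed == 1, Q is the victim    -> check_state_run    (check P only)
	 *   failed == 1, Q is not the victim-> check_state_run_q  (check Q only)
	 *   failed == 2                     -> fall through to compute_result
	 */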
2891
2892	switch (sh->check_state) {
2893	case check_state_idle:
2894		/* start a new check operation if there are < 2 failures */
2895		if (s->failed == s->q_failed) {
2896			/* The only possible failed device holds Q, so it
2897			 * makes sense to check P (If anything else were failed,
2898			 * we would have used P to recreate it).
2899			 */
2900			sh->check_state = check_state_run;
2901		}
2902		if (!s->q_failed && s->failed < 2) {
2903			/* Q is not failed, and we didn't use it to generate
2904			 * anything, so it makes sense to check it
2905			 */
2906			if (sh->check_state == check_state_run)
2907				sh->check_state = check_state_run_pq;
2908			else
2909				sh->check_state = check_state_run_q;
2910		}
2911
2912		/* discard potentially stale zero_sum_result */
2913		sh->ops.zero_sum_result = 0;
2914
2915		if (sh->check_state == check_state_run) {
2916			/* async_xor_zero_sum destroys the contents of P */
2917			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2918			s->uptodate--;
2919		}
2920		if (sh->check_state >= check_state_run &&
2921		    sh->check_state <= check_state_run_pq) {
2922			/* async_syndrome_zero_sum preserves P and Q, so
2923			 * no need to mark them !uptodate here
2924			 */
2925			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2926			break;
2927		}
2928
2929		/* we have 2-disk failure */
2930		BUG_ON(s->failed != 2);
2931		/* fall through */
2932	case check_state_compute_result:
2933		sh->check_state = check_state_idle;
2934
2935		/* check that a write has not made the stripe insync */
2936		if (test_bit(STRIPE_INSYNC, &sh->state))
2937			break;
2938
2939		/* now write out any block on a failed drive,
2940		 * or P or Q if they were recomputed
2941		 */
2942		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2943		if (s->failed == 2) {
2944			dev = &sh->dev[s->failed_num[1]];
2945			s->locked++;
2946			set_bit(R5_LOCKED, &dev->flags);
2947			set_bit(R5_Wantwrite, &dev->flags);
2948		}
2949		if (s->failed >= 1) {
2950			dev = &sh->dev[s->failed_num[0]];
2951			s->locked++;
2952			set_bit(R5_LOCKED, &dev->flags);
2953			set_bit(R5_Wantwrite, &dev->flags);
2954		}
2955		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2956			dev = &sh->dev[pd_idx];
2957			s->locked++;
2958			set_bit(R5_LOCKED, &dev->flags);
2959			set_bit(R5_Wantwrite, &dev->flags);
2960		}
2961		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2962			dev = &sh->dev[qd_idx];
2963			s->locked++;
2964			set_bit(R5_LOCKED, &dev->flags);
2965			set_bit(R5_Wantwrite, &dev->flags);
2966		}
2967		clear_bit(STRIPE_DEGRADED, &sh->state);
2968
2969		set_bit(STRIPE_INSYNC, &sh->state);
2970		break;
2971	case check_state_run:
2972	case check_state_run_q:
2973	case check_state_run_pq:
2974		break; /* we will be called again upon completion */
2975	case check_state_check_result:
2976		sh->check_state = check_state_idle;
2977
2978		/* handle a successful check operation, if parity is correct
2979		 * we are done.  Otherwise update the mismatch count and repair
2980		 * parity if !MD_RECOVERY_CHECK
2981		 */
2982		if (sh->ops.zero_sum_result == 0) {
2983			/* both parities are correct */
2984			if (!s->failed)
2985				set_bit(STRIPE_INSYNC, &sh->state);
2986			else {
2987				/* in contrast to the raid5 case we can validate
2988				 * parity, but still have a failure to write
2989				 * back
2990				 */
2991				sh->check_state = check_state_compute_result;
2992				/* Returning at this point means that we may go
2993				 * off and bring p and/or q uptodate again so
2994				 * we make sure to check zero_sum_result again
2995				 * to verify if p or q need writeback
2996				 */
2997			}
2998		} else {
2999			conf->mddev->resync_mismatches += STRIPE_SECTORS;
3000			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3001				/* don't try to repair!! */
3002				set_bit(STRIPE_INSYNC, &sh->state);
3003			else {
3004				int *target = &sh->ops.target;
3005
3006				sh->ops.target = -1;
3007				sh->ops.target2 = -1;
3008				sh->check_state = check_state_compute_run;
3009				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3010				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3011				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3012					set_bit(R5_Wantcompute,
3013						&sh->dev[pd_idx].flags);
3014					*target = pd_idx;
3015					target = &sh->ops.target2;
3016					s->uptodate++;
3017				}
3018				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3019					set_bit(R5_Wantcompute,
3020						&sh->dev[qd_idx].flags);
3021					*target = qd_idx;
3022					s->uptodate++;
3023				}
3024			}
3025		}
3026		break;
3027	case check_state_compute_run:
3028		break;
3029	default:
3030		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3031		       __func__, sh->check_state,
3032		       (unsigned long long) sh->sector);
3033		BUG();
3034	}
3035}
3036
3037static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3038{
3039	int i;
3040
3041	/* We have read all the blocks in this stripe and now we need to
3042	 * copy some of them into a target stripe for expand.
3043	 */
3044	struct dma_async_tx_descriptor *tx = NULL;
3045	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3046	for (i = 0; i < sh->disks; i++)
3047		if (i != sh->pd_idx && i != sh->qd_idx) {
3048			int dd_idx, j;
3049			struct stripe_head *sh2;
3050			struct async_submit_ctl submit;
3051
3052			sector_t bn = compute_blocknr(sh, i, 1);
3053			sector_t s = raid5_compute_sector(conf, bn, 0,
3054							  &dd_idx, NULL);
3055			sh2 = get_active_stripe(conf, s, 0, 1, 1);
3056			if (sh2 == NULL)
3057				/* so far only the early blocks of this stripe
3058				 * have been requested.  When later blocks
3059				 * get requested, we will try again
3060				 */
3061				continue;
3062			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
3063			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
3064				/* must have already done this block */
3065				release_stripe(sh2);
3066				continue;
3067			}
3068
3069			/* place all the copies on one channel */
3070			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
3071			tx = async_memcpy(sh2->dev[dd_idx].page,
3072					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
3073					  &submit);
3074
3075			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
3076			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
3077			for (j = 0; j < conf->raid_disks; j++)
3078				if (j != sh2->pd_idx &&
3079				    j != sh2->qd_idx &&
3080				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
3081					break;
3082			if (j == conf->raid_disks) {
3083				set_bit(STRIPE_EXPAND_READY, &sh2->state);
3084				set_bit(STRIPE_HANDLE, &sh2->state);
3085			}
3086			release_stripe(sh2);
3087
3088		}
3089	/* done submitting copies, wait for them to complete */
3090	if (tx) {
3091		async_tx_ack(tx);
3092		dma_wait_for_async_tx(tx);
3093	}
3094}
3095
3096/*
3097 * handle_stripe - do things to a stripe.
3098 *
3099 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
3100 * state of various bits to see what needs to be done.
3101 * Possible results:
3102 *    return some read requests which now have data
3103 *    return some write requests which are safely on storage
3104 *    schedule a read on some buffers
3105 *    schedule a write of some buffers
3106 *    return confirmation of parity correctness
3107 *
3108 */
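/*
 * A rough sketch of the flow implemented below (informal, not exhaustive):
 * analyse_stripe() summarises per-device state into a stripe_head_state;
 * based on those counters the stripe is routed through
 * handle_failed_stripe()/handle_failed_sync(), handle_stripe_clean_event(),
 * handle_stripe_fill(), handle_stripe_dirtying() and
 * handle_parity_checks5()/handle_parity_checks6(); any asynchronous work
 * that was requested is then started by raid_run_ops() and the resulting
 * I/O is submitted by ops_run_io().
 */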
3109
3110static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3111{
3112	struct r5conf *conf = sh->raid_conf;
3113	int disks = sh->disks;
3114	struct r5dev *dev;
3115	int i;
3116	int do_recovery = 0;
3117
3118	memset(s, 0, sizeof(*s));
3119
3120	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3121	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3122	s->failed_num[0] = -1;
3123	s->failed_num[1] = -1;
3124
3125	/* Now to look around and see what can be done */
3126	rcu_read_lock();
3127	spin_lock_irq(&conf->device_lock);
3128	for (i=disks; i--; ) {
3129		struct md_rdev *rdev;
3130		sector_t first_bad;
3131		int bad_sectors;
3132		int is_bad = 0;
3133
3134		dev = &sh->dev[i];
3135
3136		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3137			 i, dev->flags,
3138			 dev->toread, dev->towrite, dev->written);
3139		/* maybe we can reply to a read
3140		 *
3141		 * new wantfill requests are only permitted while
3142		 * ops_complete_biofill is guaranteed to be inactive
3143		 */
3144		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3145		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3146			set_bit(R5_Wantfill, &dev->flags);
3147
3148		/* now count some things */
3149		if (test_bit(R5_LOCKED, &dev->flags))
3150			s->locked++;
3151		if (test_bit(R5_UPTODATE, &dev->flags))
3152			s->uptodate++;
3153		if (test_bit(R5_Wantcompute, &dev->flags)) {
3154			s->compute++;
3155			BUG_ON(s->compute > 2);
3156		}
3157
3158		if (test_bit(R5_Wantfill, &dev->flags))
3159			s->to_fill++;
3160		else if (dev->toread)
3161			s->to_read++;
3162		if (dev->towrite) {
3163			s->to_write++;
3164			if (!test_bit(R5_OVERWRITE, &dev->flags))
3165				s->non_overwrite++;
3166		}
3167		if (dev->written)
3168			s->written++;
3169		/* Prefer to use the replacement for reads, but only
3170		 * if it is recovered enough and has no bad blocks.
3171		 */
3172		rdev = rcu_dereference(conf->disks[i].replacement);
3173		if (rdev && !test_bit(Faulty, &rdev->flags) &&
3174		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3175		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3176				 &first_bad, &bad_sectors))
3177			set_bit(R5_ReadRepl, &dev->flags);
3178		else {
3179			if (rdev)
3180				set_bit(R5_NeedReplace, &dev->flags);
3181			rdev = rcu_dereference(conf->disks[i].rdev);
3182			clear_bit(R5_ReadRepl, &dev->flags);
3183		}
3184		if (rdev && test_bit(Faulty, &rdev->flags))
3185			rdev = NULL;
3186		if (rdev) {
3187			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3188					     &first_bad, &bad_sectors);
3189			if (s->blocked_rdev == NULL
3190			    && (test_bit(Blocked, &rdev->flags)
3191				|| is_bad < 0)) {
3192				if (is_bad < 0)
3193					set_bit(BlockedBadBlocks,
3194						&rdev->flags);
3195				s->blocked_rdev = rdev;
3196				atomic_inc(&rdev->nr_pending);
3197			}
3198		}
3199		clear_bit(R5_Insync, &dev->flags);
3200		if (!rdev)
3201			/* Not in-sync */;
3202		else if (is_bad) {
3203			/* also not in-sync */
3204			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3205				/* treat as in-sync, but with a read error
3206				 * which we can now try to correct
3207				 */
3208				set_bit(R5_Insync, &dev->flags);
3209				set_bit(R5_ReadError, &dev->flags);
3210			}
3211		} else if (test_bit(In_sync, &rdev->flags))
3212			set_bit(R5_Insync, &dev->flags);
3213		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3214			/* in sync if before recovery_offset */
3215			set_bit(R5_Insync, &dev->flags);
3216		else if (test_bit(R5_UPTODATE, &dev->flags) &&
3217			 test_bit(R5_Expanded, &dev->flags))
3218			/* If we've reshaped into here, we assume it is Insync.
3219			 * We will shortly update recovery_offset to make
3220			 * it official.
3221			 */
3222			set_bit(R5_Insync, &dev->flags);
3223
3224		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3225			/* This flag does not apply to '.replacement',
3226			 * only to .rdev, so make sure to check that. */
3227			struct md_rdev *rdev2 = rcu_dereference(
3228				conf->disks[i].rdev);
3229			if (rdev2 == rdev)
3230				clear_bit(R5_Insync, &dev->flags);
3231			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3232				s->handle_bad_blocks = 1;
3233				atomic_inc(&rdev2->nr_pending);
3234			} else
3235				clear_bit(R5_WriteError, &dev->flags);
3236		}
3237		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3238			/* This flag does not apply to '.replacement',
3239			 * only to .rdev, so make sure to check that. */
3240			struct md_rdev *rdev2 = rcu_dereference(
3241				conf->disks[i].rdev);
3242			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3243				s->handle_bad_blocks = 1;
3244				atomic_inc(&rdev2->nr_pending);
3245			} else
3246				clear_bit(R5_MadeGood, &dev->flags);
3247		}
3248		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3249			struct md_rdev *rdev2 = rcu_dereference(
3250				conf->disks[i].replacement);
3251			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3252				s->handle_bad_blocks = 1;
3253				atomic_inc(&rdev2->nr_pending);
3254			} else
3255				clear_bit(R5_MadeGoodRepl, &dev->flags);
3256		}
3257		if (!test_bit(R5_Insync, &dev->flags)) {
3258			/* The ReadError flag will just be confusing now */
3259			clear_bit(R5_ReadError, &dev->flags);
3260			clear_bit(R5_ReWrite, &dev->flags);
3261		}
3262		if (test_bit(R5_ReadError, &dev->flags))
3263			clear_bit(R5_Insync, &dev->flags);
3264		if (!test_bit(R5_Insync, &dev->flags)) {
3265			if (s->failed < 2)
3266				s->failed_num[s->failed] = i;
3267			s->failed++;
3268			if (rdev && !test_bit(Faulty, &rdev->flags))
3269				do_recovery = 1;
3270		}
3271	}
3272	spin_unlock_irq(&conf->device_lock);
3273	if (test_bit(STRIPE_SYNCING, &sh->state)) {
3274		/* If there is a failed device being replaced,
3275		 *     we must be recovering.
3276		 * else if we are after recovery_cp, we must be syncing
3277		 * else we can only be replacing
3278		 * sync and recovery both need to read all devices, and so
3279		 * use the same flag.
3280		 */
3281		if (do_recovery ||
3282		    sh->sector >= conf->mddev->recovery_cp)
3283			s->syncing = 1;
3284		else
3285			s->replacing = 1;
3286	}
3287	rcu_read_unlock();
3288}
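
/*
 * A small worked case (hypothetical): for a healthy RAID5 stripe where one
 * data block has a pending full overwrite queued and no blocks are cached,
 * analyse_stripe() leaves s->to_write == 1, s->non_overwrite == 0,
 * s->uptodate == 0, s->locked == 0 and s->failed == 0, which steers
 * handle_stripe() below towards handle_stripe_dirtying().
 */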
3289
3290static void handle_stripe(struct stripe_head *sh)
3291{
3292	struct stripe_head_state s;
3293	struct r5conf *conf = sh->raid_conf;
3294	int i;
3295	int prexor;
3296	int disks = sh->disks;
3297	struct r5dev *pdev, *qdev;
3298
3299	clear_bit(STRIPE_HANDLE, &sh->state);
3300	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
3301		/* already being handled, ensure it gets handled
3302		 * again when current action finishes */
3303		set_bit(STRIPE_HANDLE, &sh->state);
3304		return;
3305	}
3306
3307	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3308		set_bit(STRIPE_SYNCING, &sh->state);
3309		clear_bit(STRIPE_INSYNC, &sh->state);
3310	}
3311	clear_bit(STRIPE_DELAYED, &sh->state);
3312
3313	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3314		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
3315	       (unsigned long long)sh->sector, sh->state,
3316	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3317	       sh->check_state, sh->reconstruct_state);
3318
3319	analyse_stripe(sh, &s);
3320
3321	if (s.handle_bad_blocks) {
3322		set_bit(STRIPE_HANDLE, &sh->state);
3323		goto finish;
3324	}
3325
3326	if (unlikely(s.blocked_rdev)) {
3327		if (s.syncing || s.expanding || s.expanded ||
3328		    s.replacing || s.to_write || s.written) {
3329			set_bit(STRIPE_HANDLE, &sh->state);
3330			goto finish;
3331		}
3332		/* There is nothing for the blocked_rdev to block */
3333		rdev_dec_pending(s.blocked_rdev, conf->mddev);
3334		s.blocked_rdev = NULL;
3335	}
3336
3337	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3338		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3339		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3340	}
3341
3342	pr_debug("locked=%d uptodate=%d to_read=%d"
3343	       " to_write=%d failed=%d failed_num=%d,%d\n",
3344	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3345	       s.failed_num[0], s.failed_num[1]);
3346	/* check if the array has lost more than max_degraded devices and,
3347	 * if so, some requests might need to be failed.
3348	 */
3349	if (s.failed > conf->max_degraded) {
3350		sh->check_state = 0;
3351		sh->reconstruct_state = 0;
3352		if (s.to_read+s.to_write+s.written)
3353			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3354		if (s.syncing + s.replacing)
3355			handle_failed_sync(conf, sh, &s);
3356	}
3357
3358	/*
3359	 * might be able to return some write requests if the parity blocks
3360	 * are safe, or on a failed drive
3361	 */
3362	pdev = &sh->dev[sh->pd_idx];
3363	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3364		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3365	qdev = &sh->dev[sh->qd_idx];
3366	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3367		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3368		|| conf->level < 6;
3369
3370	if (s.written &&
3371	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3372			     && !test_bit(R5_LOCKED, &pdev->flags)
3373			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3374	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3375			     && !test_bit(R5_LOCKED, &qdev->flags)
3376			     && test_bit(R5_UPTODATE, &qdev->flags)))))
3377		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3378
3379	/* Now we might consider reading some blocks, either to check/generate
3380	 * parity, or to satisfy requests
3381	 * or to load a block that is being partially written.
3382	 */
3383	if (s.to_read || s.non_overwrite
3384	    || (conf->level == 6 && s.to_write && s.failed)
3385	    || (s.syncing && (s.uptodate + s.compute < disks))
3386	    || s.replacing
3387	    || s.expanding)
3388		handle_stripe_fill(sh, &s, disks);
3389
3390	/* Now we check to see if any write operations have recently
3391	 * completed
3392	 */
3393	prexor = 0;
3394	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3395		prexor = 1;
3396	if (sh->reconstruct_state == reconstruct_state_drain_result ||
3397	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3398		sh->reconstruct_state = reconstruct_state_idle;
3399
3400		/* All the 'written' buffers and the parity block are ready to
3401		 * be written back to disk
3402		 */
3403		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3404		BUG_ON(sh->qd_idx >= 0 &&
3405		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3406		for (i = disks; i--; ) {
3407			struct r5dev *dev = &sh->dev[i];
3408			if (test_bit(R5_LOCKED, &dev->flags) &&
3409				(i == sh->pd_idx || i == sh->qd_idx ||
3410				 dev->written)) {
3411				pr_debug("Writing block %d\n", i);
3412				set_bit(R5_Wantwrite, &dev->flags);
3413				if (prexor)
3414					continue;
3415				if (!test_bit(R5_Insync, &dev->flags) ||
3416				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
3417				     s.failed == 0))
3418					set_bit(STRIPE_INSYNC, &sh->state);
3419			}
3420		}
3421		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3422			s.dec_preread_active = 1;
3423	}
3424
3425	/* Now to consider new write requests and what else, if anything
3426	 * should be read.  We do not handle new writes when:
3427	 * 1/ A 'write' operation (copy+xor) is already in flight.
3428	 * 2/ A 'check' operation is in flight, as it may clobber the parity
3429	 *    block.
3430	 */
3431	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3432		handle_stripe_dirtying(conf, sh, &s, disks);
3433
3434	/* maybe we need to check and possibly fix the parity for this stripe
3435	 * Any reads will already have been scheduled, so we just see if enough
3436	 * data is available.  The parity check is held off while parity
3437	 * dependent operations are in flight.
3438	 */
3439	if (sh->check_state ||
3440	    (s.syncing && s.locked == 0 &&
3441	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3442	     !test_bit(STRIPE_INSYNC, &sh->state))) {
3443		if (conf->level == 6)
3444			handle_parity_checks6(conf, sh, &s, disks);
3445		else
3446			handle_parity_checks5(conf, sh, &s, disks);
3447	}
3448
3449	if (s.replacing && s.locked == 0
3450	    && !test_bit(STRIPE_INSYNC, &sh->state)) {
3451		/* Write out to replacement devices where possible */
3452		for (i = 0; i < conf->raid_disks; i++)
3453			if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3454			    test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3455				set_bit(R5_WantReplace, &sh->dev[i].flags);
3456				set_bit(R5_LOCKED, &sh->dev[i].flags);
3457				s.locked++;
3458			}
3459		set_bit(STRIPE_INSYNC, &sh->state);
3460	}
3461	if ((s.syncing || s.replacing) && s.locked == 0 &&
3462	    test_bit(STRIPE_INSYNC, &sh->state)) {
3463		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3464		clear_bit(STRIPE_SYNCING, &sh->state);
3465	}
3466
3467	/* If the failed drives are just a ReadError, then we might need
3468	 * to progress the repair/check process
3469	 */
3470	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3471		for (i = 0; i < s.failed; i++) {
3472			struct r5dev *dev = &sh->dev[s.failed_num[i]];
3473			if (test_bit(R5_ReadError, &dev->flags)
3474			    && !test_bit(R5_LOCKED, &dev->flags)
3475			    && test_bit(R5_UPTODATE, &dev->flags)
3476				) {
3477				if (!test_bit(R5_ReWrite, &dev->flags)) {
3478					set_bit(R5_Wantwrite, &dev->flags);
3479					set_bit(R5_ReWrite, &dev->flags);
3480					set_bit(R5_LOCKED, &dev->flags);
3481					s.locked++;
3482				} else {
3483					/* let's read it back */
3484					set_bit(R5_Wantread, &dev->flags);
3485					set_bit(R5_LOCKED, &dev->flags);
3486					s.locked++;
3487				}
3488			}
3489		}
3490
3491
3492	/* Finish reconstruct operations initiated by the expansion process */
3493	if (sh->reconstruct_state == reconstruct_state_result) {
3494		struct stripe_head *sh_src
3495			= get_active_stripe(conf, sh->sector, 1, 1, 1);
3496		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3497			/* sh cannot be written until sh_src has been read.
3498			 * so arrange for sh to be delayed a little
3499			 */
3500			set_bit(STRIPE_DELAYED, &sh->state);
3501			set_bit(STRIPE_HANDLE, &sh->state);
3502			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3503					      &sh_src->state))
3504				atomic_inc(&conf->preread_active_stripes);
3505			release_stripe(sh_src);
3506			goto finish;
3507		}
3508		if (sh_src)
3509			release_stripe(sh_src);
3510
3511		sh->reconstruct_state = reconstruct_state_idle;
3512		clear_bit(STRIPE_EXPANDING, &sh->state);
3513		for (i = conf->raid_disks; i--; ) {
3514			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3515			set_bit(R5_LOCKED, &sh->dev[i].flags);
3516			s.locked++;
3517		}
3518	}
3519
3520	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3521	    !sh->reconstruct_state) {
3522		/* Need to write out all blocks after computing parity */
3523		sh->disks = conf->raid_disks;
3524		stripe_set_idx(sh->sector, conf, 0, sh);
3525		schedule_reconstruction(sh, &s, 1, 1);
3526	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3527		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3528		atomic_dec(&conf->reshape_stripes);
3529		wake_up(&conf->wait_for_overlap);
3530		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3531	}
3532
3533	if (s.expanding && s.locked == 0 &&
3534	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3535		handle_stripe_expansion(conf, sh);
3536
3537finish:
3538	/* wait for this device to become unblocked */
3539	if (conf->mddev->external && unlikely(s.blocked_rdev))
3540		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3541
3542	if (s.handle_bad_blocks)
3543		for (i = disks; i--; ) {
3544			struct md_rdev *rdev;
3545			struct r5dev *dev = &sh->dev[i];
3546			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3547				/* We own a safe reference to the rdev */
3548				rdev = conf->disks[i].rdev;
3549				if (!rdev_set_badblocks(rdev, sh->sector,
3550							STRIPE_SECTORS, 0))
3551					md_error(conf->mddev, rdev);
3552				rdev_dec_pending(rdev, conf->mddev);
3553			}
3554			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3555				rdev = conf->disks[i].rdev;
3556				rdev_clear_badblocks(rdev, sh->sector,
3557						     STRIPE_SECTORS);
3558				rdev_dec_pending(rdev, conf->mddev);
3559			}
3560			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3561				rdev = conf->disks[i].replacement;
3562				if (!rdev)
3563				/* rdev has been moved down */
3564					rdev = conf->disks[i].rdev;
3565				rdev_clear_badblocks(rdev, sh->sector,
3566						     STRIPE_SECTORS);
3567				rdev_dec_pending(rdev, conf->mddev);
3568			}
3569		}
3570
3571	if (s.ops_request)
3572		raid_run_ops(sh, s.ops_request);
3573
3574	ops_run_io(sh, &s);
3575
3576	if (s.dec_preread_active) {
3577		/* We delay this until after ops_run_io so that if make_request
3578		 * is waiting on a flush, it won't continue until the writes
3579		 * have actually been submitted.
3580		 */
3581		atomic_dec(&conf->preread_active_stripes);
3582		if (atomic_read(&conf->preread_active_stripes) <
3583		    IO_THRESHOLD)
3584			md_wakeup_thread(conf->mddev->thread);
3585	}
3586
3587	return_io(s.return_bi);
3588
3589	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3590}
3591
3592static void raid5_activate_delayed(struct r5conf *conf)
3593{
3594	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3595		while (!list_empty(&conf->delayed_list)) {
3596			struct list_head *l = conf->delayed_list.next;
3597			struct stripe_head *sh;
3598			sh = list_entry(l, struct stripe_head, lru);
3599			list_del_init(l);
3600			clear_bit(STRIPE_DELAYED, &sh->state);
3601			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3602				atomic_inc(&conf->preread_active_stripes);
3603			list_add_tail(&sh->lru, &conf->hold_list);
3604		}
3605	}
3606}
3607
3608static void activate_bit_delay(struct r5conf *conf)
3609{
3610	/* device_lock is held */
3611	struct list_head head;
3612	list_add(&head, &conf->bitmap_list);
3613	list_del_init(&conf->bitmap_list);
3614	while (!list_empty(&head)) {
3615		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3616		list_del_init(&sh->lru);
3617		atomic_inc(&sh->count);
3618		__release_stripe(conf, sh);
3619	}
3620}
3621
3622int md_raid5_congested(struct mddev *mddev, int bits)
3623{
3624	struct r5conf *conf = mddev->private;
3625
3626	/* No difference between reads and writes.  Just check
3627	 * how busy the stripe_cache is
3628	 */
3629
3630	if (conf->inactive_blocked)
3631		return 1;
3632	if (conf->quiesce)
3633		return 1;
3634	if (list_empty_careful(&conf->inactive_list))
3635		return 1;
3636
3637	return 0;
3638}
3639EXPORT_SYMBOL_GPL(md_raid5_congested);
3640
3641static int raid5_congested(void *data, int bits)
3642{
3643	struct mddev *mddev = data;
3644
3645	return mddev_congested(mddev, bits) ||
3646		md_raid5_congested(mddev, bits);
3647}
3648
3649/* We want read requests to align with chunks where possible,
3650 * but write requests don't need to.
3651 */
3652static int raid5_mergeable_bvec(struct request_queue *q,
3653				struct bvec_merge_data *bvm,
3654				struct bio_vec *biovec)
3655{
3656	struct mddev *mddev = q->queuedata;
3657	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3658	int max;
3659	unsigned int chunk_sectors = mddev->chunk_sectors;
3660	unsigned int bio_sectors = bvm->bi_size >> 9;
3661
3662	if ((bvm->bi_rw & 1) == WRITE)
3663		return biovec->bv_len; /* always allow writes to be mergeable */
3664
3665	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3666		chunk_sectors = mddev->new_chunk_sectors;
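	/*
	 * Worked example (illustrative numbers only): with 64KiB chunks
	 * chunk_sectors is 128.  If the bio currently ends 124 sectors into
	 * a chunk (offset 120 plus bio_sectors 4), then
	 * max = (128 - 124) << 9 = 2048, so at most 2048 more bytes may be
	 * merged before the read would cross a chunk boundary.  An empty bio
	 * (bio_sectors == 0) is always allowed its first bvec.
	 */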
3667	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3668	if (max < 0) max = 0;
3669	if (max <= biovec->bv_len && bio_sectors == 0)
3670		return biovec->bv_len;
3671	else
3672		return max;
3673}
3674
3675
3676static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3677{
3678	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3679	unsigned int chunk_sectors = mddev->chunk_sectors;
3680	unsigned int bio_sectors = bio->bi_size >> 9;
3681
3682	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3683		chunk_sectors = mddev->new_chunk_sectors;
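	/*
	 * Illustrative example: with chunk_sectors = 128, a bio starting
	 * 8 sectors into a chunk may span at most 120 sectors and still pass
	 * this test; 121 sectors would cross into the next chunk and the
	 * read could no longer be submitted as a single aligned request.
	 */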
3684	return  chunk_sectors >=
3685		((sector & (chunk_sectors - 1)) + bio_sectors);
3686}
3687
3688/*
3689 *  add the bio to the retry LIFO (O(1) - we may be in interrupt context);
3690 *  it is later sampled by raid5d.
3691 */
3692static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3693{
3694	unsigned long flags;
3695
3696	spin_lock_irqsave(&conf->device_lock, flags);
3697
3698	bi->bi_next = conf->retry_read_aligned_list;
3699	conf->retry_read_aligned_list = bi;
3700
3701	spin_unlock_irqrestore(&conf->device_lock, flags);
3702	md_wakeup_thread(conf->mddev->thread);
3703}
3704
3705
3706static struct bio *remove_bio_from_retry(struct r5conf *conf)
3707{
3708	struct bio *bi;
3709
3710	bi = conf->retry_read_aligned;
3711	if (bi) {
3712		conf->retry_read_aligned = NULL;
3713		return bi;
3714	}
3715	bi = conf->retry_read_aligned_list;
3716	if (bi) {
3717		conf->retry_read_aligned_list = bi->bi_next;
3718		bi->bi_next = NULL;
3719		/*
3720		 * this sets the active strip count to 1 and the processed
3721		 * strip count to zero (upper 8 bits)
3722		 */
3723		bi->bi_phys_segments = 1; /* biased count of active stripes */
3724	}
3725
3726	return bi;
3727}
3728
3729
3730/*
3731 *  The "raid5_align_endio" should check if the read succeeded and if it
3732 *  did, call bio_endio on the original bio (having bio_put the new bio
3733 *  first).
3734 *  If the read failed, hand the original bio to add_bio_to_retry() for a retry.
3735 */
3736static void raid5_align_endio(struct bio *bi, int error)
3737{
3738	struct bio* raid_bi  = bi->bi_private;
3739	struct mddev *mddev;
3740	struct r5conf *conf;
3741	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3742	struct md_rdev *rdev;
3743
3744	bio_put(bi);
3745
3746	rdev = (void*)raid_bi->bi_next;
3747	raid_bi->bi_next = NULL;
3748	mddev = rdev->mddev;
3749	conf = mddev->private;
3750
3751	rdev_dec_pending(rdev, conf->mddev);
3752
3753	if (!error && uptodate) {
3754		bio_endio(raid_bi, 0);
3755		if (atomic_dec_and_test(&conf->active_aligned_reads))
3756			wake_up(&conf->wait_for_stripe);
3757		return;
3758	}
3759
3760
3761	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3762
3763	add_bio_to_retry(raid_bi, conf);
3764}
3765
3766static int bio_fits_rdev(struct bio *bi)
3767{
3768	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3769
3770	if ((bi->bi_size>>9) > queue_max_sectors(q))
3771		return 0;
3772	blk_recount_segments(q, bi);
3773	if (bi->bi_phys_segments > queue_max_segments(q))
3774		return 0;
3775
3776	if (q->merge_bvec_fn)
3777		/* it's too hard to apply the merge_bvec_fn at this stage,
3778		 * so just give up
3779		 */
3780		return 0;
3781
3782	return 1;
3783}
3784
3785
3786static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3787{
3788	struct r5conf *conf = mddev->private;
3789	int dd_idx;
3790	struct bio* align_bi;
3791	struct md_rdev *rdev;
3792	sector_t end_sector;
3793
3794	if (!in_chunk_boundary(mddev, raid_bio)) {
3795		pr_debug("chunk_aligned_read : non aligned\n");
3796		return 0;
3797	}
3798	/*
3799	 * use bio_clone_mddev to make a copy of the bio
3800	 */
3801	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3802	if (!align_bi)
3803		return 0;
3804	/*
3805	 *   set bi_end_io to a new function, and set bi_private to the
3806	 *     original bio.
3807	 */
3808	align_bi->bi_end_io  = raid5_align_endio;
3809	align_bi->bi_private = raid_bio;
3810	/*
3811	 *	compute position
3812	 */
3813	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
3814						    0,
3815						    &dd_idx, NULL);
3816
3817	end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3818	rcu_read_lock();
3819	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3820	if (!rdev || test_bit(Faulty, &rdev->flags) ||
3821	    rdev->recovery_offset < end_sector) {
3822		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3823		if (rdev &&
3824		    (test_bit(Faulty, &rdev->flags) ||
3825		    !(test_bit(In_sync, &rdev->flags) ||
3826		      rdev->recovery_offset >= end_sector)))
3827			rdev = NULL;
3828	}
3829	if (rdev) {
3830		sector_t first_bad;
3831		int bad_sectors;
3832
3833		atomic_inc(&rdev->nr_pending);
3834		rcu_read_unlock();
3835		raid_bio->bi_next = (void*)rdev;
3836		align_bi->bi_bdev =  rdev->bdev;
3837		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3838		align_bi->bi_sector += rdev->data_offset;
3839
3840		if (!bio_fits_rdev(align_bi) ||
3841		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3842				&first_bad, &bad_sectors)) {
3843			/* too big in some way, or has a known bad block */
3844			bio_put(align_bi);
3845			rdev_dec_pending(rdev, mddev);
3846			return 0;
3847		}
3848
3849		spin_lock_irq(&conf->device_lock);
3850		wait_event_lock_irq(conf->wait_for_stripe,
3851				    conf->quiesce == 0,
3852				    conf->device_lock, /* nothing */);
3853		atomic_inc(&conf->active_aligned_reads);
3854		spin_unlock_irq(&conf->device_lock);
3855
3856		generic_make_request(align_bi);
3857		return 1;
3858	} else {
3859		rcu_read_unlock();
3860		bio_put(align_bi);
3861		return 0;
3862	}
3863}
3864
3865/* __get_priority_stripe - get the next stripe to process
3866 *
3867 * Full stripe writes are allowed to pass preread active stripes up until
3868 * the bypass_threshold is exceeded.  In general the bypass_count
3869 * increments when the handle_list is handled before the hold_list; however, it
3870 * will not be incremented when STRIPE_IO_STARTED is sampled set, signifying a
3871 * stripe with in-flight i/o.  The bypass_count will be reset when the
3872 * head of the hold_list has changed, i.e. the head was promoted to the
3873 * handle_list.
3874 */
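/*
 * Illustrative walk-through (the threshold value is an assumption, not a
 * requirement): with bypass_threshold == 1, taking two handle_list stripes
 * that do not yet have STRIPE_IO_STARTED set, while the same stripe stays
 * at the head of the hold_list, pushes bypass_count to 2.  Once the
 * handle_list drains, 2 > 1 lets the held stripe be promoted, and
 * bypass_count is charged back down by the threshold.
 */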
3875static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
3876{
3877	struct stripe_head *sh;
3878
3879	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3880		  __func__,
3881		  list_empty(&conf->handle_list) ? "empty" : "busy",
3882		  list_empty(&conf->hold_list) ? "empty" : "busy",
3883		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
3884
3885	if (!list_empty(&conf->handle_list)) {
3886		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3887
3888		if (list_empty(&conf->hold_list))
3889			conf->bypass_count = 0;
3890		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3891			if (conf->hold_list.next == conf->last_hold)
3892				conf->bypass_count++;
3893			else {
3894				conf->last_hold = conf->hold_list.next;
3895				conf->bypass_count -= conf->bypass_threshold;
3896				if (conf->bypass_count < 0)
3897					conf->bypass_count = 0;
3898			}
3899		}
3900	} else if (!list_empty(&conf->hold_list) &&
3901		   ((conf->bypass_threshold &&
3902		     conf->bypass_count > conf->bypass_threshold) ||
3903		    atomic_read(&conf->pending_full_writes) == 0)) {
3904		sh = list_entry(conf->hold_list.next,
3905				typeof(*sh), lru);
3906		conf->bypass_count -= conf->bypass_threshold;
3907		if (conf->bypass_count < 0)
3908			conf->bypass_count = 0;
3909	} else
3910		return NULL;
3911
3912	list_del_init(&sh->lru);
3913	atomic_inc(&sh->count);
3914	BUG_ON(atomic_read(&sh->count) != 1);
3915	return sh;
3916}
3917
3918static void make_request(struct mddev *mddev, struct bio * bi)
3919{
3920	struct r5conf *conf = mddev->private;
3921	int dd_idx;
3922	sector_t new_sector;
3923	sector_t logical_sector, last_sector;
3924	struct stripe_head *sh;
3925	const int rw = bio_data_dir(bi);
3926	int remaining;
3927	int plugged;
3928
3929	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3930		md_flush_request(mddev, bi);
3931		return;
3932	}
3933
3934	md_write_start(mddev, bi);
3935
3936	if (rw == READ &&
3937	     mddev->reshape_position == MaxSector &&
3938	     chunk_aligned_read(mddev,bi))
3939		return;
3940
3941	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3942	last_sector = bi->bi_sector + (bi->bi_size>>9);
3943	bi->bi_next = NULL;
3944	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
3945
3946	plugged = mddev_check_plugged(mddev);
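	/*
	 * Illustrative example (assuming the usual one-page stripe unit of
	 * 8 sectors with 4KiB pages): a write starting at sector 1003 is
	 * walked stripe by stripe from sector 1000 (1003 & ~7), and each
	 * loop iteration below attaches the bio to one stripe_head.
	 */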
3947	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3948		DEFINE_WAIT(w);
3949		int disks, data_disks;
3950		int previous;
3951
3952	retry:
3953		previous = 0;
3954		disks = conf->raid_disks;
3955		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3956		if (unlikely(conf->reshape_progress != MaxSector)) {
3957			/* spinlock is needed as reshape_progress may be
3958			 * 64bit on a 32bit platform, and so it might be
3959			 * possible to see a half-updated value.
3960			 * Of course reshape_progress could change after
3961			 * the lock is dropped, so once we get a reference
3962			 * to the stripe we think we want, we will have
3963			 * to check again.
3964			 */
3965			spin_lock_irq(&conf->device_lock);
3966			if (mddev->delta_disks < 0
3967			    ? logical_sector < conf->reshape_progress
3968			    : logical_sector >= conf->reshape_progress) {
3969				disks = conf->previous_raid_disks;
3970				previous = 1;
3971			} else {
3972				if (mddev->delta_disks < 0
3973				    ? logical_sector < conf->reshape_safe
3974				    : logical_sector >= conf->reshape_safe) {
3975					spin_unlock_irq(&conf->device_lock);
3976					schedule();
3977					goto retry;
3978				}
3979			}
3980			spin_unlock_irq(&conf->device_lock);
3981		}
3982		data_disks = disks - conf->max_degraded;
3983
3984		new_sector = raid5_compute_sector(conf, logical_sector,
3985						  previous,
3986						  &dd_idx, NULL);
3987		pr_debug("raid456: make_request, sector %llu logical %llu\n",
3988			(unsigned long long)new_sector,
3989			(unsigned long long)logical_sector);
3990
3991		sh = get_active_stripe(conf, new_sector, previous,
3992				       (bi->bi_rw&RWA_MASK), 0);
3993		if (sh) {
3994			if (unlikely(previous)) {
3995				/* expansion might have moved on while waiting for a
3996				 * stripe, so we must do the range check again.
3997				 * Expansion could still move past after this
3998				 * test, but as we are holding a reference to
3999				 * 'sh', we know that if that happens,
4000				 *  STRIPE_EXPANDING will get set and the expansion
4001				 * won't proceed until we finish with the stripe.
4002				 */
4003				int must_retry = 0;
4004				spin_lock_irq(&conf->device_lock);
4005				if (mddev->delta_disks < 0
4006				    ? logical_sector >= conf->reshape_progress
4007				    : logical_sector < conf->reshape_progress)
4008					/* mismatch, need to try again */
4009					must_retry = 1;
4010				spin_unlock_irq(&conf->device_lock);
4011				if (must_retry) {
4012					release_stripe(sh);
4013					schedule();
4014					goto retry;
4015				}
4016			}
4017
4018			if (rw == WRITE &&
4019			    logical_sector >= mddev->suspend_lo &&
4020			    logical_sector < mddev->suspend_hi) {
4021				release_stripe(sh);
4022				/* As the suspend_* range is controlled by
4023				 * userspace, we want an interruptible
4024				 * wait.
4025				 */
4026				flush_signals(current);
4027				prepare_to_wait(&conf->wait_for_overlap,
4028						&w, TASK_INTERRUPTIBLE);
4029				if (logical_sector >= mddev->suspend_lo &&
4030				    logical_sector < mddev->suspend_hi)
4031					schedule();
4032				goto retry;
4033			}
4034
4035			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4036			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
4037				/* Stripe is busy expanding or
4038				 * add failed due to overlap.  Flush everything
4039				 * and wait a while
4040				 */
4041				md_wakeup_thread(mddev->thread);
4042				release_stripe(sh);
4043				schedule();
4044				goto retry;
4045			}
4046			finish_wait(&conf->wait_for_overlap, &w);
4047			set_bit(STRIPE_HANDLE, &sh->state);
4048			clear_bit(STRIPE_DELAYED, &sh->state);
4049			if ((bi->bi_rw & REQ_SYNC) &&
4050			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4051				atomic_inc(&conf->preread_active_stripes);
4052			release_stripe(sh);
4053		} else {
4054			/* cannot get stripe for read-ahead, just give up */
4055			clear_bit(BIO_UPTODATE, &bi->bi_flags);
4056			finish_wait(&conf->wait_for_overlap, &w);
4057			break;
4058		}
4059
4060	}
4061	if (!plugged)
4062		md_wakeup_thread(mddev->thread);
4063
4064	spin_lock_irq(&conf->device_lock);
4065	remaining = raid5_dec_bi_phys_segments(bi);
4066	spin_unlock_irq(&conf->device_lock);
4067	if (remaining == 0) {
4068
4069		if (rw == WRITE)
4070			md_write_end(mddev);
4071
4072		bio_endio(bi, 0);
4073	}
4074}
4075
4076static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
4077
4078static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
4079{
4080	/* reshaping is quite different to recovery/resync so it is
4081	 * handled quite separately ... here.
4082	 *
4083	 * On each call to sync_request, we gather one chunk worth of
4084	 * destination stripes and flag them as expanding.
4085	 * Then we find all the source stripes and request reads.
4086	 * As the reads complete, handle_stripe will copy the data
4087	 * into the destination stripe and release that stripe.
4088	 */
4089	struct r5conf *conf = mddev->private;
4090	struct stripe_head *sh;
4091	sector_t first_sector, last_sector;
4092	int raid_disks = conf->previous_raid_disks;
4093	int data_disks = raid_disks - conf->max_degraded;
4094	int new_data_disks = conf->raid_disks - conf->max_degraded;
4095	int i;
4096	int dd_idx;
4097	sector_t writepos, readpos, safepos;
4098	sector_t stripe_addr;
4099	int reshape_sectors;
4100	struct list_head stripes;
4101
4102	if (sector_nr == 0) {
4103		/* If restarting in the middle, skip the initial sectors */
4104		if (mddev->delta_disks < 0 &&
4105		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4106			sector_nr = raid5_size(mddev, 0, 0)
4107				- conf->reshape_progress;
4108		} else if (mddev->delta_disks >= 0 &&
4109			   conf->reshape_progress > 0)
4110			sector_nr = conf->reshape_progress;
4111		sector_div(sector_nr, new_data_disks);
4112		if (sector_nr) {
4113			mddev->curr_resync_completed = sector_nr;
4114			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4115			*skipped = 1;
4116			return sector_nr;
4117		}
4118	}
4119
4120	/* We need to process a full chunk at a time.
4121	 * If old and new chunk sizes differ, we need to process the
4122	 * largest of these
4123	 */
4124	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4125		reshape_sectors = mddev->new_chunk_sectors;
4126	else
4127		reshape_sectors = mddev->chunk_sectors;
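	/*
	 * Illustrative example: reshaping from 256KiB to 512KiB chunks
	 * (512 to 1024 sectors) gives reshape_sectors = 1024, so each pass
	 * covers a whole chunk in both the old and the new geometry.
	 */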
4128
4129	/* we update the metadata at least every 10 seconds (see the
4130	 * time_after() check below - the old "more than 3Meg" rule
4131	 * was rather arbitrary), or earlier when the data about to be
4132	 * copied would over-write the source of the data at
4133	 * the front of the range,
4134	 * i.e. one new_stripe along from reshape_progress new-maps
4135	 * to after where reshape_safe old-maps to.
4136	 */
4137	writepos = conf->reshape_progress;
4138	sector_div(writepos, new_data_disks);
4139	readpos = conf->reshape_progress;
4140	sector_div(readpos, data_disks);
4141	safepos = conf->reshape_safe;
4142	sector_div(safepos, data_disks);
4143	if (mddev->delta_disks < 0) {
4144		writepos -= min_t(sector_t, reshape_sectors, writepos);
4145		readpos += reshape_sectors;
4146		safepos += reshape_sectors;
4147	} else {
4148		writepos += reshape_sectors;
4149		readpos -= min_t(sector_t, reshape_sectors, readpos);
4150		safepos -= min_t(sector_t, reshape_sectors, safepos);
4151	}
4152
4153	/* 'writepos' is the most advanced device address we might write.
4154	 * 'readpos' is the least advanced device address we might read.
4155	 * 'safepos' is the least address recorded in the metadata as having
4156	 *     been reshaped.
4157	 * If 'readpos' is behind 'writepos', then there is no way that we can
4158	 * ensure safety in the face of a crash - that must be done by userspace
4159	 * making a backup of the data.  So in that case there is no particular
4160	 * rush to update metadata.
4161	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
4162	 * update the metadata to advance 'safepos' to match 'readpos' so that
4163	 * we can be safe in the event of a crash.
4164	 * So we insist on updating metadata if safepos is behind writepos and
4165	 * readpos is beyond writepos.
4166	 * In any case, update the metadata every 10 seconds.
4167	 * Maybe that number should be configurable, but I'm not sure it is
4168	 * worth it.... maybe it could be a multiple of safemode_delay???
4169	 */
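	/*
	 * Illustrative numbers for a growing array: with new_data_disks = 4,
	 * data_disks = 3 and reshape_sectors = 1024, reshape_progress = 12288
	 * gives writepos = 12288/4 + 1024 = 4096 and readpos = 12288/3 - 1024
	 * = 3072, while reshape_safe = 6144 gives safepos = 6144/3 - 1024 =
	 * 1024.  Here safepos < writepos but readpos < writepos, so the test
	 * below does not force a metadata update.
	 */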
4170	if ((mddev->delta_disks < 0
4171	     ? (safepos > writepos && readpos < writepos)
4172	     : (safepos < writepos && readpos > writepos)) ||
4173	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4174		/* Cannot proceed until we've updated the superblock... */
4175		wait_event(conf->wait_for_overlap,
4176			   atomic_read(&conf->reshape_stripes)==0);
4177		mddev->reshape_position = conf->reshape_progress;
4178		mddev->curr_resync_completed = sector_nr;
4179		conf->reshape_checkpoint = jiffies;
4180		set_bit(MD_CHANGE_DEVS, &mddev->flags);
4181		md_wakeup_thread(mddev->thread);
4182		wait_event(mddev->sb_wait, mddev->flags == 0 ||
4183			   kthread_should_stop());
4184		spin_lock_irq(&conf->device_lock);
4185		conf->reshape_safe = mddev->reshape_position;
4186		spin_unlock_irq(&conf->device_lock);
4187		wake_up(&conf->wait_for_overlap);
4188		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4189	}
4190
4191	if (mddev->delta_disks < 0) {
4192		BUG_ON(conf->reshape_progress == 0);
4193		stripe_addr = writepos;
4194		BUG_ON((mddev->dev_sectors &
4195			~((sector_t)reshape_sectors - 1))
4196		       - reshape_sectors - stripe_addr
4197		       != sector_nr);
4198	} else {
4199		BUG_ON(writepos != sector_nr + reshape_sectors);
4200		stripe_addr = sector_nr;
4201	}
4202	INIT_LIST_HEAD(&stripes);
4203	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4204		int j;
4205		int skipped_disk = 0;
4206		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4207		set_bit(STRIPE_EXPANDING, &sh->state);
4208		atomic_inc(&conf->reshape_stripes);
4209		/* If any of this stripe is beyond the end of the old
4210		 * array, then we need to zero those blocks
4211		 */
4212		for (j=sh->disks; j--;) {
4213			sector_t s;
4214			if (j == sh->pd_idx)
4215				continue;
4216			if (conf->level == 6 &&
4217			    j == sh->qd_idx)
4218				continue;
4219			s = compute_blocknr(sh, j, 0);
4220			if (s < raid5_size(mddev, 0, 0)) {
4221				skipped_disk = 1;
4222				continue;
4223			}
4224			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4225			set_bit(R5_Expanded, &sh->dev[j].flags);
4226			set_bit(R5_UPTODATE, &sh->dev[j].flags);
4227		}
4228		if (!skipped_disk) {
4229			set_bit(STRIPE_EXPAND_READY, &sh->state);
4230			set_bit(STRIPE_HANDLE, &sh->state);
4231		}
4232		list_add(&sh->lru, &stripes);
4233	}
4234	spin_lock_irq(&conf->device_lock);
4235	if (mddev->delta_disks < 0)
4236		conf->reshape_progress -= reshape_sectors * new_data_disks;
4237	else
4238		conf->reshape_progress += reshape_sectors * new_data_disks;
4239	spin_unlock_irq(&conf->device_lock);
4240	/* Ok, those stripes are ready. We can start scheduling
4241	 * reads on the source stripes.
4242	 * The source stripes are determined by mapping the first and last
4243	 * block on the destination stripes.
4244	 */
4245	first_sector =
4246		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4247				     1, &dd_idx, NULL);
4248	last_sector =
4249		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4250					    * new_data_disks - 1),
4251				     1, &dd_idx, NULL);
4252	if (last_sector >= mddev->dev_sectors)
4253		last_sector = mddev->dev_sectors - 1;
4254	while (first_sector <= last_sector) {
4255		sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4256		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4257		set_bit(STRIPE_HANDLE, &sh->state);
4258		release_stripe(sh);
4259		first_sector += STRIPE_SECTORS;
4260	}
4261	/* Now that the sources are clearly marked, we can release
4262	 * the destination stripes
4263	 */
4264	while (!list_empty(&stripes)) {
4265		sh = list_entry(stripes.next, struct stripe_head, lru);
4266		list_del_init(&sh->lru);
4267		release_stripe(sh);
4268	}
4269	/* If this takes us to the resync_max point where we have to pause,
4270	 * then we need to write out the superblock.
4271	 */
4272	sector_nr += reshape_sectors;
4273	if ((sector_nr - mddev->curr_resync_completed) * 2
4274	    >= mddev->resync_max - mddev->curr_resync_completed) {
4275		/* Cannot proceed until we've updated the superblock... */
4276		wait_event(conf->wait_for_overlap,
4277			   atomic_read(&conf->reshape_stripes) == 0);
4278		mddev->reshape_position = conf->reshape_progress;
4279		mddev->curr_resync_completed = sector_nr;
4280		conf->reshape_checkpoint = jiffies;
4281		set_bit(MD_CHANGE_DEVS, &mddev->flags);
4282		md_wakeup_thread(mddev->thread);
4283		wait_event(mddev->sb_wait,
4284			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4285			   || kthread_should_stop());
4286		spin_lock_irq(&conf->device_lock);
4287		conf->reshape_safe = mddev->reshape_position;
4288		spin_unlock_irq(&conf->device_lock);
4289		wake_up(&conf->wait_for_overlap);
4290		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4291	}
4292	return reshape_sectors;
4293}
4294
4295/* FIXME go_faster isn't used */
4296static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
4297{
4298	struct r5conf *conf = mddev->private;
4299	struct stripe_head *sh;
4300	sector_t max_sector = mddev->dev_sectors;
4301	sector_t sync_blocks;
4302	int still_degraded = 0;
4303	int i;
4304
4305	if (sector_nr >= max_sector) {
4306		/* just being told to finish up .. nothing much to do */
4307
4308		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4309			end_reshape(conf);
4310			return 0;
4311		}
4312
4313		if (mddev->curr_resync < max_sector) /* aborted */
4314			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4315					&sync_blocks, 1);
4316		else /* completed sync */
4317			conf->fullsync = 0;
4318		bitmap_close_sync(mddev->bitmap);
4319
4320		return 0;
4321	}
4322
4323	/* Allow raid5_quiesce to complete */
4324	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4325
4326	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4327		return reshape_request(mddev, sector_nr, skipped);
4328
4329	/* No need to check resync_max as we never do more than one
4330	 * stripe, and as resync_max will always be on a chunk boundary,
4331	 * if the check in md_do_sync didn't fire, there is no chance
4332	 * of overstepping resync_max here
4333	 */
4334
4335	/* if there are too many failed drives and we are trying
4336	 * to resync, then assert that we are finished, because there is
4337	 * nothing we can do.
4338	 */
4339	if (mddev->degraded >= conf->max_degraded &&
4340	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4341		sector_t rv = mddev->dev_sectors - sector_nr;
4342		*skipped = 1;
4343		return rv;
4344	}
4345	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4346	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4347	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4348		/* we can skip this block, and probably more */
4349		sync_blocks /= STRIPE_SECTORS;
4350		*skipped = 1;
4351		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4352	}
4353
4354	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4355
4356	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4357	if (sh == NULL) {
4358		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4359		/* make sure we don't swamp the stripe cache if someone else
4360		 * is trying to get access
4361		 */
4362		schedule_timeout_uninterruptible(1);
4363	}
4364	/* Need to check if array will still be degraded after recovery/resync
4365	 * We don't need to check the 'failed' flag as when that gets set,
4366	 * recovery aborts.
4367	 */
4368	for (i = 0; i < conf->raid_disks; i++)
4369		if (conf->disks[i].rdev == NULL)
4370			still_degraded = 1;
4371
4372	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4373
4374	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4375
4376	handle_stripe(sh);
4377	release_stripe(sh);
4378
4379	return STRIPE_SECTORS;
4380}
4381
4382static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4383{
4384	/* We may not be able to submit a whole bio at once as there
4385	 * may not be enough stripe_heads available.
4386	 * We cannot pre-allocate enough stripe_heads as we may need
4387	 * more than exist in the cache (if we allow ever larger chunks).
4388	 * So we do one stripe head at a time and record in
4389	 * ->bi_hw_segments how many have been done.
4390	 *
4391	 * We *know* that this entire raid_bio is in one chunk, so
4392	 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
4393	 */
4394	struct stripe_head *sh;
4395	int dd_idx;
4396	sector_t sector, logical_sector, last_sector;
4397	int scnt = 0;
4398	int remaining;
4399	int handled = 0;
4400
4401	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4402	sector = raid5_compute_sector(conf, logical_sector,
4403				      0, &dd_idx, NULL);
4404	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4405
4406	for (; logical_sector < last_sector;
4407	     logical_sector += STRIPE_SECTORS,
4408		     sector += STRIPE_SECTORS,
4409		     scnt++) {
4410
4411		if (scnt < raid5_bi_hw_segments(raid_bio))
4412			/* already done this stripe */
4413			continue;
4414
4415		sh = get_active_stripe(conf, sector, 0, 1, 0);
4416
4417		if (!sh) {
4418			/* failed to get a stripe - must wait */
4419			raid5_set_bi_hw_segments(raid_bio, scnt);
4420			conf->retry_read_aligned = raid_bio;
4421			return handled;
4422		}
4423
4424		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4425			release_stripe(sh);
4426			raid5_set_bi_hw_segments(raid_bio, scnt);
4427			conf->retry_read_aligned = raid_bio;
4428			return handled;
4429		}
4430
4431		handle_stripe(sh);
4432		release_stripe(sh);
4433		handled++;
4434	}
4435	spin_lock_irq(&conf->device_lock);
4436	remaining = raid5_dec_bi_phys_segments(raid_bio);
4437	spin_unlock_irq(&conf->device_lock);
4438	if (remaining == 0)
4439		bio_endio(raid_bio, 0);
4440	if (atomic_dec_and_test(&conf->active_aligned_reads))
4441		wake_up(&conf->wait_for_stripe);
4442	return handled;
4443}
4444
4445
4446/*
4447 * This is our raid5 kernel thread.
4448 *
4449 * We scan the hash table for stripes which can be handled now.
4450 * During the scan, completed stripes are saved for us by the interrupt
4451 * handler, so that they will not have to wait for our next wakeup.
4452 */
4453static void raid5d(struct mddev *mddev)
4454{
4455	struct stripe_head *sh;
4456	struct r5conf *conf = mddev->private;
4457	int handled;
4458	struct blk_plug plug;
4459
4460	pr_debug("+++ raid5d active\n");
4461
4462	md_check_recovery(mddev);
4463
4464	blk_start_plug(&plug);
4465	handled = 0;
4466	spin_lock_irq(&conf->device_lock);
4467	while (1) {
4468		struct bio *bio;
4469
4470		if (atomic_read(&mddev->plug_cnt) == 0 &&
4471		    !list_empty(&conf->bitmap_list)) {
4472			/* Now is a good time to flush some bitmap updates */
4473			conf->seq_flush++;
4474			spin_unlock_irq(&conf->device_lock);
4475			bitmap_unplug(mddev->bitmap);
4476			spin_lock_irq(&conf->device_lock);
4477			conf->seq_write = conf->seq_flush;
4478			activate_bit_delay(conf);
4479		}
4480		if (atomic_read(&mddev->plug_cnt) == 0)
4481			raid5_activate_delayed(conf);
4482
4483		while ((bio = remove_bio_from_retry(conf))) {
4484			int ok;
4485			spin_unlock_irq(&conf->device_lock);
4486			ok = retry_aligned_read(conf, bio);
4487			spin_lock_irq(&conf->device_lock);
4488			if (!ok)
4489				break;
4490			handled++;
4491		}
4492
4493		sh = __get_priority_stripe(conf);
4494
4495		if (!sh)
4496			break;
4497		spin_unlock_irq(&conf->device_lock);
4498
4499		handled++;
4500		handle_stripe(sh);
4501		release_stripe(sh);
4502		cond_resched();
4503
4504		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4505			md_check_recovery(mddev);
4506
4507		spin_lock_irq(&conf->device_lock);
4508	}
4509	pr_debug("%d stripes handled\n", handled);
4510
4511	spin_unlock_irq(&conf->device_lock);
4512
4513	async_tx_issue_pending_all();
4514	blk_finish_plug(&plug);
4515
4516	pr_debug("--- raid5d inactive\n");
4517}
4518
4519static ssize_t
4520raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
4521{
4522	struct r5conf *conf = mddev->private;
4523	if (conf)
4524		return sprintf(page, "%d\n", conf->max_nr_stripes);
4525	else
4526		return 0;
4527}
4528
4529int
4530raid5_set_cache_size(struct mddev *mddev, int size)
4531{
4532	struct r5conf *conf = mddev->private;
4533	int err;
4534
4535	if (size <= 16 || size > 32768)
4536		return -EINVAL;
4537	while (size < conf->max_nr_stripes) {
4538		if (drop_one_stripe(conf))
4539			conf->max_nr_stripes--;
4540		else
4541			break;
4542	}
4543	err = md_allow_write(mddev);
4544	if (err)
4545		return err;
4546	while (size > conf->max_nr_stripes) {
4547		if (grow_one_stripe(conf))
4548			conf->max_nr_stripes++;
4549		else break;
4550	}
4551	return 0;
4552}
4553EXPORT_SYMBOL(raid5_set_cache_size);
4554
4555static ssize_t
4556raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
4557{
4558	struct r5conf *conf = mddev->private;
4559	unsigned long new;
4560	int err;
4561
4562	if (len >= PAGE_SIZE)
4563		return -EINVAL;
4564	if (!conf)
4565		return -ENODEV;
4566
4567	if (strict_strtoul(page, 10, &new))
4568		return -EINVAL;
4569	err = raid5_set_cache_size(mddev, new);
4570	if (err)
4571		return err;
4572	return len;
4573}
4574
4575static struct md_sysfs_entry
4576raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4577				raid5_show_stripe_cache_size,
4578				raid5_store_stripe_cache_size);
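/*
 * Example usage (md0 is a hypothetical array name): the stripe cache can be
 * inspected and resized at run time through sysfs, e.g.
 *
 *	cat /sys/block/md0/md/stripe_cache_size
 *	echo 4096 > /sys/block/md0/md/stripe_cache_size
 *
 * raid5_set_cache_size() above rejects values outside 17..32768.
 */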
4579
4580static ssize_t
4581raid5_show_preread_threshold(struct mddev *mddev, char *page)
4582{
4583	struct r5conf *conf = mddev->private;
4584	if (conf)
4585		return sprintf(page, "%d\n", conf->bypass_threshold);
4586	else
4587		return 0;
4588}
4589
4590static ssize_t
4591raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
4592{
4593	struct r5conf *conf = mddev->private;
4594	unsigned long new;
4595	if (len >= PAGE_SIZE)
4596		return -EINVAL;
4597	if (!conf)
4598		return -ENODEV;
4599
4600	if (strict_strtoul(page, 10, &new))
4601		return -EINVAL;
4602	if (new > conf->max_nr_stripes)
4603		return -EINVAL;
4604	conf->bypass_threshold = new;
4605	return len;
4606}
4607
4608static struct md_sysfs_entry
4609raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4610					S_IRUGO | S_IWUSR,
4611					raid5_show_preread_threshold,
4612					raid5_store_preread_threshold);
4613
4614static ssize_t
4615stripe_cache_active_show(struct mddev *mddev, char *page)
4616{
4617	struct r5conf *conf = mddev->private;
4618	if (conf)
4619		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4620	else
4621		return 0;
4622}
4623
4624static struct md_sysfs_entry
4625raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4626
4627static struct attribute *raid5_attrs[] =  {
4628	&raid5_stripecache_size.attr,
4629	&raid5_stripecache_active.attr,
4630	&raid5_preread_bypass_threshold.attr,
4631	NULL,
4632};
4633static struct attribute_group raid5_attrs_group = {
4634	.name = NULL,
4635	.attrs = raid5_attrs,
4636};
4637
4638static sector_t
4639raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
4640{
4641	struct r5conf *conf = mddev->private;
4642
4643	if (!sectors)
4644		sectors = mddev->dev_sectors;
4645	if (!raid_disks)
4646		/* size is defined by the smallest of previous and new size */
4647		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4648
4649	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4650	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
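	/*
	 * Illustrative example: six devices in RAID-6 (max_degraded == 2)
	 * export four devices' worth of data, i.e. the per-device sector
	 * count, rounded down to a multiple of both chunk sizes, times 4.
	 */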
4651	return sectors * (raid_disks - conf->max_degraded);
4652}
4653
4654static void raid5_free_percpu(struct r5conf *conf)
4655{
4656	struct raid5_percpu *percpu;
4657	unsigned long cpu;
4658
4659	if (!conf->percpu)
4660		return;
4661
4662	get_online_cpus();
4663	for_each_possible_cpu(cpu) {
4664		percpu = per_cpu_ptr(conf->percpu, cpu);
4665		safe_put_page(percpu->spare_page);
4666		kfree(percpu->scribble);
4667	}
4668#ifdef CONFIG_HOTPLUG_CPU
4669	unregister_cpu_notifier(&conf->cpu_notify);
4670#endif
4671	put_online_cpus();
4672
4673	free_percpu(conf->percpu);
4674}
4675
4676static void free_conf(struct r5conf *conf)
4677{
4678	shrink_stripes(conf);
4679	raid5_free_percpu(conf);
4680	kfree(conf->disks);
4681	kfree(conf->stripe_hashtbl);
4682	kfree(conf);
4683}
4684
4685#ifdef CONFIG_HOTPLUG_CPU
4686static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4687			      void *hcpu)
4688{
4689	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
4690	long cpu = (long)hcpu;
4691	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4692
4693	switch (action) {
4694	case CPU_UP_PREPARE:
4695	case CPU_UP_PREPARE_FROZEN:
4696		if (conf->level == 6 && !percpu->spare_page)
4697			percpu->spare_page = alloc_page(GFP_KERNEL);
4698		if (!percpu->scribble)
4699			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4700
4701		if (!percpu->scribble ||
4702		    (conf->level == 6 && !percpu->spare_page)) {
4703			safe_put_page(percpu->spare_page);
4704			kfree(percpu->scribble);
4705			pr_err("%s: failed memory allocation for cpu%ld\n",
4706			       __func__, cpu);
4707			return notifier_from_errno(-ENOMEM);
4708		}
4709		break;
4710	case CPU_DEAD:
4711	case CPU_DEAD_FROZEN:
4712		safe_put_page(percpu->spare_page);
4713		kfree(percpu->scribble);
4714		percpu->spare_page = NULL;
4715		percpu->scribble = NULL;
4716		break;
4717	default:
4718		break;
4719	}
4720	return NOTIFY_OK;
4721}
4722#endif
4723
4724static int raid5_alloc_percpu(struct r5conf *conf)
4725{
4726	unsigned long cpu;
4727	struct page *spare_page;
4728	struct raid5_percpu __percpu *allcpus;
4729	void *scribble;
4730	int err;
4731
4732	allcpus = alloc_percpu(struct raid5_percpu);
4733	if (!allcpus)
4734		return -ENOMEM;
4735	conf->percpu = allcpus;
4736
4737	get_online_cpus();
4738	err = 0;
4739	for_each_present_cpu(cpu) {
4740		if (conf->level == 6) {
4741			spare_page = alloc_page(GFP_KERNEL);
4742			if (!spare_page) {
4743				err = -ENOMEM;
4744				break;
4745			}
4746			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4747		}
4748		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4749		if (!scribble) {
4750			err = -ENOMEM;
4751			break;
4752		}
4753		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4754	}
4755#ifdef CONFIG_HOTPLUG_CPU
4756	conf->cpu_notify.notifier_call = raid456_cpu_notify;
4757	conf->cpu_notify.priority = 0;
4758	if (err == 0)
4759		err = register_cpu_notifier(&conf->cpu_notify);
4760#endif
4761	put_online_cpus();
4762
4763	return err;
4764}
4765
4766static struct r5conf *setup_conf(struct mddev *mddev)
4767{
4768	struct r5conf *conf;
4769	int raid_disk, memory, max_disks;
4770	struct md_rdev *rdev;
4771	struct disk_info *disk;
4772
4773	if (mddev->new_level != 5
4774	    && mddev->new_level != 4
4775	    && mddev->new_level != 6) {
4776		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
4777		       mdname(mddev), mddev->new_level);
4778		return ERR_PTR(-EIO);
4779	}
4780	if ((mddev->new_level == 5
4781	     && !algorithm_valid_raid5(mddev->new_layout)) ||
4782	    (mddev->new_level == 6
4783	     && !algorithm_valid_raid6(mddev->new_layout))) {
4784		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
4785		       mdname(mddev), mddev->new_layout);
4786		return ERR_PTR(-EIO);
4787	}
4788	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4789		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
4790		       mdname(mddev), mddev->raid_disks);
4791		return ERR_PTR(-EINVAL);
4792	}
4793
4794	if (!mddev->new_chunk_sectors ||
4795	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4796	    !is_power_of_2(mddev->new_chunk_sectors)) {
4797		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
4798		       mdname(mddev), mddev->new_chunk_sectors << 9);
4799		return ERR_PTR(-EINVAL);
4800	}
4801
4802	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
4803	if (conf == NULL)
4804		goto abort;
4805	spin_lock_init(&conf->device_lock);
4806	init_waitqueue_head(&conf->wait_for_stripe);
4807	init_waitqueue_head(&conf->wait_for_overlap);
4808	INIT_LIST_HEAD(&conf->handle_list);
4809	INIT_LIST_HEAD(&conf->hold_list);
4810	INIT_LIST_HEAD(&conf->delayed_list);
4811	INIT_LIST_HEAD(&conf->bitmap_list);
4812	INIT_LIST_HEAD(&conf->inactive_list);
4813	atomic_set(&conf->active_stripes, 0);
4814	atomic_set(&conf->preread_active_stripes, 0);
4815	atomic_set(&conf->active_aligned_reads, 0);
4816	conf->bypass_threshold = BYPASS_THRESHOLD;
4817	conf->recovery_disabled = mddev->recovery_disabled - 1;
4818
4819	conf->raid_disks = mddev->raid_disks;
4820	if (mddev->reshape_position == MaxSector)
4821		conf->previous_raid_disks = mddev->raid_disks;
4822	else
4823		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4824	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
4825	conf->scribble_len = scribble_len(max_disks);
4826
4827	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
4828			      GFP_KERNEL);
4829	if (!conf->disks)
4830		goto abort;
4831
4832	conf->mddev = mddev;
4833
4834	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4835		goto abort;
4836
4837	conf->level = mddev->new_level;
4838	if (raid5_alloc_percpu(conf) != 0)
4839		goto abort;
4840
4841	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4842
4843	list_for_each_entry(rdev, &mddev->disks, same_set) {
4844		raid_disk = rdev->raid_disk;
4845		if (raid_disk >= max_disks
4846		    || raid_disk < 0)
4847			continue;
4848		disk = conf->disks + raid_disk;
4849
4850		disk->rdev = rdev;
4851
4852		if (test_bit(In_sync, &rdev->flags)) {
4853			char b[BDEVNAME_SIZE];
4854			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
4855			       " disk %d\n",
4856			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
4857		} else if (rdev->saved_raid_disk != raid_disk)
4858			/* Cannot rely on bitmap to complete recovery */
4859			conf->fullsync = 1;
4860	}
4861
4862	conf->chunk_sectors = mddev->new_chunk_sectors;
4863	conf->level = mddev->new_level;
4864	if (conf->level == 6)
4865		conf->max_degraded = 2;
4866	else
4867		conf->max_degraded = 1;
4868	conf->algorithm = mddev->new_layout;
4869	conf->max_nr_stripes = NR_STRIPES;
4870	conf->reshape_progress = mddev->reshape_position;
4871	if (conf->reshape_progress != MaxSector) {
4872		conf->prev_chunk_sectors = mddev->chunk_sectors;
4873		conf->prev_algo = mddev->layout;
4874	}
4875
4876	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4877		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4878	if (grow_stripes(conf, conf->max_nr_stripes)) {
4879		printk(KERN_ERR
4880		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
4881		       mdname(mddev), memory);
4882		goto abort;
4883	} else
4884		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
4885		       mdname(mddev), memory);
4886
4887	conf->thread = md_register_thread(raid5d, mddev, NULL);
4888	if (!conf->thread) {
4889		printk(KERN_ERR
4890		       "md/raid:%s: couldn't allocate thread.\n",
4891		       mdname(mddev));
4892		goto abort;
4893	}
4894
4895	return conf;
4896
4897 abort:
4898	if (conf) {
4899		free_conf(conf);
4900		return ERR_PTR(-EIO);
4901	} else
4902		return ERR_PTR(-ENOMEM);
4903}
4904
4905
4906static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
4907{
4908	switch (algo) {
4909	case ALGORITHM_PARITY_0:
4910		if (raid_disk < max_degraded)
4911			return 1;
4912		break;
4913	case ALGORITHM_PARITY_N:
4914		if (raid_disk >= raid_disks - max_degraded)
4915			return 1;
4916		break;
4917	case ALGORITHM_PARITY_0_6:
4918		if (raid_disk == 0 ||
4919		    raid_disk == raid_disks - 1)
4920			return 1;
4921		break;
4922	case ALGORITHM_LEFT_ASYMMETRIC_6:
4923	case ALGORITHM_RIGHT_ASYMMETRIC_6:
4924	case ALGORITHM_LEFT_SYMMETRIC_6:
4925	case ALGORITHM_RIGHT_SYMMETRIC_6:
4926		if (raid_disk == raid_disks - 1)
4927			return 1;
4928	}
4929	return 0;
4930}
4931
4932static int run(struct mddev *mddev)
4933{
4934	struct r5conf *conf;
4935	int working_disks = 0;
4936	int dirty_parity_disks = 0;
4937	struct md_rdev *rdev;
4938	sector_t reshape_offset = 0;
4939
4940	if (mddev->recovery_cp != MaxSector)
4941		printk(KERN_NOTICE "md/raid:%s: not clean"
4942		       " -- starting background reconstruction\n",
4943		       mdname(mddev));
4944	if (mddev->reshape_position != MaxSector) {
4945		/* Check that we can continue the reshape.
4946		 * Currently only disks can change, it must
4947		 * increase, and we must be past the point where
4948		 * a stripe over-writes itself
4949		 */
4950		sector_t here_new, here_old;
4951		int old_disks;
4952		int max_degraded = (mddev->level == 6 ? 2 : 1);
4953
4954		if (mddev->new_level != mddev->level) {
4955			printk(KERN_ERR "md/raid:%s: unsupported reshape "
4956			       "required - aborting.\n",
4957			       mdname(mddev));
4958			return -EINVAL;
4959		}
4960		old_disks = mddev->raid_disks - mddev->delta_disks;
4961		/* reshape_position must be on a new-stripe boundary, and one
4962		 * further up in new geometry must map after here in old
4963		 * geometry.
4964		 */
4965		here_new = mddev->reshape_position;
4966		if (sector_div(here_new, mddev->new_chunk_sectors *
4967			       (mddev->raid_disks - max_degraded))) {
4968			printk(KERN_ERR "md/raid:%s: reshape_position not "
4969			       "on a stripe boundary\n", mdname(mddev));
4970			return -EINVAL;
4971		}
4972		reshape_offset = here_new * mddev->new_chunk_sectors;
4973		/* here_new is the stripe we will write to */
4974		here_old = mddev->reshape_position;
4975		sector_div(here_old, mddev->chunk_sectors *
4976			   (old_disks-max_degraded));
4977		/* here_old is the first stripe that we might need to read
4978		 * from */
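		/*
		 * Illustrative numbers (assumed, not required): growing a
		 * 4-disk RAID5 to 5 disks with 512KiB chunks (1024 sectors),
		 * reshape_position = 12288 gives here_new = 12288/(1024*4) = 3
		 * and here_old = 12288/(1024*3) = 4; 3*1024 < 4*1024, so the
		 * stripe being written has not yet caught up with the one
		 * being read and the reshape may safely continue.
		 */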
4979		if (mddev->delta_disks == 0) {
4980			/* We cannot be sure it is safe to start an in-place
4981			 * reshape.  It is only safe if user-space is monitoring
4982			 * and taking constant backups.
4983			 * mdadm always starts a situation like this in
4984			 * readonly mode so it can take control before
4985			 * allowing any writes.  So just check for that.
4986			 */
4987			if ((here_new * mddev->new_chunk_sectors !=
4988			     here_old * mddev->chunk_sectors) ||
4989			    mddev->ro == 0) {
4990				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
4991				       " in read-only mode - aborting\n",
4992				       mdname(mddev));
4993				return -EINVAL;
4994			}
4995		} else if (mddev->delta_disks < 0
4996		    ? (here_new * mddev->new_chunk_sectors <=
4997		       here_old * mddev->chunk_sectors)
4998		    : (here_new * mddev->new_chunk_sectors >=
4999		       here_old * mddev->chunk_sectors)) {
5000			/* Reading from the same stripe as writing to - bad */
5001			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5002			       "auto-recovery - aborting.\n",
5003			       mdname(mddev));
5004			return -EINVAL;
5005		}
5006		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
5007		       mdname(mddev));
5008		/* OK, we should be able to continue; */
5009	} else {
5010		BUG_ON(mddev->level != mddev->new_level);
5011		BUG_ON(mddev->layout != mddev->new_layout);
5012		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
5013		BUG_ON(mddev->delta_disks != 0);
5014	}
5015
5016	if (mddev->private == NULL)
5017		conf = setup_conf(mddev);
5018	else
5019		conf = mddev->private;
5020
5021	if (IS_ERR(conf))
5022		return PTR_ERR(conf);
5023
5024	mddev->thread = conf->thread;
5025	conf->thread = NULL;
5026	mddev->private = conf;
5027
5028	/*
5029	 * 0 for a fully functional array, 1 or 2 for a degraded array.
5030	 */
5031	list_for_each_entry(rdev, &mddev->disks, same_set) {
5032		if (rdev->raid_disk < 0)
5033			continue;
5034		if (test_bit(In_sync, &rdev->flags)) {
5035			working_disks++;
5036			continue;
5037		}
5038		/* This disk is not fully in-sync.  However if it
5039		 * just stored parity (beyond the recovery_offset),
5040		 * then we don't need to be concerned about the
5041		 * array being dirty.
5042		 * When reshape goes 'backwards', we never have
5043		 * partially completed devices, so we only need
5044		 * to worry about reshape going forwards.
5045		 */
5046		/* Hack because v0.91 doesn't store recovery_offset properly. */
5047		if (mddev->major_version == 0 &&
5048		    mddev->minor_version > 90)
5049			rdev->recovery_offset = reshape_offset;
5050
5051		if (rdev->recovery_offset < reshape_offset) {
5052			/* We need to check old and new layout */
5053			if (!only_parity(rdev->raid_disk,
5054					 conf->algorithm,
5055					 conf->raid_disks,
5056					 conf->max_degraded))
5057				continue;
5058		}
5059		if (!only_parity(rdev->raid_disk,
5060				 conf->prev_algo,
5061				 conf->previous_raid_disks,
5062				 conf->max_degraded))
5063			continue;
5064		dirty_parity_disks++;
5065	}
5066
5067	mddev->degraded = calc_degraded(conf);
5068
5069	if (has_failed(conf)) {
5070		printk(KERN_ERR "md/raid:%s: not enough operational devices"
5071			" (%d/%d failed)\n",
5072			mdname(mddev), mddev->degraded, conf->raid_disks);
5073		goto abort;
5074	}
5075
5076	/* device size must be a multiple of chunk size */
5077	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
5078	mddev->resync_max_sectors = mddev->dev_sectors;
5079
5080	if (mddev->degraded > dirty_parity_disks &&
5081	    mddev->recovery_cp != MaxSector) {
5082		if (mddev->ok_start_degraded)
5083			printk(KERN_WARNING
5084			       "md/raid:%s: starting dirty degraded array"
5085			       " - data corruption possible.\n",
5086			       mdname(mddev));
5087		else {
5088			printk(KERN_ERR
5089			       "md/raid:%s: cannot start dirty degraded array.\n",
5090			       mdname(mddev));
5091			goto abort;
5092		}
5093	}
5094
5095	if (mddev->degraded == 0)
5096		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
5097		       " devices, algorithm %d\n", mdname(mddev), conf->level,
5098		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
5099		       mddev->new_layout);
5100	else
5101		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
5102		       " out of %d devices, algorithm %d\n",
5103		       mdname(mddev), conf->level,
5104		       mddev->raid_disks - mddev->degraded,
5105		       mddev->raid_disks, mddev->new_layout);
5106
5107	print_raid5_conf(conf);
5108
5109	if (conf->reshape_progress != MaxSector) {
5110		conf->reshape_safe = conf->reshape_progress;
5111		atomic_set(&conf->reshape_stripes, 0);
5112		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5113		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5114		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5115		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5116		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5117							"reshape");
5118	}
5119
5120
5121	/* Ok, everything is just fine now */
5122	if (mddev->to_remove == &raid5_attrs_group)
5123		mddev->to_remove = NULL;
5124	else if (mddev->kobj.sd &&
5125	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
5126		printk(KERN_WARNING
5127		       "raid5: failed to create sysfs attributes for %s\n",
5128		       mdname(mddev));
5129	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5130
5131	if (mddev->queue) {
5132		int chunk_size;
5133		/* read-ahead size must cover two whole stripes, which
5134		 * is 2 * (datadisks) * chunksize, where datadisks is the
5135		 * number of raid devices minus the number of parity devices
5136		 */
5137		int data_disks = conf->previous_raid_disks - conf->max_degraded;
5138		int stripe = data_disks *
5139			((mddev->chunk_sectors << 9) / PAGE_SIZE);
5140		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5141			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
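		/*
		 * Illustrative example (assuming 4KiB pages): a 4-disk RAID5
		 * with 512KiB chunks has data_disks = 3 and stripe = 3 * 128 =
		 * 384 pages, so read-ahead is raised to at least 768 pages
		 * (3MiB), enough to cover two full stripes.
		 */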
5142
5143		blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
5144
5145		mddev->queue->backing_dev_info.congested_data = mddev;
5146		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
5147
5148		chunk_size = mddev->chunk_sectors << 9;
5149		blk_queue_io_min(mddev->queue, chunk_size);
5150		blk_queue_io_opt(mddev->queue, chunk_size *
5151				 (conf->raid_disks - conf->max_degraded));
5152
5153		list_for_each_entry(rdev, &mddev->disks, same_set)
5154			disk_stack_limits(mddev->gendisk, rdev->bdev,
5155					  rdev->data_offset << 9);
5156	}
5157
5158	return 0;
5159abort:
5160	md_unregister_thread(&mddev->thread);
5161	print_raid5_conf(conf);
5162	free_conf(conf);
5163	mddev->private = NULL;
5164	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
5165	return -EIO;
5166}
5167
5168static int stop(struct mddev *mddev)
5169{
5170	struct r5conf *conf = mddev->private;
5171
5172	md_unregister_thread(&mddev->thread);
5173	if (mddev->queue)
5174		mddev->queue->backing_dev_info.congested_fn = NULL;
5175	free_conf(conf);
5176	mddev->private = NULL;
5177	mddev->to_remove = &raid5_attrs_group;
5178	return 0;
5179}
5180
5181static void status(struct seq_file *seq, struct mddev *mddev)
5182{
5183	struct r5conf *conf = mddev->private;
5184	int i;
5185
5186	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
5187		mddev->chunk_sectors / 2, mddev->layout);
5188	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
5189	for (i = 0; i < conf->raid_disks; i++)
5190		seq_printf (seq, "%s",
5191			       conf->disks[i].rdev &&
5192			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5193	seq_printf (seq, "]");
5194}
5195
5196static void print_raid5_conf (struct r5conf *conf)
5197{
5198	int i;
5199	struct disk_info *tmp;
5200
5201	printk(KERN_DEBUG "RAID conf printout:\n");
5202	if (!conf) {
5203		printk("(conf==NULL)\n");
5204		return;
5205	}
5206	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
5207	       conf->raid_disks,
5208	       conf->raid_disks - conf->mddev->degraded);
5209
5210	for (i = 0; i < conf->raid_disks; i++) {
5211		char b[BDEVNAME_SIZE];
5212		tmp = conf->disks + i;
5213		if (tmp->rdev)
5214			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
5215			       i, !test_bit(Faulty, &tmp->rdev->flags),
5216			       bdevname(tmp->rdev->bdev, b));
5217	}
5218}
5219
5220static int raid5_spare_active(struct mddev *mddev)
5221{
5222	int i;
5223	struct r5conf *conf = mddev->private;
5224	struct disk_info *tmp;
5225	int count = 0;
5226	unsigned long flags;
5227
5228	for (i = 0; i < conf->raid_disks; i++) {
5229		tmp = conf->disks + i;
5230		if (tmp->replacement
5231		    && tmp->replacement->recovery_offset == MaxSector
5232		    && !test_bit(Faulty, &tmp->replacement->flags)
5233		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5234			/* Replacement has just become active. */
5235			if (!tmp->rdev
5236			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5237				count++;
5238			if (tmp->rdev) {
5239				/* Replaced device not technically faulty,
5240				 * but we need to be sure it gets removed
5241				 * and never re-added.
5242				 */
5243				set_bit(Faulty, &tmp->rdev->flags);
5244				sysfs_notify_dirent_safe(
5245					tmp->rdev->sysfs_state);
5246			}
5247			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5248		} else if (tmp->rdev
5249		    && tmp->rdev->recovery_offset == MaxSector
5250		    && !test_bit(Faulty, &tmp->rdev->flags)
5251		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5252			count++;
5253			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
5254		}
5255	}
5256	spin_lock_irqsave(&conf->device_lock, flags);
5257	mddev->degraded = calc_degraded(conf);
5258	spin_unlock_irqrestore(&conf->device_lock, flags);
5259	print_raid5_conf(conf);
5260	return count;
5261}
5262
5263static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5264{
5265	struct r5conf *conf = mddev->private;
5266	int err = 0;
5267	int number = rdev->raid_disk;
5268	struct md_rdev **rdevp;
5269	struct disk_info *p = conf->disks + number;
5270
5271	print_raid5_conf(conf);
5272	if (rdev == p->rdev)
5273		rdevp = &p->rdev;
5274	else if (rdev == p->replacement)
5275		rdevp = &p->replacement;
5276	else
5277		return 0;
5278
5279	if (number >= conf->raid_disks &&
5280	    conf->reshape_progress == MaxSector)
5281		clear_bit(In_sync, &rdev->flags);
5282
5283	if (test_bit(In_sync, &rdev->flags) ||
5284	    atomic_read(&rdev->nr_pending)) {
5285		err = -EBUSY;
5286		goto abort;
5287	}
5288	/* Only remove non-faulty devices if recovery
5289	 * isn't possible.
5290	 */
5291	if (!test_bit(Faulty, &rdev->flags) &&
5292	    mddev->recovery_disabled != conf->recovery_disabled &&
5293	    !has_failed(conf) &&
5294	    (!p->replacement || p->replacement == rdev) &&
5295	    number < conf->raid_disks) {
5296		err = -EBUSY;
5297		goto abort;
5298	}
5299	*rdevp = NULL;
5300	synchronize_rcu();
5301	if (atomic_read(&rdev->nr_pending)) {
5302		/* lost the race, try later */
5303		err = -EBUSY;
5304		*rdevp = rdev;
5305	} else if (p->replacement) {
5306		/* We must have just cleared 'rdev' */
5307		p->rdev = p->replacement;
5308		clear_bit(Replacement, &p->replacement->flags);
5309		smp_mb(); /* Ensure readers see p->rdev set before p->replacement
5310			   * is cleared: they may see both as identical but will
5311			   * never see neither set. */
5312		p->replacement = NULL;
5313		clear_bit(WantReplacement, &rdev->flags);
5314	} else
5315		/* We might have just removed the Replacement as faulty;
5316		 * clear the bit just in case.
5317		 */
5318		clear_bit(WantReplacement, &rdev->flags);
5319abort:
5320
5321	print_raid5_conf(conf);
5322	return err;
5323}
5324
5325static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5326{
5327	struct r5conf *conf = mddev->private;
5328	int err = -EEXIST;
5329	int disk;
5330	struct disk_info *p;
5331	int first = 0;
5332	int last = conf->raid_disks - 1;
5333
5334	if (mddev->recovery_disabled == conf->recovery_disabled)
5335		return -EBUSY;
5336
5337	if (has_failed(conf))
5338		/* no point adding a device */
5339		return -EINVAL;
5340
5341	if (rdev->raid_disk >= 0)
5342		first = last = rdev->raid_disk;
5343
5344	/*
5345	 * find the disk ... but prefer rdev->saved_raid_disk
5346	 * if possible.
5347	 */
5348	if (rdev->saved_raid_disk >= 0 &&
5349	    rdev->saved_raid_disk >= first &&
5350	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
5351		disk = rdev->saved_raid_disk;
5352	else
5353		disk = first;
5354	for ( ; disk <= last ; disk++)
5355		if ((p = conf->disks + disk)->rdev == NULL) {
5356			clear_bit(In_sync, &rdev->flags);
5357			rdev->raid_disk = disk;
5358			err = 0;
5359			if (rdev->saved_raid_disk != disk)
5360				conf->fullsync = 1;
5361			rcu_assign_pointer(p->rdev, rdev);
5362			break;
5363		}
5364	print_raid5_conf(conf);
5365	return err;
5366}
5367
5368static int raid5_resize(struct mddev *mddev, sector_t sectors)
5369{
5370	/* no resync is happening, and there is enough space
5371	 * on all devices, so we can resize.
5372	 * We need to make sure resync covers any new space.
5373	 * If the array is shrinking we should possibly wait until
5374	 * any io in the removed space completes, but it hardly seems
5375	 * worth it.
5376	 */
5377	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
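	/*
	 * chunk_sectors is a power of two, so the mask above rounds the
	 * requested size down to a whole number of chunks; e.g. with
	 * 128-sector (64K) chunks a request of 1000 sectors becomes 896.
	 */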
5378	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5379					       mddev->raid_disks));
5380	if (mddev->array_sectors >
5381	    raid5_size(mddev, sectors, mddev->raid_disks))
5382		return -EINVAL;
5383	set_capacity(mddev->gendisk, mddev->array_sectors);
5384	revalidate_disk(mddev->gendisk);
5385	if (sectors > mddev->dev_sectors &&
5386	    mddev->recovery_cp > mddev->dev_sectors) {
5387		mddev->recovery_cp = mddev->dev_sectors;
5388		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5389	}
5390	mddev->dev_sectors = sectors;
5391	mddev->resync_max_sectors = sectors;
5392	return 0;
5393}
5394
5395static int check_stripe_cache(struct mddev *mddev)
5396{
5397	/* Can only proceed if there are plenty of stripe_heads.
5398	 * We need a minimum of one full stripe, and for sensible progress
5399	 * it is best to have about 4 times that.
5400	 * If we require 4 times, then the default 256 4K stripe_heads will
5401	 * allow for chunk sizes up to 256K, which is probably OK.
5402	 * If the chunk size is greater, user-space should request more
5403	 * stripe_heads first.
5404	 */
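	/*
	 * Worked example: with 4K stripe_heads (STRIPE_SIZE == PAGE_SIZE on
	 * most configurations), a 512K chunk needs (512K / 4K) * 4 = 512
	 * stripe_heads, so the default cache of 256 is too small and the
	 * reshape is refused below.
	 */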
5405	struct r5conf *conf = mddev->private;
5406	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5407	    > conf->max_nr_stripes ||
5408	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5409	    > conf->max_nr_stripes) {
5410		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
5411		       mdname(mddev),
5412		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5413			/ STRIPE_SIZE)*4);
5414		return 0;
5415	}
5416	return 1;
5417}
5418
5419static int check_reshape(struct mddev *mddev)
5420{
5421	struct r5conf *conf = mddev->private;
5422
5423	if (mddev->delta_disks == 0 &&
5424	    mddev->new_layout == mddev->layout &&
5425	    mddev->new_chunk_sectors == mddev->chunk_sectors)
5426		return 0; /* nothing to do */
5427	if (mddev->bitmap)
5428		/* Cannot grow a bitmap yet */
5429		return -EBUSY;
5430	if (has_failed(conf))
5431		return -EINVAL;
5432	if (mddev->delta_disks < 0) {
5433		/* We might be able to shrink, but the devices must
5434		 * be made bigger first.
5435		 * For raid6, 4 is the minimum size.
5436		 * Otherwise 2 is the minimum
5437		 */
5438		int min = 2;
5439		if (mddev->level == 6)
5440			min = 4;
5441		if (mddev->raid_disks + mddev->delta_disks < min)
5442			return -EINVAL;
5443	}
5444
5445	if (!check_stripe_cache(mddev))
5446		return -ENOSPC;
5447
5448	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5449}
5450
5451static int raid5_start_reshape(struct mddev *mddev)
5452{
5453	struct r5conf *conf = mddev->private;
5454	struct md_rdev *rdev;
5455	int spares = 0;
5456	unsigned long flags;
5457
5458	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5459		return -EBUSY;
5460
5461	if (!check_stripe_cache(mddev))
5462		return -ENOSPC;
5463
5464	list_for_each_entry(rdev, &mddev->disks, same_set)
5465		if (!test_bit(In_sync, &rdev->flags)
5466		    && !test_bit(Faulty, &rdev->flags))
5467			spares++;
5468
5469	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5470		/* Not enough devices even to make a degraded array
5471		 * of that size
5472		 */
5473		return -EINVAL;
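	/*
	 * Illustration of the check above: growing a clean 4-drive raid5
	 * (max_degraded 1) by three disks with only one spare gives
	 * 1 - 0 < 3 - 1, so the reshape is rejected; with two spares it
	 * may proceed, merely starting out degraded.
	 */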
5474
5475	/* Refuse to reduce size of the array.  Any reductions in
5476	 * array size must be through explicit setting of array_size
5477	 * attribute.
5478	 */
5479	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5480	    < mddev->array_sectors) {
5481		printk(KERN_ERR "md/raid:%s: array size must be reduced "
5482		       "before number of disks\n", mdname(mddev));
5483		return -EINVAL;
5484	}
5485
5486	atomic_set(&conf->reshape_stripes, 0);
5487	spin_lock_irq(&conf->device_lock);
5488	conf->previous_raid_disks = conf->raid_disks;
5489	conf->raid_disks += mddev->delta_disks;
5490	conf->prev_chunk_sectors = conf->chunk_sectors;
5491	conf->chunk_sectors = mddev->new_chunk_sectors;
5492	conf->prev_algo = conf->algorithm;
5493	conf->algorithm = mddev->new_layout;
5494	if (mddev->delta_disks < 0)
5495		conf->reshape_progress = raid5_size(mddev, 0, 0);
5496	else
5497		conf->reshape_progress = 0;
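	/*
	 * When shrinking the device count the reshape walks backwards from
	 * the end of the array, so progress starts at the current size;
	 * when growing it walks forwards from sector 0.
	 */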
5498	conf->reshape_safe = conf->reshape_progress;
5499	conf->generation++;
5500	spin_unlock_irq(&conf->device_lock);
5501
5502	/* Add some new drives, as many as will fit.
5503	 * We know there are enough to make the newly sized array work.
5504	 * Don't add devices if we are reducing the number of
5505	 * devices in the array.  This is because it is not possible
5506	 * to correctly record the "partially reconstructed" state of
5507	 * such devices during the reshape and confusion could result.
5508	 */
5509	if (mddev->delta_disks >= 0) {
5510		int added_devices = 0;
5511		list_for_each_entry(rdev, &mddev->disks, same_set)
5512			if (rdev->raid_disk < 0 &&
5513			    !test_bit(Faulty, &rdev->flags)) {
5514				if (raid5_add_disk(mddev, rdev) == 0) {
5515					if (rdev->raid_disk
5516					    >= conf->previous_raid_disks) {
5517						set_bit(In_sync, &rdev->flags);
5518						added_devices++;
5519					} else
5520						rdev->recovery_offset = 0;
5521
5522					if (sysfs_link_rdev(mddev, rdev))
5523						/* Failure here is OK */;
5524				}
5525			} else if (rdev->raid_disk >= conf->previous_raid_disks
5526				   && !test_bit(Faulty, &rdev->flags)) {
5527				/* This is a spare that was manually added */
5528				set_bit(In_sync, &rdev->flags);
5529				added_devices++;
5530			}
5531
5532		/* When a reshape changes the number of devices,
5533		 * ->degraded is measured against the larger of the
5534		 * pre and post number of devices.
5535		 */
5536		spin_lock_irqsave(&conf->device_lock, flags);
5537		mddev->degraded = calc_degraded(conf);
5538		spin_unlock_irqrestore(&conf->device_lock, flags);
5539	}
5540	mddev->raid_disks = conf->raid_disks;
5541	mddev->reshape_position = conf->reshape_progress;
5542	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5543
5544	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5545	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5546	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5547	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5548	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5549						"reshape");
5550	if (!mddev->sync_thread) {
5551		mddev->recovery = 0;
5552		spin_lock_irq(&conf->device_lock);
5553		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5554		conf->reshape_progress = MaxSector;
5555		spin_unlock_irq(&conf->device_lock);
5556		return -EAGAIN;
5557	}
5558	conf->reshape_checkpoint = jiffies;
5559	md_wakeup_thread(mddev->sync_thread);
5560	md_new_event(mddev);
5561	return 0;
5562}
5563
5564/* This is called from the reshape thread and should make any
5565 * changes needed in 'conf'
5566 */
5567static void end_reshape(struct r5conf *conf)
5568{
5569
5570	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
5571
5572		spin_lock_irq(&conf->device_lock);
5573		conf->previous_raid_disks = conf->raid_disks;
5574		conf->reshape_progress = MaxSector;
5575		spin_unlock_irq(&conf->device_lock);
5576		wake_up(&conf->wait_for_overlap);
5577
5578		/* read-ahead size must cover two whole stripes, which is
5579		 * 2 * (data disks) * chunk size, where 'data disks' is the raid
5580		 * device count minus the number of parity devices. */
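		/*
		 * e.g. a 6-drive raid6 (two parity devices) with 512K chunks
		 * and 4K pages: stripe = 4 * (512K / 4K) = 512 pages, so
		 * ra_pages is raised to at least 1024 pages (4MB).
		 */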
5581		if (conf->mddev->queue) {
5582			int data_disks = conf->raid_disks - conf->max_degraded;
5583			int stripe = data_disks * ((conf->chunk_sectors << 9)
5584						   / PAGE_SIZE);
5585			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5586				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
5587		}
5588	}
5589}
5590
5591/* This is called from the raid5d thread with mddev_lock held.
5592 * It makes config changes to the device.
5593 */
5594static void raid5_finish_reshape(struct mddev *mddev)
5595{
5596	struct r5conf *conf = mddev->private;
5597
5598	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5599
5600		if (mddev->delta_disks > 0) {
5601			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5602			set_capacity(mddev->gendisk, mddev->array_sectors);
5603			revalidate_disk(mddev->gendisk);
5604		} else {
5605			int d;
5606			spin_lock_irq(&conf->device_lock);
5607			mddev->degraded = calc_degraded(conf);
5608			spin_unlock_irq(&conf->device_lock);
5609			for (d = conf->raid_disks ;
5610			     d < conf->raid_disks - mddev->delta_disks;
5611			     d++) {
5612				struct md_rdev *rdev = conf->disks[d].rdev;
5613				if (rdev &&
5614				    raid5_remove_disk(mddev, rdev) == 0) {
5615					sysfs_unlink_rdev(mddev, rdev);
5616					rdev->raid_disk = -1;
5617				}
5618			}
5619		}
5620		mddev->layout = conf->algorithm;
5621		mddev->chunk_sectors = conf->chunk_sectors;
5622		mddev->reshape_position = MaxSector;
5623		mddev->delta_disks = 0;
5624	}
5625}
5626
5627static void raid5_quiesce(struct mddev *mddev, int state)
5628{
5629	struct r5conf *conf = mddev->private;
5630
5631	switch(state) {
5632	case 2: /* resume for a suspend */
5633		wake_up(&conf->wait_for_overlap);
5634		break;
5635
5636	case 1: /* stop all writes */
5637		spin_lock_irq(&conf->device_lock);
5638		/* '2' tells resync/reshape to pause so that all
5639		 * active stripes can drain
5640		 */
5641		conf->quiesce = 2;
5642		wait_event_lock_irq(conf->wait_for_stripe,
5643				    atomic_read(&conf->active_stripes) == 0 &&
5644				    atomic_read(&conf->active_aligned_reads) == 0,
5645				    conf->device_lock, /* nothing */);
5646		conf->quiesce = 1;
5647		spin_unlock_irq(&conf->device_lock);
5648		/* allow reshape to continue */
5649		wake_up(&conf->wait_for_overlap);
5650		break;
5651
5652	case 0: /* re-enable writes */
5653		spin_lock_irq(&conf->device_lock);
5654		conf->quiesce = 0;
5655		wake_up(&conf->wait_for_stripe);
5656		wake_up(&conf->wait_for_overlap);
5657		spin_unlock_irq(&conf->device_lock);
5658		break;
5659	}
5660}
5661
5662
5663static void *raid45_takeover_raid0(struct mddev *mddev, int level)
5664{
5665	struct r0conf *raid0_conf = mddev->private;
5666	sector_t sectors;
5667
5668	/* for raid0 takeover only one zone is supported */
5669	if (raid0_conf->nr_strip_zones > 1) {
5670		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
5671		       mdname(mddev));
5672		return ERR_PTR(-EINVAL);
5673	}
5674
5675	sectors = raid0_conf->strip_zone[0].zone_end;
5676	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
5677	mddev->dev_sectors = sectors;
5678	mddev->new_level = level;
5679	mddev->new_layout = ALGORITHM_PARITY_N;
5680	mddev->new_chunk_sectors = mddev->chunk_sectors;
5681	mddev->raid_disks += 1;
5682	mddev->delta_disks = 1;
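	/*
	 * The slot added above is the new parity device; it starts out
	 * absent, so the resulting raid4/5 array comes up degraded until a
	 * spare is added and rebuilt onto it.
	 */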
5683	/* make sure it will not be marked as dirty */
5684	mddev->recovery_cp = MaxSector;
5685
5686	return setup_conf(mddev);
5687}
5688
5689
5690static void *raid5_takeover_raid1(struct mddev *mddev)
5691{
5692	int chunksect;
5693
5694	if (mddev->raid_disks != 2 ||
5695	    mddev->degraded > 1)
5696		return ERR_PTR(-EINVAL);
5697
5698	/* Should check if there are write-behind devices? */
5699
5700	chunksect = 64*2; /* 64K by default */
5701
5702	/* The array must be an exact multiple of chunksize */
5703	while (chunksect && (mddev->array_sectors & (chunksect-1)))
5704		chunksect >>= 1;
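	/*
	 * e.g. if array_sectors is a multiple of 32 sectors but not of 64,
	 * the loop settles on a 16K chunk, which still passes the
	 * STRIPE_SIZE check below on 4K-page systems.
	 */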
5705
5706	if ((chunksect<<9) < STRIPE_SIZE)
5707		/* array size does not allow a suitable chunk size */
5708		return ERR_PTR(-EINVAL);
5709
5710	mddev->new_level = 5;
5711	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5712	mddev->new_chunk_sectors = chunksect;
5713
5714	return setup_conf(mddev);
5715}
5716
5717static void *raid5_takeover_raid6(struct mddev *mddev)
5718{
5719	int new_layout;
5720
5721	switch (mddev->layout) {
5722	case ALGORITHM_LEFT_ASYMMETRIC_6:
5723		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5724		break;
5725	case ALGORITHM_RIGHT_ASYMMETRIC_6:
5726		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5727		break;
5728	case ALGORITHM_LEFT_SYMMETRIC_6:
5729		new_layout = ALGORITHM_LEFT_SYMMETRIC;
5730		break;
5731	case ALGORITHM_RIGHT_SYMMETRIC_6:
5732		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5733		break;
5734	case ALGORITHM_PARITY_0_6:
5735		new_layout = ALGORITHM_PARITY_0;
5736		break;
5737	case ALGORITHM_PARITY_N:
5738		new_layout = ALGORITHM_PARITY_N;
5739		break;
5740	default:
5741		return ERR_PTR(-EINVAL);
5742	}
5743	mddev->new_level = 5;
5744	mddev->new_layout = new_layout;
5745	mddev->delta_disks = -1;
5746	mddev->raid_disks -= 1;
5747	return setup_conf(mddev);
5748}
5749
5750
5751static int raid5_check_reshape(struct mddev *mddev)
5752{
5753	/* For a 2-drive array, the layout and chunk size can be changed
5754	 * immediately, as no restriping is needed.
5755	 * For larger arrays we record the new value - after validation
5756	 * to be used by a reshape pass.
5757	 */
5758	struct r5conf *conf = mddev->private;
5759	int new_chunk = mddev->new_chunk_sectors;
5760
5761	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
5762		return -EINVAL;
5763	if (new_chunk > 0) {
5764		if (!is_power_of_2(new_chunk))
5765			return -EINVAL;
5766		if (new_chunk < (PAGE_SIZE>>9))
5767			return -EINVAL;
5768		if (mddev->array_sectors & (new_chunk-1))
5769			/* not factor of array size */
5770			return -EINVAL;
5771	}
5772
5773	/* They look valid */
5774
5775	if (mddev->raid_disks == 2) {
5776		/* can make the change immediately */
5777		if (mddev->new_layout >= 0) {
5778			conf->algorithm = mddev->new_layout;
5779			mddev->layout = mddev->new_layout;
5780		}
5781		if (new_chunk > 0) {
5782			conf->chunk_sectors = new_chunk;
5783			mddev->chunk_sectors = new_chunk;
5784		}
5785		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5786		md_wakeup_thread(mddev->thread);
5787	}
5788	return check_reshape(mddev);
5789}
5790
5791static int raid6_check_reshape(struct mddev *mddev)
5792{
5793	int new_chunk = mddev->new_chunk_sectors;
5794
5795	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
5796		return -EINVAL;
5797	if (new_chunk > 0) {
5798		if (!is_power_of_2(new_chunk))
5799			return -EINVAL;
5800		if (new_chunk < (PAGE_SIZE >> 9))
5801			return -EINVAL;
5802		if (mddev->array_sectors & (new_chunk-1))
5803			/* not factor of array size */
5804			return -EINVAL;
5805	}
5806
5807	/* They look valid */
5808	return check_reshape(mddev);
5809}
5810
5811static void *raid5_takeover(struct mddev *mddev)
5812{
5813	/* raid5 can take over:
5814	 *  raid0 - if there is only one strip zone - make it a raid4 layout
5815	 *  raid1 - if there are two drives.  We need to know the chunk size
5816	 *  raid4 - trivial - just use a raid4 layout.
5817	 *  raid6 - Providing it is a *_6 layout
5818	 */
5819	if (mddev->level == 0)
5820		return raid45_takeover_raid0(mddev, 5);
5821	if (mddev->level == 1)
5822		return raid5_takeover_raid1(mddev);
5823	if (mddev->level == 4) {
5824		mddev->new_layout = ALGORITHM_PARITY_N;
5825		mddev->new_level = 5;
5826		return setup_conf(mddev);
5827	}
5828	if (mddev->level == 6)
5829		return raid5_takeover_raid6(mddev);
5830
5831	return ERR_PTR(-EINVAL);
5832}
5833
5834static void *raid4_takeover(struct mddev *mddev)
5835{
5836	/* raid4 can take over:
5837	 *  raid0 - if there is only one strip zone
5838	 *  raid5 - if layout is right
5839	 */
5840	if (mddev->level == 0)
5841		return raid45_takeover_raid0(mddev, 4);
5842	if (mddev->level == 5 &&
5843	    mddev->layout == ALGORITHM_PARITY_N) {
5844		mddev->new_layout = 0;
5845		mddev->new_level = 4;
5846		return setup_conf(mddev);
5847	}
5848	return ERR_PTR(-EINVAL);
5849}
5850
5851static struct md_personality raid5_personality;
5852
5853static void *raid6_takeover(struct mddev *mddev)
5854{
5855	/* Currently can only take over a raid5.  We map the
5856	 * layout to an equivalent raid6 layout with the
5857	 * Q block at the end.
5858	 */
5859	int new_layout;
5860
5861	if (mddev->pers != &raid5_personality)
5862		return ERR_PTR(-EINVAL);
5863	if (mddev->degraded > 1)
5864		return ERR_PTR(-EINVAL);
5865	if (mddev->raid_disks > 253)
5866		return ERR_PTR(-EINVAL);
5867	if (mddev->raid_disks < 3)
5868		return ERR_PTR(-EINVAL);
5869
5870	switch (mddev->layout) {
5871	case ALGORITHM_LEFT_ASYMMETRIC:
5872		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5873		break;
5874	case ALGORITHM_RIGHT_ASYMMETRIC:
5875		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5876		break;
5877	case ALGORITHM_LEFT_SYMMETRIC:
5878		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5879		break;
5880	case ALGORITHM_RIGHT_SYMMETRIC:
5881		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5882		break;
5883	case ALGORITHM_PARITY_0:
5884		new_layout = ALGORITHM_PARITY_0_6;
5885		break;
5886	case ALGORITHM_PARITY_N:
5887		new_layout = ALGORITHM_PARITY_N;
5888		break;
5889	default:
5890		return ERR_PTR(-EINVAL);
5891	}
5892	mddev->new_level = 6;
5893	mddev->new_layout = new_layout;
5894	mddev->delta_disks = 1;
5895	mddev->raid_disks += 1;
5896	return setup_conf(mddev);
5897}
5898
5899
5900static struct md_personality raid6_personality =
5901{
5902	.name		= "raid6",
5903	.level		= 6,
5904	.owner		= THIS_MODULE,
5905	.make_request	= make_request,
5906	.run		= run,
5907	.stop		= stop,
5908	.status		= status,
5909	.error_handler	= error,
5910	.hot_add_disk	= raid5_add_disk,
5911	.hot_remove_disk = raid5_remove_disk,
5912	.spare_active	= raid5_spare_active,
5913	.sync_request	= sync_request,
5914	.resize		= raid5_resize,
5915	.size		= raid5_size,
5916	.check_reshape	= raid6_check_reshape,
5917	.start_reshape  = raid5_start_reshape,
5918	.finish_reshape = raid5_finish_reshape,
5919	.quiesce	= raid5_quiesce,
5920	.takeover	= raid6_takeover,
5921};
5922static struct md_personality raid5_personality =
5923{
5924	.name		= "raid5",
5925	.level		= 5,
5926	.owner		= THIS_MODULE,
5927	.make_request	= make_request,
5928	.run		= run,
5929	.stop		= stop,
5930	.status		= status,
5931	.error_handler	= error,
5932	.hot_add_disk	= raid5_add_disk,
5933	.hot_remove_disk = raid5_remove_disk,
5934	.spare_active	= raid5_spare_active,
5935	.sync_request	= sync_request,
5936	.resize		= raid5_resize,
5937	.size		= raid5_size,
5938	.check_reshape	= raid5_check_reshape,
5939	.start_reshape  = raid5_start_reshape,
5940	.finish_reshape = raid5_finish_reshape,
5941	.quiesce	= raid5_quiesce,
5942	.takeover	= raid5_takeover,
5943};
5944
5945static struct md_personality raid4_personality =
5946{
5947	.name		= "raid4",
5948	.level		= 4,
5949	.owner		= THIS_MODULE,
5950	.make_request	= make_request,
5951	.run		= run,
5952	.stop		= stop,
5953	.status		= status,
5954	.error_handler	= error,
5955	.hot_add_disk	= raid5_add_disk,
5956	.hot_remove_disk = raid5_remove_disk,
5957	.spare_active	= raid5_spare_active,
5958	.sync_request	= sync_request,
5959	.resize		= raid5_resize,
5960	.size		= raid5_size,
5961	.check_reshape	= raid5_check_reshape,
5962	.start_reshape  = raid5_start_reshape,
5963	.finish_reshape = raid5_finish_reshape,
5964	.quiesce	= raid5_quiesce,
5965	.takeover	= raid4_takeover,
5966};
5967
5968static int __init raid5_init(void)
5969{
5970	register_md_personality(&raid6_personality);
5971	register_md_personality(&raid5_personality);
5972	register_md_personality(&raid4_personality);
5973	return 0;
5974}
5975
5976static void raid5_exit(void)
5977{
5978	unregister_md_personality(&raid6_personality);
5979	unregister_md_personality(&raid5_personality);
5980	unregister_md_personality(&raid4_personality);
5981}
5982
5983module_init(raid5_init);
5984module_exit(raid5_exit);
5985MODULE_LICENSE("GPL");
5986MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
5987MODULE_ALIAS("md-personality-4"); /* RAID5 */
5988MODULE_ALIAS("md-raid5");
5989MODULE_ALIAS("md-raid4");
5990MODULE_ALIAS("md-level-5");
5991MODULE_ALIAS("md-level-4");
5992MODULE_ALIAS("md-personality-8"); /* RAID6 */
5993MODULE_ALIAS("md-raid6");
5994MODULE_ALIAS("md-level-6");
5995
5996/* This used to be two separate modules, they were: */
5997MODULE_ALIAS("raid5");
5998MODULE_ALIAS("raid6");
5999