1/*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync:
17 *
18 *      - bitmap marked during normal i/o
19 *      - bitmap used to skip nondirty blocks during sync
20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code
23 *
24 * This program is free software; you can redistribute it and/or modify
25 * it under the terms of the GNU General Public License as published by
26 * the Free Software Foundation; either version 2, or (at your option)
27 * any later version.
28 *
29 * You should have received a copy of the GNU General Public License
30 * (for example /usr/src/linux/COPYING); if not, write to the Free
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */
33
34#include <linux/slab.h>
35#include <linux/delay.h>
36#include <linux/blkdev.h>
37#include <linux/module.h>
38#include <linux/seq_file.h>
39#include <linux/ratelimit.h>
40#include "md.h"
41#include "raid1.h"
42#include "bitmap.h"
43
44/*
45 * Number of guaranteed r1bios in case of extreme VM load:
46 */
47#define	NR_RAID1_BIOS 256
48
/* When there are this many requests queued to be written by
 * the raid1 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
53static int max_queued_requests = 1024;
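/*
 * Illustrative sketch (comment only, not compiled): the threshold is
 * consulted twice further down in this file, roughly as
 *
 *	if (conf->pending_count >= max_queued_requests)
 *		return 1;				// md_raid1_congested()
 *	...
 *	wait_event(conf->wait_barrier,
 *		   conf->pending_count < max_queued_requests);	// make_request()
 *
 * so the congested callback reports the backlog and the write path
 * throttles new writes until it drains.
 */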
54
55static void allow_barrier(struct r1conf *conf);
56static void lower_barrier(struct r1conf *conf);
57
58static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
59{
60	struct pool_info *pi = data;
61	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
62
63	/* allocate a r1bio with room for raid_disks entries in the bios array */
64	return kzalloc(size, gfp_flags);
65}
66
67static void r1bio_pool_free(void *r1_bio, void *data)
68{
69	kfree(r1_bio);
70}
71
72#define RESYNC_BLOCK_SIZE (64*1024)
73//#define RESYNC_BLOCK_SIZE PAGE_SIZE
74#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
75#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
76#define RESYNC_WINDOW (2048*1024)
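/*
 * Worked example, assuming 4 KiB pages: RESYNC_BLOCK_SIZE is 64 KiB,
 * so RESYNC_SECTORS = 65536 >> 9 = 128 sectors and
 * RESYNC_PAGES = (65536 + 4095) / 4096 = 16 pages per resync bio.
 */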
77
78static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
79{
80	struct pool_info *pi = data;
81	struct page *page;
82	struct r1bio *r1_bio;
83	struct bio *bio;
84	int i, j;
85
86	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
87	if (!r1_bio)
88		return NULL;
89
90	/*
91	 * Allocate bios : 1 for reading, n-1 for writing
92	 */
93	for (j = pi->raid_disks ; j-- ; ) {
94		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
95		if (!bio)
96			goto out_free_bio;
97		r1_bio->bios[j] = bio;
98	}
99	/*
100	 * Allocate RESYNC_PAGES data pages and attach them to
101	 * the first bio.
102	 * If this is a user-requested check/repair, allocate
103	 * RESYNC_PAGES for each bio.
104	 */
105	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
106		j = pi->raid_disks;
107	else
108		j = 1;
109	while(j--) {
110		bio = r1_bio->bios[j];
111		for (i = 0; i < RESYNC_PAGES; i++) {
112			page = alloc_page(gfp_flags);
113			if (unlikely(!page))
114				goto out_free_pages;
115
116			bio->bi_io_vec[i].bv_page = page;
117			bio->bi_vcnt = i+1;
118		}
119	}
	/* If not a user-requested check/repair, copy the page pointers to all bios */
121	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
122		for (i=0; i<RESYNC_PAGES ; i++)
123			for (j=1; j<pi->raid_disks; j++)
124				r1_bio->bios[j]->bi_io_vec[i].bv_page =
125					r1_bio->bios[0]->bi_io_vec[i].bv_page;
126	}
127
128	r1_bio->master_bio = NULL;
129
130	return r1_bio;
131
132out_free_pages:
133	for (j=0 ; j < pi->raid_disks; j++)
134		for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
135			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
136	j = -1;
137out_free_bio:
138	while (++j < pi->raid_disks)
139		bio_put(r1_bio->bios[j]);
140	r1bio_pool_free(r1_bio, data);
141	return NULL;
142}
143
144static void r1buf_pool_free(void *__r1_bio, void *data)
145{
146	struct pool_info *pi = data;
147	int i,j;
148	struct r1bio *r1bio = __r1_bio;
149
150	for (i = 0; i < RESYNC_PAGES; i++)
151		for (j = pi->raid_disks; j-- ;) {
152			if (j == 0 ||
153			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
154			    r1bio->bios[0]->bi_io_vec[i].bv_page)
155				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
156		}
157	for (i=0 ; i < pi->raid_disks; i++)
158		bio_put(r1bio->bios[i]);
159
160	r1bio_pool_free(r1bio, data);
161}
162
163static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
164{
165	int i;
166
167	for (i = 0; i < conf->raid_disks * 2; i++) {
168		struct bio **bio = r1_bio->bios + i;
169		if (!BIO_SPECIAL(*bio))
170			bio_put(*bio);
171		*bio = NULL;
172	}
173}
174
175static void free_r1bio(struct r1bio *r1_bio)
176{
177	struct r1conf *conf = r1_bio->mddev->private;
178
179	put_all_bios(conf, r1_bio);
180	mempool_free(r1_bio, conf->r1bio_pool);
181}
182
183static void put_buf(struct r1bio *r1_bio)
184{
185	struct r1conf *conf = r1_bio->mddev->private;
186	int i;
187
188	for (i = 0; i < conf->raid_disks * 2; i++) {
189		struct bio *bio = r1_bio->bios[i];
190		if (bio->bi_end_io)
191			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
192	}
193
194	mempool_free(r1_bio, conf->r1buf_pool);
195
196	lower_barrier(conf);
197}
198
199static void reschedule_retry(struct r1bio *r1_bio)
200{
201	unsigned long flags;
202	struct mddev *mddev = r1_bio->mddev;
203	struct r1conf *conf = mddev->private;
204
205	spin_lock_irqsave(&conf->device_lock, flags);
206	list_add(&r1_bio->retry_list, &conf->retry_list);
207	conf->nr_queued ++;
208	spin_unlock_irqrestore(&conf->device_lock, flags);
209
210	wake_up(&conf->wait_barrier);
211	md_wakeup_thread(mddev->thread);
212}
213
214/*
215 * raid_end_bio_io() is called when we have finished servicing a mirrored
216 * operation and are ready to return a success/failure code to the buffer
217 * cache layer.
218 */
219static void call_bio_endio(struct r1bio *r1_bio)
220{
221	struct bio *bio = r1_bio->master_bio;
222	int done;
223	struct r1conf *conf = r1_bio->mddev->private;
224
225	if (bio->bi_phys_segments) {
226		unsigned long flags;
227		spin_lock_irqsave(&conf->device_lock, flags);
228		bio->bi_phys_segments--;
229		done = (bio->bi_phys_segments == 0);
230		spin_unlock_irqrestore(&conf->device_lock, flags);
231	} else
232		done = 1;
233
234	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
235		clear_bit(BIO_UPTODATE, &bio->bi_flags);
236	if (done) {
237		bio_endio(bio, 0);
238		/*
239		 * Wake up any possible resync thread that waits for the device
240		 * to go idle.
241		 */
242		allow_barrier(conf);
243	}
244}
245
246static void raid_end_bio_io(struct r1bio *r1_bio)
247{
248	struct bio *bio = r1_bio->master_bio;
249
250	/* if nobody has done the final endio yet, do it now */
251	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
252		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
253			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
254			 (unsigned long long) bio->bi_sector,
255			 (unsigned long long) bio->bi_sector +
256			 (bio->bi_size >> 9) - 1);
257
258		call_bio_endio(r1_bio);
259	}
260	free_r1bio(r1_bio);
261}
262
263/*
264 * Update disk head position estimator based on IRQ completion info.
265 */
266static inline void update_head_pos(int disk, struct r1bio *r1_bio)
267{
268	struct r1conf *conf = r1_bio->mddev->private;
269
270	conf->mirrors[disk].head_position =
271		r1_bio->sector + (r1_bio->sectors);
272}
273
274/*
275 * Find the disk number which triggered given bio
276 */
277static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
278{
279	int mirror;
280	struct r1conf *conf = r1_bio->mddev->private;
281	int raid_disks = conf->raid_disks;
282
283	for (mirror = 0; mirror < raid_disks * 2; mirror++)
284		if (r1_bio->bios[mirror] == bio)
285			break;
286
287	BUG_ON(mirror == raid_disks * 2);
288	update_head_pos(mirror, r1_bio);
289
290	return mirror;
291}
292
293static void raid1_end_read_request(struct bio *bio, int error)
294{
295	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
296	struct r1bio *r1_bio = bio->bi_private;
297	int mirror;
298	struct r1conf *conf = r1_bio->mddev->private;
299
300	mirror = r1_bio->read_disk;
301	/*
302	 * this branch is our 'one mirror IO has finished' event handler:
303	 */
304	update_head_pos(mirror, r1_bio);
305
306	if (uptodate)
307		set_bit(R1BIO_Uptodate, &r1_bio->state);
308	else {
309		/* If all other devices have failed, we want to return
310		 * the error upwards rather than fail the last device.
311		 * Here we redefine "uptodate" to mean "Don't want to retry"
312		 */
313		unsigned long flags;
314		spin_lock_irqsave(&conf->device_lock, flags);
315		if (r1_bio->mddev->degraded == conf->raid_disks ||
316		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
317		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
318			uptodate = 1;
319		spin_unlock_irqrestore(&conf->device_lock, flags);
320	}
321
322	if (uptodate)
323		raid_end_bio_io(r1_bio);
324	else {
325		/*
326		 * oops, read error:
327		 */
328		char b[BDEVNAME_SIZE];
329		printk_ratelimited(
330			KERN_ERR "md/raid1:%s: %s: "
331			"rescheduling sector %llu\n",
332			mdname(conf->mddev),
333			bdevname(conf->mirrors[mirror].rdev->bdev,
334				 b),
335			(unsigned long long)r1_bio->sector);
336		set_bit(R1BIO_ReadError, &r1_bio->state);
337		reschedule_retry(r1_bio);
338	}
339
340	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
341}
342
343static void close_write(struct r1bio *r1_bio)
344{
345	/* it really is the end of this request */
346	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
347		/* free extra copy of the data pages */
348		int i = r1_bio->behind_page_count;
349		while (i--)
350			safe_put_page(r1_bio->behind_bvecs[i].bv_page);
351		kfree(r1_bio->behind_bvecs);
352		r1_bio->behind_bvecs = NULL;
353	}
354	/* clear the bitmap if all writes complete successfully */
355	bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
356			r1_bio->sectors,
357			!test_bit(R1BIO_Degraded, &r1_bio->state),
358			test_bit(R1BIO_BehindIO, &r1_bio->state));
359	md_write_end(r1_bio->mddev);
360}
361
362static void r1_bio_write_done(struct r1bio *r1_bio)
363{
364	if (!atomic_dec_and_test(&r1_bio->remaining))
365		return;
366
367	if (test_bit(R1BIO_WriteError, &r1_bio->state))
368		reschedule_retry(r1_bio);
369	else {
370		close_write(r1_bio);
371		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
372			reschedule_retry(r1_bio);
373		else
374			raid_end_bio_io(r1_bio);
375	}
376}
377
378static void raid1_end_write_request(struct bio *bio, int error)
379{
380	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
381	struct r1bio *r1_bio = bio->bi_private;
382	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
383	struct r1conf *conf = r1_bio->mddev->private;
384	struct bio *to_put = NULL;
385
386	mirror = find_bio_disk(r1_bio, bio);
387
388	/*
389	 * 'one mirror IO has finished' event handler:
390	 */
391	if (!uptodate) {
392		set_bit(WriteErrorSeen,
393			&conf->mirrors[mirror].rdev->flags);
394		if (!test_and_set_bit(WantReplacement,
395				      &conf->mirrors[mirror].rdev->flags))
396			set_bit(MD_RECOVERY_NEEDED, &
397				conf->mddev->recovery);
398
399		set_bit(R1BIO_WriteError, &r1_bio->state);
400	} else {
401		/*
402		 * Set R1BIO_Uptodate in our master bio, so that we
		 * will return a good error code to the higher
404		 * levels even if IO on some other mirrored buffer
405		 * fails.
406		 *
407		 * The 'master' represents the composite IO operation
408		 * to user-side. So if something waits for IO, then it
409		 * will wait for the 'master' bio.
410		 */
411		sector_t first_bad;
412		int bad_sectors;
413
414		r1_bio->bios[mirror] = NULL;
415		to_put = bio;
416		set_bit(R1BIO_Uptodate, &r1_bio->state);
417
418		/* Maybe we can clear some bad blocks. */
419		if (is_badblock(conf->mirrors[mirror].rdev,
420				r1_bio->sector, r1_bio->sectors,
421				&first_bad, &bad_sectors)) {
422			r1_bio->bios[mirror] = IO_MADE_GOOD;
423			set_bit(R1BIO_MadeGood, &r1_bio->state);
424		}
425	}
426
427	if (behind) {
428		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
429			atomic_dec(&r1_bio->behind_remaining);
430
431		/*
432		 * In behind mode, we ACK the master bio once the I/O
433		 * has safely reached all non-writemostly
434		 * disks. Setting the Returned bit ensures that this
435		 * gets done only once -- we don't ever want to return
436		 * -EIO here, instead we'll wait
437		 */
438		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
439		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
440			/* Maybe we can return now */
441			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
442				struct bio *mbio = r1_bio->master_bio;
443				pr_debug("raid1: behind end write sectors"
444					 " %llu-%llu\n",
445					 (unsigned long long) mbio->bi_sector,
446					 (unsigned long long) mbio->bi_sector +
447					 (mbio->bi_size >> 9) - 1);
448				call_bio_endio(r1_bio);
449			}
450		}
451	}
452	if (r1_bio->bios[mirror] == NULL)
453		rdev_dec_pending(conf->mirrors[mirror].rdev,
454				 conf->mddev);
455
456	/*
457	 * Let's see if all mirrored write operations have finished
458	 * already.
459	 */
460	r1_bio_write_done(r1_bio);
461
462	if (to_put)
463		bio_put(to_put);
464}
465
466
467/*
468 * This routine returns the disk from which the requested read should
469 * be done. There is a per-array 'next expected sequential IO' sector
470 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts; both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
475 *
476 * If there are 2 mirrors in the same 2 devices, performance degrades
477 * because position is mirror, not device based.
478 *
479 * The rdev for the device selected will have nr_pending incremented.
480 */
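/*
 * Illustrative sketch (comment only, not compiled) of the distance
 * heuristic implemented below, ignoring bad blocks and write-mostly
 * devices:
 *
 *	for each readable disk 'disk':
 *		dist = abs(this_sector - conf->mirrors[disk].head_position);
 *		if (sequential with conf->next_seq_sect || dist == 0 ||
 *		    atomic_read(&rdev->nr_pending) == 0)
 *			return disk;		// perfect or idle: stop looking
 *		if (dist < best_dist)
 *			best_dist = dist, best_disk = disk;
 *	return best_disk;
 */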
481static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
482{
483	const sector_t this_sector = r1_bio->sector;
484	int sectors;
485	int best_good_sectors;
486	int start_disk;
487	int best_disk;
488	int i;
489	sector_t best_dist;
490	struct md_rdev *rdev;
491	int choose_first;
492
493	rcu_read_lock();
494	/*
495	 * Check if we can balance. We can balance on the whole
496	 * device if no resync is going on, or below the resync window.
497	 * We take the first readable disk when above the resync window.
498	 */
499 retry:
500	sectors = r1_bio->sectors;
501	best_disk = -1;
502	best_dist = MaxSector;
503	best_good_sectors = 0;
504
505	if (conf->mddev->recovery_cp < MaxSector &&
506	    (this_sector + sectors >= conf->next_resync)) {
507		choose_first = 1;
508		start_disk = 0;
509	} else {
510		choose_first = 0;
511		start_disk = conf->last_used;
512	}
513
514	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
515		sector_t dist;
516		sector_t first_bad;
517		int bad_sectors;
518
519		int disk = start_disk + i;
520		if (disk >= conf->raid_disks)
521			disk -= conf->raid_disks;
522
523		rdev = rcu_dereference(conf->mirrors[disk].rdev);
524		if (r1_bio->bios[disk] == IO_BLOCKED
525		    || rdev == NULL
526		    || test_bit(Faulty, &rdev->flags))
527			continue;
528		if (!test_bit(In_sync, &rdev->flags) &&
529		    rdev->recovery_offset < this_sector + sectors)
530			continue;
531		if (test_bit(WriteMostly, &rdev->flags)) {
532			/* Don't balance among write-mostly, just
533			 * use the first as a last resort */
534			if (best_disk < 0) {
535				if (is_badblock(rdev, this_sector, sectors,
536						&first_bad, &bad_sectors)) {
537					if (first_bad < this_sector)
538						/* Cannot use this */
539						continue;
540					best_good_sectors = first_bad - this_sector;
541				} else
542					best_good_sectors = sectors;
543				best_disk = disk;
544			}
545			continue;
546		}
547		/* This is a reasonable device to use.  It might
548		 * even be best.
549		 */
550		if (is_badblock(rdev, this_sector, sectors,
551				&first_bad, &bad_sectors)) {
552			if (best_dist < MaxSector)
553				/* already have a better device */
554				continue;
555			if (first_bad <= this_sector) {
556				/* cannot read here. If this is the 'primary'
557				 * device, then we must not read beyond
558				 * bad_sectors from another device..
559				 */
560				bad_sectors -= (this_sector - first_bad);
561				if (choose_first && sectors > bad_sectors)
562					sectors = bad_sectors;
563				if (best_good_sectors > sectors)
564					best_good_sectors = sectors;
565
566			} else {
567				sector_t good_sectors = first_bad - this_sector;
568				if (good_sectors > best_good_sectors) {
569					best_good_sectors = good_sectors;
570					best_disk = disk;
571				}
572				if (choose_first)
573					break;
574			}
575			continue;
576		} else
577			best_good_sectors = sectors;
578
579		dist = abs(this_sector - conf->mirrors[disk].head_position);
580		if (choose_first
581		    /* Don't change to another disk for sequential reads */
582		    || conf->next_seq_sect == this_sector
583		    || dist == 0
584		    /* If device is idle, use it */
585		    || atomic_read(&rdev->nr_pending) == 0) {
586			best_disk = disk;
587			break;
588		}
589		if (dist < best_dist) {
590			best_dist = dist;
591			best_disk = disk;
592		}
593	}
594
595	if (best_disk >= 0) {
596		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
597		if (!rdev)
598			goto retry;
599		atomic_inc(&rdev->nr_pending);
600		if (test_bit(Faulty, &rdev->flags)) {
601			/* cannot risk returning a device that failed
602			 * before we inc'ed nr_pending
603			 */
604			rdev_dec_pending(rdev, conf->mddev);
605			goto retry;
606		}
607		sectors = best_good_sectors;
608		conf->next_seq_sect = this_sector + sectors;
609		conf->last_used = best_disk;
610	}
611	rcu_read_unlock();
612	*max_sectors = sectors;
613
614	return best_disk;
615}
616
617int md_raid1_congested(struct mddev *mddev, int bits)
618{
619	struct r1conf *conf = mddev->private;
620	int i, ret = 0;
621
622	if ((bits & (1 << BDI_async_congested)) &&
623	    conf->pending_count >= max_queued_requests)
624		return 1;
625
626	rcu_read_lock();
627	for (i = 0; i < conf->raid_disks; i++) {
628		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
629		if (rdev && !test_bit(Faulty, &rdev->flags)) {
630			struct request_queue *q = bdev_get_queue(rdev->bdev);
631
632			BUG_ON(!q);
633
634			/* Note the '|| 1' - when read_balance prefers
635			 * non-congested targets, it can be removed
636			 */
637			if ((bits & (1<<BDI_async_congested)) || 1)
638				ret |= bdi_congested(&q->backing_dev_info, bits);
639			else
640				ret &= bdi_congested(&q->backing_dev_info, bits);
641		}
642	}
643	rcu_read_unlock();
644	return ret;
645}
646EXPORT_SYMBOL_GPL(md_raid1_congested);
647
648static int raid1_congested(void *data, int bits)
649{
650	struct mddev *mddev = data;
651
652	return mddev_congested(mddev, bits) ||
653		md_raid1_congested(mddev, bits);
654}
655
656static void flush_pending_writes(struct r1conf *conf)
657{
658	/* Any writes that have been queued but are awaiting
659	 * bitmap updates get flushed here.
660	 */
661	spin_lock_irq(&conf->device_lock);
662
663	if (conf->pending_bio_list.head) {
664		struct bio *bio;
665		bio = bio_list_get(&conf->pending_bio_list);
666		conf->pending_count = 0;
667		spin_unlock_irq(&conf->device_lock);
668		/* flush any pending bitmap writes to
669		 * disk before proceeding w/ I/O */
670		bitmap_unplug(conf->mddev->bitmap);
671		wake_up(&conf->wait_barrier);
672
673		while (bio) { /* submit pending writes */
674			struct bio *next = bio->bi_next;
675			bio->bi_next = NULL;
676			generic_make_request(bio);
677			bio = next;
678		}
679	} else
680		spin_unlock_irq(&conf->device_lock);
681}
682
683/* Barriers....
684 * Sometimes we need to suspend IO while we do something else,
685 * either some resync/recovery, or reconfigure the array.
686 * To do this we raise a 'barrier'.
687 * The 'barrier' is a counter that can be raised multiple times
688 * to count how many activities are happening which preclude
689 * normal IO.
690 * We can only raise the barrier if there is no pending IO.
691 * i.e. if nr_pending == 0.
692 * We choose only to raise the barrier if no-one is waiting for the
693 * barrier to go down.  This means that as soon as an IO request
694 * is ready, no other operations which require a barrier will start
695 * until the IO request has had a chance.
696 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
703 */
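/*
 * Illustrative sketch (comment only, not compiled), assuming a
 * hypothetical do_resync_chunk() helper:
 *
 *	raise_barrier(conf);		// background path (resync/recovery)
 *	do_resync_chunk(conf);		// no regular IO can run here
 *	lower_barrier(conf);
 *
 *	wait_barrier(conf);		// regular IO path
 *	generic_make_request(bio);	// no background IO is active now
 *	...				// allow_barrier() runs on completion
 *
 * which matches how the resync code and make_request() below bracket
 * their work.
 */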
704#define RESYNC_DEPTH 32
705
706static void raise_barrier(struct r1conf *conf)
707{
708	spin_lock_irq(&conf->resync_lock);
709
710	/* Wait until no block IO is waiting */
711	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
712			    conf->resync_lock, );
713
714	/* block any new IO from starting */
715	conf->barrier++;
716
717	/* Now wait for all pending IO to complete */
718	wait_event_lock_irq(conf->wait_barrier,
719			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
720			    conf->resync_lock, );
721
722	spin_unlock_irq(&conf->resync_lock);
723}
724
725static void lower_barrier(struct r1conf *conf)
726{
727	unsigned long flags;
728	BUG_ON(conf->barrier <= 0);
729	spin_lock_irqsave(&conf->resync_lock, flags);
730	conf->barrier--;
731	spin_unlock_irqrestore(&conf->resync_lock, flags);
732	wake_up(&conf->wait_barrier);
733}
734
735static void wait_barrier(struct r1conf *conf)
736{
737	spin_lock_irq(&conf->resync_lock);
738	if (conf->barrier) {
739		conf->nr_waiting++;
740		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
741				    conf->resync_lock,
742				    );
743		conf->nr_waiting--;
744	}
745	conf->nr_pending++;
746	spin_unlock_irq(&conf->resync_lock);
747}
748
749static void allow_barrier(struct r1conf *conf)
750{
751	unsigned long flags;
752	spin_lock_irqsave(&conf->resync_lock, flags);
753	conf->nr_pending--;
754	spin_unlock_irqrestore(&conf->resync_lock, flags);
755	wake_up(&conf->wait_barrier);
756}
757
758static void freeze_array(struct r1conf *conf)
759{
	/* Stop sync IO and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+1.
764	 * This is called in the context of one normal IO request
765	 * that has failed. Thus any sync request that might be pending
766	 * will be blocked by nr_pending, and we need to wait for
767	 * pending IO requests to complete or be queued for re-try.
768	 * Thus the number queued (nr_queued) plus this request (1)
769	 * must match the number of pending IOs (nr_pending) before
770	 * we continue.
771	 */
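	/*
	 * Worked example: with three requests in flight (nr_pending == 3),
	 * two of them already parked on conf->retry_list (nr_queued == 2)
	 * and the third being the failing request that called us, the wait
	 * below releases as soon as nr_pending == nr_queued + 1 holds.
	 */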
772	spin_lock_irq(&conf->resync_lock);
773	conf->barrier++;
774	conf->nr_waiting++;
775	wait_event_lock_irq(conf->wait_barrier,
776			    conf->nr_pending == conf->nr_queued+1,
777			    conf->resync_lock,
778			    flush_pending_writes(conf));
779	spin_unlock_irq(&conf->resync_lock);
780}
781static void unfreeze_array(struct r1conf *conf)
782{
783	/* reverse the effect of the freeze */
784	spin_lock_irq(&conf->resync_lock);
785	conf->barrier--;
786	conf->nr_waiting--;
787	wake_up(&conf->wait_barrier);
788	spin_unlock_irq(&conf->resync_lock);
789}
790
791
792/* duplicate the data pages for behind I/O
793 */
794static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
795{
796	int i;
797	struct bio_vec *bvec;
798	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
799					GFP_NOIO);
800	if (unlikely(!bvecs))
801		return;
802
803	bio_for_each_segment(bvec, bio, i) {
804		bvecs[i] = *bvec;
805		bvecs[i].bv_page = alloc_page(GFP_NOIO);
806		if (unlikely(!bvecs[i].bv_page))
807			goto do_sync_io;
808		memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
809		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
810		kunmap(bvecs[i].bv_page);
811		kunmap(bvec->bv_page);
812	}
813	r1_bio->behind_bvecs = bvecs;
814	r1_bio->behind_page_count = bio->bi_vcnt;
815	set_bit(R1BIO_BehindIO, &r1_bio->state);
816	return;
817
818do_sync_io:
819	for (i = 0; i < bio->bi_vcnt; i++)
820		if (bvecs[i].bv_page)
821			put_page(bvecs[i].bv_page);
822	kfree(bvecs);
823	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
824}
825
826static void make_request(struct mddev *mddev, struct bio * bio)
827{
828	struct r1conf *conf = mddev->private;
829	struct mirror_info *mirror;
830	struct r1bio *r1_bio;
831	struct bio *read_bio;
832	int i, disks;
833	struct bitmap *bitmap;
834	unsigned long flags;
835	const int rw = bio_data_dir(bio);
836	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
837	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
838	struct md_rdev *blocked_rdev;
839	int plugged;
840	int first_clone;
841	int sectors_handled;
842	int max_sectors;
843
844	/*
845	 * Register the new request and wait if the reconstruction
	 * thread has put up a barrier for new requests.
847	 * Continue immediately if no resync is active currently.
848	 */
849
850	md_write_start(mddev, bio); /* wait on superblock update early */
851
852	if (bio_data_dir(bio) == WRITE &&
853	    bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
854	    bio->bi_sector < mddev->suspend_hi) {
855		/* As the suspend_* range is controlled by
856		 * userspace, we want an interruptible
857		 * wait.
858		 */
859		DEFINE_WAIT(w);
860		for (;;) {
861			flush_signals(current);
862			prepare_to_wait(&conf->wait_barrier,
863					&w, TASK_INTERRUPTIBLE);
864			if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
865			    bio->bi_sector >= mddev->suspend_hi)
866				break;
867			schedule();
868		}
869		finish_wait(&conf->wait_barrier, &w);
870	}
871
872	wait_barrier(conf);
873
874	bitmap = mddev->bitmap;
875
876	/*
877	 * make_request() can abort the operation when READA is being
878	 * used and no empty request is available.
879	 *
880	 */
881	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
882
883	r1_bio->master_bio = bio;
884	r1_bio->sectors = bio->bi_size >> 9;
885	r1_bio->state = 0;
886	r1_bio->mddev = mddev;
887	r1_bio->sector = bio->bi_sector;
888
889	/* We might need to issue multiple reads to different
890	 * devices if there are bad blocks around, so we keep
891	 * track of the number of reads in bio->bi_phys_segments.
892	 * If this is 0, there is only one r1_bio and no locking
893	 * will be needed when requests complete.  If it is
894	 * non-zero, then it is the number of not-completed requests.
895	 */
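	/*
	 * Worked example: a read that must stop short of a bad block is
	 * split into two r1_bios, so bi_phys_segments is set to 2 in the
	 * read_again/retry_write paths below; each completion decrements
	 * it and only the one reaching zero ends the master bio.
	 */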
896	bio->bi_phys_segments = 0;
897	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
898
899	if (rw == READ) {
900		/*
901		 * read balancing logic:
902		 */
903		int rdisk;
904
905read_again:
906		rdisk = read_balance(conf, r1_bio, &max_sectors);
907
908		if (rdisk < 0) {
909			/* couldn't find anywhere to read from */
910			raid_end_bio_io(r1_bio);
911			return;
912		}
913		mirror = conf->mirrors + rdisk;
914
915		if (test_bit(WriteMostly, &mirror->rdev->flags) &&
916		    bitmap) {
917			/* Reading from a write-mostly device must
918			 * take care not to over-take any writes
919			 * that are 'behind'
920			 */
921			wait_event(bitmap->behind_wait,
922				   atomic_read(&bitmap->behind_writes) == 0);
923		}
924		r1_bio->read_disk = rdisk;
925
926		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
927		md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
928			    max_sectors);
929
930		r1_bio->bios[rdisk] = read_bio;
931
932		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
933		read_bio->bi_bdev = mirror->rdev->bdev;
934		read_bio->bi_end_io = raid1_end_read_request;
935		read_bio->bi_rw = READ | do_sync;
936		read_bio->bi_private = r1_bio;
937
938		if (max_sectors < r1_bio->sectors) {
939			/* could not read all from this device, so we will
940			 * need another r1_bio.
941			 */
942
943			sectors_handled = (r1_bio->sector + max_sectors
944					   - bio->bi_sector);
945			r1_bio->sectors = max_sectors;
946			spin_lock_irq(&conf->device_lock);
947			if (bio->bi_phys_segments == 0)
948				bio->bi_phys_segments = 2;
949			else
950				bio->bi_phys_segments++;
951			spin_unlock_irq(&conf->device_lock);
952			/* Cannot call generic_make_request directly
953			 * as that will be queued in __make_request
954			 * and subsequent mempool_alloc might block waiting
955			 * for it.  So hand bio over to raid1d.
956			 */
957			reschedule_retry(r1_bio);
958
959			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
960
961			r1_bio->master_bio = bio;
962			r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
963			r1_bio->state = 0;
964			r1_bio->mddev = mddev;
965			r1_bio->sector = bio->bi_sector + sectors_handled;
966			goto read_again;
967		} else
968			generic_make_request(read_bio);
969		return;
970	}
971
972	/*
973	 * WRITE:
974	 */
975	if (conf->pending_count >= max_queued_requests) {
976		md_wakeup_thread(mddev->thread);
977		wait_event(conf->wait_barrier,
978			   conf->pending_count < max_queued_requests);
979	}
980	/* first select target devices under rcu_lock and
981	 * inc refcount on their rdev.  Record them by setting
982	 * bios[x] to bio
983	 * If there are known/acknowledged bad blocks on any device on
984	 * which we have seen a write error, we want to avoid writing those
985	 * blocks.
986	 * This potentially requires several writes to write around
	 * the bad blocks.  Each set of writes gets its own r1bio
988	 * with a set of bios attached.
989	 */
990	plugged = mddev_check_plugged(mddev);
991
992	disks = conf->raid_disks * 2;
993 retry_write:
994	blocked_rdev = NULL;
995	rcu_read_lock();
996	max_sectors = r1_bio->sectors;
997	for (i = 0;  i < disks; i++) {
998		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
999		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1000			atomic_inc(&rdev->nr_pending);
1001			blocked_rdev = rdev;
1002			break;
1003		}
1004		r1_bio->bios[i] = NULL;
1005		if (!rdev || test_bit(Faulty, &rdev->flags)) {
1006			if (i < conf->raid_disks)
1007				set_bit(R1BIO_Degraded, &r1_bio->state);
1008			continue;
1009		}
1010
1011		atomic_inc(&rdev->nr_pending);
1012		if (test_bit(WriteErrorSeen, &rdev->flags)) {
1013			sector_t first_bad;
1014			int bad_sectors;
1015			int is_bad;
1016
1017			is_bad = is_badblock(rdev, r1_bio->sector,
1018					     max_sectors,
1019					     &first_bad, &bad_sectors);
1020			if (is_bad < 0) {
1021				/* mustn't write here until the bad block is
				 * acknowledged */
1023				set_bit(BlockedBadBlocks, &rdev->flags);
1024				blocked_rdev = rdev;
1025				break;
1026			}
1027			if (is_bad && first_bad <= r1_bio->sector) {
1028				/* Cannot write here at all */
1029				bad_sectors -= (r1_bio->sector - first_bad);
1030				if (bad_sectors < max_sectors)
1031					/* mustn't write more than bad_sectors
1032					 * to other devices yet
1033					 */
1034					max_sectors = bad_sectors;
1035				rdev_dec_pending(rdev, mddev);
1036				/* We don't set R1BIO_Degraded as that
1037				 * only applies if the disk is
1038				 * missing, so it might be re-added,
1039				 * and we want to know to recover this
1040				 * chunk.
1041				 * In this case the device is here,
1042				 * and the fact that this chunk is not
1043				 * in-sync is recorded in the bad
1044				 * block log
1045				 */
1046				continue;
1047			}
1048			if (is_bad) {
1049				int good_sectors = first_bad - r1_bio->sector;
1050				if (good_sectors < max_sectors)
1051					max_sectors = good_sectors;
1052			}
1053		}
1054		r1_bio->bios[i] = bio;
1055	}
1056	rcu_read_unlock();
1057
1058	if (unlikely(blocked_rdev)) {
1059		/* Wait for this device to become unblocked */
1060		int j;
1061
1062		for (j = 0; j < i; j++)
1063			if (r1_bio->bios[j])
1064				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1065		r1_bio->state = 0;
1066		allow_barrier(conf);
1067		md_wait_for_blocked_rdev(blocked_rdev, mddev);
1068		wait_barrier(conf);
1069		goto retry_write;
1070	}
1071
1072	if (max_sectors < r1_bio->sectors) {
1073		/* We are splitting this write into multiple parts, so
1074		 * we need to prepare for allocating another r1_bio.
1075		 */
1076		r1_bio->sectors = max_sectors;
1077		spin_lock_irq(&conf->device_lock);
1078		if (bio->bi_phys_segments == 0)
1079			bio->bi_phys_segments = 2;
1080		else
1081			bio->bi_phys_segments++;
1082		spin_unlock_irq(&conf->device_lock);
1083	}
1084	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
1085
1086	atomic_set(&r1_bio->remaining, 1);
1087	atomic_set(&r1_bio->behind_remaining, 0);
1088
1089	first_clone = 1;
1090	for (i = 0; i < disks; i++) {
1091		struct bio *mbio;
1092		if (!r1_bio->bios[i])
1093			continue;
1094
1095		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1096		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
1097
1098		if (first_clone) {
1099			/* do behind I/O ?
1100			 * Not if there are too many, or cannot
1101			 * allocate memory, or a reader on WriteMostly
1102			 * is waiting for behind writes to flush */
1103			if (bitmap &&
1104			    (atomic_read(&bitmap->behind_writes)
1105			     < mddev->bitmap_info.max_write_behind) &&
1106			    !waitqueue_active(&bitmap->behind_wait))
1107				alloc_behind_pages(mbio, r1_bio);
1108
1109			bitmap_startwrite(bitmap, r1_bio->sector,
1110					  r1_bio->sectors,
1111					  test_bit(R1BIO_BehindIO,
1112						   &r1_bio->state));
1113			first_clone = 0;
1114		}
1115		if (r1_bio->behind_bvecs) {
1116			struct bio_vec *bvec;
1117			int j;
1118
1119			/* Yes, I really want the '__' version so that
1120			 * we clear any unused pointer in the io_vec, rather
1121			 * than leave them unchanged.  This is important
1122			 * because when we come to free the pages, we won't
1123			 * know the original bi_idx, so we just free
1124			 * them all
1125			 */
1126			__bio_for_each_segment(bvec, mbio, j, 0)
1127				bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
1128			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
1129				atomic_inc(&r1_bio->behind_remaining);
1130		}
1131
1132		r1_bio->bios[i] = mbio;
1133
1134		mbio->bi_sector	= (r1_bio->sector +
1135				   conf->mirrors[i].rdev->data_offset);
1136		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1137		mbio->bi_end_io	= raid1_end_write_request;
1138		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1139		mbio->bi_private = r1_bio;
1140
1141		atomic_inc(&r1_bio->remaining);
1142		spin_lock_irqsave(&conf->device_lock, flags);
1143		bio_list_add(&conf->pending_bio_list, mbio);
1144		conf->pending_count++;
1145		spin_unlock_irqrestore(&conf->device_lock, flags);
1146	}
1147	/* Mustn't call r1_bio_write_done before this next test,
1148	 * as it could result in the bio being freed.
1149	 */
1150	if (sectors_handled < (bio->bi_size >> 9)) {
1151		r1_bio_write_done(r1_bio);
1152		/* We need another r1_bio.  It has already been counted
1153		 * in bio->bi_phys_segments
1154		 */
1155		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1156		r1_bio->master_bio = bio;
1157		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1158		r1_bio->state = 0;
1159		r1_bio->mddev = mddev;
1160		r1_bio->sector = bio->bi_sector + sectors_handled;
1161		goto retry_write;
1162	}
1163
1164	r1_bio_write_done(r1_bio);
1165
1166	/* In case raid1d snuck in to freeze_array */
1167	wake_up(&conf->wait_barrier);
1168
1169	if (do_sync || !bitmap || !plugged)
1170		md_wakeup_thread(mddev->thread);
1171}
1172
1173static void status(struct seq_file *seq, struct mddev *mddev)
1174{
1175	struct r1conf *conf = mddev->private;
1176	int i;
1177
1178	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1179		   conf->raid_disks - mddev->degraded);
1180	rcu_read_lock();
1181	for (i = 0; i < conf->raid_disks; i++) {
1182		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1183		seq_printf(seq, "%s",
1184			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1185	}
1186	rcu_read_unlock();
1187	seq_printf(seq, "]");
1188}
1189
1190
1191static void error(struct mddev *mddev, struct md_rdev *rdev)
1192{
1193	char b[BDEVNAME_SIZE];
1194	struct r1conf *conf = mddev->private;
1195
1196	/*
	 * If it is not operational, then we have already marked it as dead;
	 * else if it is the last working disk, ignore the error and let the
	 * next level up know;
	 * else mark the drive as failed.
1201	 */
1202	if (test_bit(In_sync, &rdev->flags)
1203	    && (conf->raid_disks - mddev->degraded) == 1) {
1204		/*
1205		 * Don't fail the drive, act as though we were just a
1206		 * normal single drive.
1207		 * However don't try a recovery from this drive as
1208		 * it is very likely to fail.
1209		 */
1210		conf->recovery_disabled = mddev->recovery_disabled;
1211		return;
1212	}
1213	set_bit(Blocked, &rdev->flags);
1214	if (test_and_clear_bit(In_sync, &rdev->flags)) {
1215		unsigned long flags;
1216		spin_lock_irqsave(&conf->device_lock, flags);
1217		mddev->degraded++;
1218		set_bit(Faulty, &rdev->flags);
1219		spin_unlock_irqrestore(&conf->device_lock, flags);
1220		/*
1221		 * if recovery is running, make sure it aborts.
1222		 */
1223		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1224	} else
1225		set_bit(Faulty, &rdev->flags);
1226	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1227	printk(KERN_ALERT
1228	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
1229	       "md/raid1:%s: Operation continuing on %d devices.\n",
1230	       mdname(mddev), bdevname(rdev->bdev, b),
1231	       mdname(mddev), conf->raid_disks - mddev->degraded);
1232}
1233
1234static void print_conf(struct r1conf *conf)
1235{
1236	int i;
1237
1238	printk(KERN_DEBUG "RAID1 conf printout:\n");
1239	if (!conf) {
1240		printk(KERN_DEBUG "(!conf)\n");
1241		return;
1242	}
1243	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1244		conf->raid_disks);
1245
1246	rcu_read_lock();
1247	for (i = 0; i < conf->raid_disks; i++) {
1248		char b[BDEVNAME_SIZE];
1249		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1250		if (rdev)
1251			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1252			       i, !test_bit(In_sync, &rdev->flags),
1253			       !test_bit(Faulty, &rdev->flags),
1254			       bdevname(rdev->bdev,b));
1255	}
1256	rcu_read_unlock();
1257}
1258
1259static void close_sync(struct r1conf *conf)
1260{
1261	wait_barrier(conf);
1262	allow_barrier(conf);
1263
1264	mempool_destroy(conf->r1buf_pool);
1265	conf->r1buf_pool = NULL;
1266}
1267
1268static int raid1_spare_active(struct mddev *mddev)
1269{
1270	int i;
1271	struct r1conf *conf = mddev->private;
1272	int count = 0;
1273	unsigned long flags;
1274
1275	/*
1276	 * Find all failed disks within the RAID1 configuration
1277	 * and mark them readable.
1278	 * Called under mddev lock, so rcu protection not needed.
1279	 */
1280	for (i = 0; i < conf->raid_disks; i++) {
1281		struct md_rdev *rdev = conf->mirrors[i].rdev;
1282		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1283		if (repl
1284		    && repl->recovery_offset == MaxSector
1285		    && !test_bit(Faulty, &repl->flags)
1286		    && !test_and_set_bit(In_sync, &repl->flags)) {
1287			/* replacement has just become active */
1288			if (!rdev ||
1289			    !test_and_clear_bit(In_sync, &rdev->flags))
1290				count++;
1291			if (rdev) {
1292				/* Replaced device not technically
1293				 * faulty, but we need to be sure
1294				 * it gets removed and never re-added
1295				 */
1296				set_bit(Faulty, &rdev->flags);
1297				sysfs_notify_dirent_safe(
1298					rdev->sysfs_state);
1299			}
1300		}
1301		if (rdev
1302		    && !test_bit(Faulty, &rdev->flags)
1303		    && !test_and_set_bit(In_sync, &rdev->flags)) {
1304			count++;
1305			sysfs_notify_dirent_safe(rdev->sysfs_state);
1306		}
1307	}
1308	spin_lock_irqsave(&conf->device_lock, flags);
1309	mddev->degraded -= count;
1310	spin_unlock_irqrestore(&conf->device_lock, flags);
1311
1312	print_conf(conf);
1313	return count;
1314}
1315
1316
1317static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1318{
1319	struct r1conf *conf = mddev->private;
1320	int err = -EEXIST;
1321	int mirror = 0;
1322	struct mirror_info *p;
1323	int first = 0;
1324	int last = conf->raid_disks - 1;
1325
1326	if (mddev->recovery_disabled == conf->recovery_disabled)
1327		return -EBUSY;
1328
1329	if (rdev->raid_disk >= 0)
1330		first = last = rdev->raid_disk;
1331
1332	for (mirror = first; mirror <= last; mirror++) {
1333		p = conf->mirrors+mirror;
1334		if (!p->rdev) {
1335
1336			disk_stack_limits(mddev->gendisk, rdev->bdev,
1337					  rdev->data_offset << 9);
1338			/* as we don't honour merge_bvec_fn, we must
1339			 * never risk violating it, so limit
1340			 * ->max_segments to one lying with a single
1341			 * page, as a one page request is never in
1342			 * violation.
1343			 */
1344			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1345				blk_queue_max_segments(mddev->queue, 1);
1346				blk_queue_segment_boundary(mddev->queue,
1347							   PAGE_CACHE_SIZE - 1);
1348			}
1349
1350			p->head_position = 0;
1351			rdev->raid_disk = mirror;
1352			err = 0;
			/* As all devices are equivalent, we don't need a full recovery
			 * if this device was recently a member of the array.
			 */
1356			if (rdev->saved_raid_disk < 0)
1357				conf->fullsync = 1;
1358			rcu_assign_pointer(p->rdev, rdev);
1359			break;
1360		}
1361		if (test_bit(WantReplacement, &p->rdev->flags) &&
1362		    p[conf->raid_disks].rdev == NULL) {
1363			/* Add this device as a replacement */
1364			clear_bit(In_sync, &rdev->flags);
1365			set_bit(Replacement, &rdev->flags);
1366			rdev->raid_disk = mirror;
1367			err = 0;
1368			conf->fullsync = 1;
1369			rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1370			break;
1371		}
1372	}
1373	md_integrity_add_rdev(rdev, mddev);
1374	print_conf(conf);
1375	return err;
1376}
1377
1378static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1379{
1380	struct r1conf *conf = mddev->private;
1381	int err = 0;
1382	int number = rdev->raid_disk;
	struct mirror_info *p = conf->mirrors + number;
1384
1385	if (rdev != p->rdev)
1386		p = conf->mirrors + conf->raid_disks + number;
1387
1388	print_conf(conf);
1389	if (rdev == p->rdev) {
1390		if (test_bit(In_sync, &rdev->flags) ||
1391		    atomic_read(&rdev->nr_pending)) {
1392			err = -EBUSY;
1393			goto abort;
1394		}
1395		/* Only remove non-faulty devices if recovery
1396		 * is not possible.
1397		 */
1398		if (!test_bit(Faulty, &rdev->flags) &&
1399		    mddev->recovery_disabled != conf->recovery_disabled &&
1400		    mddev->degraded < conf->raid_disks) {
1401			err = -EBUSY;
1402			goto abort;
1403		}
1404		p->rdev = NULL;
1405		synchronize_rcu();
1406		if (atomic_read(&rdev->nr_pending)) {
1407			/* lost the race, try later */
1408			err = -EBUSY;
1409			p->rdev = rdev;
1410			goto abort;
1411		} else if (conf->mirrors[conf->raid_disks + number].rdev) {
1412			/* We just removed a device that is being replaced.
1413			 * Move down the replacement.  We drain all IO before
1414			 * doing this to avoid confusion.
1415			 */
1416			struct md_rdev *repl =
1417				conf->mirrors[conf->raid_disks + number].rdev;
1418			raise_barrier(conf);
1419			clear_bit(Replacement, &repl->flags);
1420			p->rdev = repl;
1421			conf->mirrors[conf->raid_disks + number].rdev = NULL;
1422			lower_barrier(conf);
1423			clear_bit(WantReplacement, &rdev->flags);
1424		} else
1425			clear_bit(WantReplacement, &rdev->flags);
1426		err = md_integrity_register(mddev);
1427	}
1428abort:
1429
1430	print_conf(conf);
1431	return err;
1432}
1433
1434
1435static void end_sync_read(struct bio *bio, int error)
1436{
1437	struct r1bio *r1_bio = bio->bi_private;
1438
1439	update_head_pos(r1_bio->read_disk, r1_bio);
1440
1441	/*
1442	 * we have read a block, now it needs to be re-written,
1443	 * or re-read if the read failed.
1444	 * We don't do much here, just schedule handling by raid1d
1445	 */
1446	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1447		set_bit(R1BIO_Uptodate, &r1_bio->state);
1448
1449	if (atomic_dec_and_test(&r1_bio->remaining))
1450		reschedule_retry(r1_bio);
1451}
1452
1453static void end_sync_write(struct bio *bio, int error)
1454{
1455	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1456	struct r1bio *r1_bio = bio->bi_private;
1457	struct mddev *mddev = r1_bio->mddev;
1458	struct r1conf *conf = mddev->private;
1459	int mirror=0;
1460	sector_t first_bad;
1461	int bad_sectors;
1462
1463	mirror = find_bio_disk(r1_bio, bio);
1464
1465	if (!uptodate) {
1466		sector_t sync_blocks = 0;
1467		sector_t s = r1_bio->sector;
1468		long sectors_to_go = r1_bio->sectors;
		/* make sure these bits don't get cleared. */
1470		do {
1471			bitmap_end_sync(mddev->bitmap, s,
1472					&sync_blocks, 1);
1473			s += sync_blocks;
1474			sectors_to_go -= sync_blocks;
1475		} while (sectors_to_go > 0);
1476		set_bit(WriteErrorSeen,
1477			&conf->mirrors[mirror].rdev->flags);
1478		if (!test_and_set_bit(WantReplacement,
1479				      &conf->mirrors[mirror].rdev->flags))
1480			set_bit(MD_RECOVERY_NEEDED, &
1481				mddev->recovery);
1482		set_bit(R1BIO_WriteError, &r1_bio->state);
1483	} else if (is_badblock(conf->mirrors[mirror].rdev,
1484			       r1_bio->sector,
1485			       r1_bio->sectors,
1486			       &first_bad, &bad_sectors) &&
1487		   !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1488				r1_bio->sector,
1489				r1_bio->sectors,
1490				&first_bad, &bad_sectors)
1491		)
1492		set_bit(R1BIO_MadeGood, &r1_bio->state);
1493
1494	if (atomic_dec_and_test(&r1_bio->remaining)) {
1495		int s = r1_bio->sectors;
1496		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1497		    test_bit(R1BIO_WriteError, &r1_bio->state))
1498			reschedule_retry(r1_bio);
1499		else {
1500			put_buf(r1_bio);
1501			md_done_sync(mddev, s, uptodate);
1502		}
1503	}
1504}
1505
1506static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
1507			    int sectors, struct page *page, int rw)
1508{
1509	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1510		/* success */
1511		return 1;
1512	if (rw == WRITE) {
1513		set_bit(WriteErrorSeen, &rdev->flags);
1514		if (!test_and_set_bit(WantReplacement,
1515				      &rdev->flags))
1516			set_bit(MD_RECOVERY_NEEDED, &
1517				rdev->mddev->recovery);
1518	}
1519	/* need to record an error - either for the block or the device */
1520	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1521		md_error(rdev->mddev, rdev);
1522	return 0;
1523}
1524
1525static int fix_sync_read_error(struct r1bio *r1_bio)
1526{
1527	/* Try some synchronous reads of other devices to get
1528	 * good data, much like with normal read errors.  Only
1529	 * read into the pages we already have so we don't
1530	 * need to re-issue the read request.
1531	 * We don't need to freeze the array, because being in an
1532	 * active sync request, there is no normal IO, and
1533	 * no overlapping syncs.
1534	 * We don't need to check is_badblock() again as we
1535	 * made sure that anything with a bad block in range
1536	 * will have bi_end_io clear.
1537	 */
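	/*
	 * Illustrative sketch (comment only) of the loop below:
	 *
	 *	for each PAGE_SIZE chunk of the r1_bio:
	 *		try sync_page_io(rdev, sect, s << 9, page, READ)
	 *		    on each disk in turn until one succeeds;
	 *		write the good page back to the other in-sync
	 *		    disks, then re-read it there to verify.
	 */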
1538	struct mddev *mddev = r1_bio->mddev;
1539	struct r1conf *conf = mddev->private;
1540	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1541	sector_t sect = r1_bio->sector;
1542	int sectors = r1_bio->sectors;
1543	int idx = 0;
1544
1545	while(sectors) {
1546		int s = sectors;
1547		int d = r1_bio->read_disk;
1548		int success = 0;
1549		struct md_rdev *rdev;
1550		int start;
1551
1552		if (s > (PAGE_SIZE>>9))
1553			s = PAGE_SIZE >> 9;
1554		do {
1555			if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
				/* No rcu protection needed here; devices
				 * can only be removed when no resync is
				 * active, and resync is currently active.
1559				 */
1560				rdev = conf->mirrors[d].rdev;
1561				if (sync_page_io(rdev, sect, s<<9,
1562						 bio->bi_io_vec[idx].bv_page,
1563						 READ, false)) {
1564					success = 1;
1565					break;
1566				}
1567			}
1568			d++;
1569			if (d == conf->raid_disks * 2)
1570				d = 0;
1571		} while (!success && d != r1_bio->read_disk);
1572
1573		if (!success) {
1574			char b[BDEVNAME_SIZE];
1575			int abort = 0;
1576			/* Cannot read from anywhere, this block is lost.
1577			 * Record a bad block on each device.  If that doesn't
1578			 * work just disable and interrupt the recovery.
1579			 * Don't fail devices as that won't really help.
1580			 */
1581			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1582			       " for block %llu\n",
1583			       mdname(mddev),
1584			       bdevname(bio->bi_bdev, b),
1585			       (unsigned long long)r1_bio->sector);
1586			for (d = 0; d < conf->raid_disks * 2; d++) {
1587				rdev = conf->mirrors[d].rdev;
1588				if (!rdev || test_bit(Faulty, &rdev->flags))
1589					continue;
1590				if (!rdev_set_badblocks(rdev, sect, s, 0))
1591					abort = 1;
1592			}
1593			if (abort) {
1594				conf->recovery_disabled =
1595					mddev->recovery_disabled;
1596				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1597				md_done_sync(mddev, r1_bio->sectors, 0);
1598				put_buf(r1_bio);
1599				return 0;
1600			}
1601			/* Try next page */
1602			sectors -= s;
1603			sect += s;
1604			idx++;
1605			continue;
1606		}
1607
1608		start = d;
1609		/* write it back and re-read */
1610		while (d != r1_bio->read_disk) {
1611			if (d == 0)
1612				d = conf->raid_disks * 2;
1613			d--;
1614			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1615				continue;
1616			rdev = conf->mirrors[d].rdev;
1617			if (r1_sync_page_io(rdev, sect, s,
1618					    bio->bi_io_vec[idx].bv_page,
1619					    WRITE) == 0) {
1620				r1_bio->bios[d]->bi_end_io = NULL;
1621				rdev_dec_pending(rdev, mddev);
1622			}
1623		}
1624		d = start;
1625		while (d != r1_bio->read_disk) {
1626			if (d == 0)
1627				d = conf->raid_disks * 2;
1628			d--;
1629			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1630				continue;
1631			rdev = conf->mirrors[d].rdev;
1632			if (r1_sync_page_io(rdev, sect, s,
1633					    bio->bi_io_vec[idx].bv_page,
1634					    READ) != 0)
1635				atomic_add(s, &rdev->corrected_errors);
1636		}
1637		sectors -= s;
1638		sect += s;
1639		idx ++;
1640	}
1641	set_bit(R1BIO_Uptodate, &r1_bio->state);
1642	set_bit(BIO_UPTODATE, &bio->bi_flags);
1643	return 1;
1644}
1645
1646static int process_checks(struct r1bio *r1_bio)
1647{
1648	/* We have read all readable devices.  If we haven't
1649	 * got the block, then there is no hope left.
1650	 * If we have, then we want to do a comparison
1651	 * and skip the write if everything is the same.
1652	 * If any blocks failed to read, then we need to
1653	 * attempt an over-write
1654	 */
1655	struct mddev *mddev = r1_bio->mddev;
1656	struct r1conf *conf = mddev->private;
1657	int primary;
1658	int i;
1659
1660	for (primary = 0; primary < conf->raid_disks * 2; primary++)
1661		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1662		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1663			r1_bio->bios[primary]->bi_end_io = NULL;
1664			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1665			break;
1666		}
1667	r1_bio->read_disk = primary;
1668	for (i = 0; i < conf->raid_disks * 2; i++) {
1669		int j;
1670		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1671		struct bio *pbio = r1_bio->bios[primary];
1672		struct bio *sbio = r1_bio->bios[i];
1673		int size;
1674
1675		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1676			continue;
1677
1678		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1679			for (j = vcnt; j-- ; ) {
1680				struct page *p, *s;
1681				p = pbio->bi_io_vec[j].bv_page;
1682				s = sbio->bi_io_vec[j].bv_page;
1683				if (memcmp(page_address(p),
1684					   page_address(s),
1685					   PAGE_SIZE))
1686					break;
1687			}
1688		} else
1689			j = 0;
1690		if (j >= 0)
1691			mddev->resync_mismatches += r1_bio->sectors;
1692		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1693			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1694			/* No need to write to this device. */
1695			sbio->bi_end_io = NULL;
1696			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1697			continue;
1698		}
1699		/* fixup the bio for reuse */
1700		sbio->bi_vcnt = vcnt;
1701		sbio->bi_size = r1_bio->sectors << 9;
1702		sbio->bi_idx = 0;
1703		sbio->bi_phys_segments = 0;
1704		sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1705		sbio->bi_flags |= 1 << BIO_UPTODATE;
1706		sbio->bi_next = NULL;
1707		sbio->bi_sector = r1_bio->sector +
1708			conf->mirrors[i].rdev->data_offset;
1709		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1710		size = sbio->bi_size;
1711		for (j = 0; j < vcnt ; j++) {
1712			struct bio_vec *bi;
1713			bi = &sbio->bi_io_vec[j];
1714			bi->bv_offset = 0;
1715			if (size > PAGE_SIZE)
1716				bi->bv_len = PAGE_SIZE;
1717			else
1718				bi->bv_len = size;
1719			size -= PAGE_SIZE;
1720			memcpy(page_address(bi->bv_page),
1721			       page_address(pbio->bi_io_vec[j].bv_page),
1722			       PAGE_SIZE);
1723		}
1724	}
1725	return 0;
1726}
1727
1728static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1729{
1730	struct r1conf *conf = mddev->private;
1731	int i;
1732	int disks = conf->raid_disks * 2;
1733	struct bio *bio, *wbio;
1734
1735	bio = r1_bio->bios[r1_bio->read_disk];
1736
1737	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1738		/* ouch - failed to read all of that. */
1739		if (!fix_sync_read_error(r1_bio))
1740			return;
1741
1742	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1743		if (process_checks(r1_bio) < 0)
1744			return;
1745	/*
1746	 * schedule writes
1747	 */
1748	atomic_set(&r1_bio->remaining, 1);
1749	for (i = 0; i < disks ; i++) {
1750		wbio = r1_bio->bios[i];
1751		if (wbio->bi_end_io == NULL ||
1752		    (wbio->bi_end_io == end_sync_read &&
1753		     (i == r1_bio->read_disk ||
1754		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1755			continue;
1756
1757		wbio->bi_rw = WRITE;
1758		wbio->bi_end_io = end_sync_write;
1759		atomic_inc(&r1_bio->remaining);
1760		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1761
1762		generic_make_request(wbio);
1763	}
1764
1765	if (atomic_dec_and_test(&r1_bio->remaining)) {
1766		/* if we're here, all write(s) have completed, so clean up */
1767		md_done_sync(mddev, r1_bio->sectors, 1);
1768		put_buf(r1_bio);
1769	}
1770}
1771
1772/*
1773 * This is a kernel thread which:
1774 *
1775 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
1777 *	3.	Performs writes following reads for array synchronising.
1778 */
1779
1780static void fix_read_error(struct r1conf *conf, int read_disk,
1781			   sector_t sect, int sectors)
1782{
1783	struct mddev *mddev = conf->mddev;
1784	while(sectors) {
1785		int s = sectors;
1786		int d = read_disk;
1787		int success = 0;
1788		int start;
1789		struct md_rdev *rdev;
1790
1791		if (s > (PAGE_SIZE>>9))
1792			s = PAGE_SIZE >> 9;
1793
1794		do {
1795			/* Note: no rcu protection needed here
1796			 * as this is synchronous in the raid1d thread
1797			 * which is the thread that might remove
1798			 * a device.  If raid1d ever becomes multi-threaded....
1799			 */
1800			sector_t first_bad;
1801			int bad_sectors;
1802
1803			rdev = conf->mirrors[d].rdev;
1804			if (rdev &&
1805			    test_bit(In_sync, &rdev->flags) &&
1806			    is_badblock(rdev, sect, s,
1807					&first_bad, &bad_sectors) == 0 &&
1808			    sync_page_io(rdev, sect, s<<9,
1809					 conf->tmppage, READ, false))
1810				success = 1;
1811			else {
1812				d++;
1813				if (d == conf->raid_disks * 2)
1814					d = 0;
1815			}
1816		} while (!success && d != read_disk);
1817
1818		if (!success) {
1819			/* Cannot read from anywhere - mark it bad */
1820			struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
1821			if (!rdev_set_badblocks(rdev, sect, s, 0))
1822				md_error(mddev, rdev);
1823			break;
1824		}
1825		/* write it back and re-read */
1826		start = d;
1827		while (d != read_disk) {
1828			if (d==0)
1829				d = conf->raid_disks * 2;
1830			d--;
1831			rdev = conf->mirrors[d].rdev;
1832			if (rdev &&
1833			    test_bit(In_sync, &rdev->flags))
1834				r1_sync_page_io(rdev, sect, s,
1835						conf->tmppage, WRITE);
1836		}
1837		d = start;
1838		while (d != read_disk) {
1839			char b[BDEVNAME_SIZE];
1840			if (d==0)
1841				d = conf->raid_disks * 2;
1842			d--;
1843			rdev = conf->mirrors[d].rdev;
1844			if (rdev &&
1845			    test_bit(In_sync, &rdev->flags)) {
1846				if (r1_sync_page_io(rdev, sect, s,
1847						    conf->tmppage, READ)) {
1848					atomic_add(s, &rdev->corrected_errors);
1849					printk(KERN_INFO
1850					       "md/raid1:%s: read error corrected "
1851					       "(%d sectors at %llu on %s)\n",
1852					       mdname(mddev), s,
1853					       (unsigned long long)(sect +
1854					           rdev->data_offset),
1855					       bdevname(rdev->bdev, b));
1856				}
1857			}
1858		}
1859		sectors -= s;
1860		sect += s;
1861	}
1862}
1863
1864static void bi_complete(struct bio *bio, int error)
1865{
1866	complete((struct completion *)bio->bi_private);
1867}
1868
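/*
 * Submit a bio (forced to REQ_SYNC) and wait for it to complete.
 * Returns non-zero if it finished with BIO_UPTODATE set.
 */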
1869static int submit_bio_wait(int rw, struct bio *bio)
1870{
1871	struct completion event;
1872	rw |= REQ_SYNC;
1873
1874	init_completion(&event);
1875	bio->bi_private = &event;
1876	bio->bi_end_io = bi_complete;
1877	submit_bio(rw, bio);
1878	wait_for_completion(&event);
1879
1880	return test_bit(BIO_UPTODATE, &bio->bi_flags);
1881}
1882
1883static int narrow_write_error(struct r1bio *r1_bio, int i)
1884{
1885	struct mddev *mddev = r1_bio->mddev;
1886	struct r1conf *conf = mddev->private;
1887	struct md_rdev *rdev = conf->mirrors[i].rdev;
1888	int vcnt, idx;
1889	struct bio_vec *vec;
1890
1891	/* bio has the data to be written to device 'i' where
1892	 * we just recently had a write error.
1893	 * We repeatedly clone the bio and trim down to one block,
1894	 * then try the write.  Where the write fails we record
1895	 * a bad block.
1896	 * It is conceivable that the bio doesn't exactly align with
1897	 * blocks.  We must handle this somehow.
1898	 *
1899	 * We currently own a reference on the rdev.
1900	 */
1901
1902	int block_sectors;
1903	sector_t sector;
1904	int sectors;
1905	int sect_to_write = r1_bio->sectors;
1906	int ok = 1;
1907
1908	if (rdev->badblocks.shift < 0)
1909		return 0;
1910
1911	block_sectors = 1 << rdev->badblocks.shift;
1912	sector = r1_bio->sector;
1913	sectors = ((sector + block_sectors)
1914		   & ~(sector_t)(block_sectors - 1))
1915		- sector;
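	/* 'sectors' is the distance from 'sector' up to the next
	 * block_sectors boundary, so the first write ends on a bad-block
	 * granularity boundary; e.g. with block_sectors == 8 and
	 * sector == 21, (21 + 8) & ~7 == 24, so sectors == 3.
	 */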
1916
1917	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
1918		vcnt = r1_bio->behind_page_count;
1919		vec = r1_bio->behind_bvecs;
1920		idx = 0;
1921		while (vec[idx].bv_page == NULL)
1922			idx++;
1923	} else {
1924		vcnt = r1_bio->master_bio->bi_vcnt;
1925		vec = r1_bio->master_bio->bi_io_vec;
1926		idx = r1_bio->master_bio->bi_idx;
1927	}
1928	while (sect_to_write) {
1929		struct bio *wbio;
1930		if (sectors > sect_to_write)
1931			sectors = sect_to_write;
1932		/* Write at 'sector' for 'sectors' */
1933
1934		wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
1935		memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
1936		wbio->bi_sector = r1_bio->sector;
1937		wbio->bi_rw = WRITE;
1938		wbio->bi_vcnt = vcnt;
1939		wbio->bi_size = r1_bio->sectors << 9;
1940		wbio->bi_idx = idx;
1941
1942		md_trim_bio(wbio, sector - r1_bio->sector, sectors);
1943		wbio->bi_sector += rdev->data_offset;
1944		wbio->bi_bdev = rdev->bdev;
1945		if (submit_bio_wait(WRITE, wbio) == 0)
1946			/* failure! */
1947			ok = rdev_set_badblocks(rdev, sector,
1948						sectors, 0)
1949				&& ok;
1950
1951		bio_put(wbio);
1952		sect_to_write -= sectors;
1953		sector += sectors;
1954		sectors = block_sectors;
1955	}
1956	return ok;
1957}
1958
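/*
 * A resync/recovery write completed with MadeGood or WriteError set:
 * clear bad blocks behind writes that succeeded, record new bad blocks
 * (or fail the device if recording fails) behind writes that did not,
 * then account the completed sync.
 */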
1959static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
1960{
1961	int m;
1962	int s = r1_bio->sectors;
1963	for (m = 0; m < conf->raid_disks * 2 ; m++) {
1964		struct md_rdev *rdev = conf->mirrors[m].rdev;
1965		struct bio *bio = r1_bio->bios[m];
1966		if (bio->bi_end_io == NULL)
1967			continue;
1968		if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1969		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
1970			rdev_clear_badblocks(rdev, r1_bio->sector, s);
1971		}
1972		if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1973		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
1974			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
1975				md_error(conf->mddev, rdev);
1976		}
1977	}
1978	put_buf(r1_bio);
1979	md_done_sync(conf->mddev, s, 1);
1980}
1981
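/*
 * A normal write completed with MadeGood or WriteError set: clear bad
 * blocks that IO_MADE_GOOD writes have repaired, and for drives that
 * returned a write error try to narrow it down to precise bad blocks,
 * failing the device if that is not possible.
 */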
1982static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
1983{
1984	int m;
1985	for (m = 0; m < conf->raid_disks * 2 ; m++)
1986		if (r1_bio->bios[m] == IO_MADE_GOOD) {
1987			struct md_rdev *rdev = conf->mirrors[m].rdev;
1988			rdev_clear_badblocks(rdev,
1989					     r1_bio->sector,
1990					     r1_bio->sectors);
1991			rdev_dec_pending(rdev, conf->mddev);
1992		} else if (r1_bio->bios[m] != NULL) {
1993			/* This drive got a write error.  We need to
1994			 * narrow down and record precise write
1995			 * errors.
1996			 */
1997			if (!narrow_write_error(r1_bio, m)) {
1998				md_error(conf->mddev,
1999					 conf->mirrors[m].rdev);
2000				/* an I/O failed, we can't clear the bitmap */
2001				set_bit(R1BIO_Degraded, &r1_bio->state);
2002			}
2003			rdev_dec_pending(conf->mirrors[m].rdev,
2004					 conf->mddev);
2005		}
2006	if (test_bit(R1BIO_WriteError, &r1_bio->state))
2007		close_write(r1_bio);
2008	raid_end_bio_io(r1_bio);
2009}
2010
2011static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2012{
2013	int disk;
2014	int max_sectors;
2015	struct mddev *mddev = conf->mddev;
2016	struct bio *bio;
2017	char b[BDEVNAME_SIZE];
2018	struct md_rdev *rdev;
2019
2020	clear_bit(R1BIO_ReadError, &r1_bio->state);
2021	/* We got a read error.  Maybe the drive is bad, or maybe just
2022	 * this block, in which case we can fix it.
2023	 * We freeze all other IO, and try reading the block from
2024	 * other devices.  When we find one that works, we re-write
2025	 * the data and check that this fixes the read error.
2026	 * This is all done synchronously while the array is
2027	 * frozen.
2028	 */
2029	if (mddev->ro == 0) {
2030		freeze_array(conf);
2031		fix_read_error(conf, r1_bio->read_disk,
2032			       r1_bio->sector, r1_bio->sectors);
2033		unfreeze_array(conf);
2034	} else
2035		md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
2036
2037	bio = r1_bio->bios[r1_bio->read_disk];
2038	bdevname(bio->bi_bdev, b);
2039read_more:
2040	disk = read_balance(conf, r1_bio, &max_sectors);
2041	if (disk == -1) {
2042		printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
2043		       " read error for block %llu\n",
2044		       mdname(mddev), b, (unsigned long long)r1_bio->sector);
2045		raid_end_bio_io(r1_bio);
2046	} else {
2047		const unsigned long do_sync
2048			= r1_bio->master_bio->bi_rw & REQ_SYNC;
2049		if (bio) {
2050			r1_bio->bios[r1_bio->read_disk] =
2051				mddev->ro ? IO_BLOCKED : NULL;
2052			bio_put(bio);
2053		}
2054		r1_bio->read_disk = disk;
2055		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
2056		md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
2057		r1_bio->bios[r1_bio->read_disk] = bio;
2058		rdev = conf->mirrors[disk].rdev;
2059		printk_ratelimited(KERN_ERR
2060				   "md/raid1:%s: redirecting sector %llu"
2061				   " to other mirror: %s\n",
2062				   mdname(mddev),
2063				   (unsigned long long)r1_bio->sector,
2064				   bdevname(rdev->bdev, b));
2065		bio->bi_sector = r1_bio->sector + rdev->data_offset;
2066		bio->bi_bdev = rdev->bdev;
2067		bio->bi_end_io = raid1_end_read_request;
2068		bio->bi_rw = READ | do_sync;
2069		bio->bi_private = r1_bio;
2070		if (max_sectors < r1_bio->sectors) {
2071			/* Drat - have to split this up more */
2072			struct bio *mbio = r1_bio->master_bio;
2073			int sectors_handled = (r1_bio->sector + max_sectors
2074					       - mbio->bi_sector);
2075			r1_bio->sectors = max_sectors;
2076			spin_lock_irq(&conf->device_lock);
2077			if (mbio->bi_phys_segments == 0)
2078				mbio->bi_phys_segments = 2;
2079			else
2080				mbio->bi_phys_segments++;
2081			spin_unlock_irq(&conf->device_lock);
2082			generic_make_request(bio);
2083			bio = NULL;
2084
2085			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
2086
2087			r1_bio->master_bio = mbio;
2088			r1_bio->sectors = (mbio->bi_size >> 9)
2089					  - sectors_handled;
2090			r1_bio->state = 0;
2091			set_bit(R1BIO_ReadError, &r1_bio->state);
2092			r1_bio->mddev = mddev;
2093			r1_bio->sector = mbio->bi_sector + sectors_handled;
2094
2095			goto read_more;
2096		} else
2097			generic_make_request(bio);
2098	}
2099}
2100
2101static void raid1d(struct mddev *mddev)
2102{
2103	struct r1bio *r1_bio;
2104	unsigned long flags;
2105	struct r1conf *conf = mddev->private;
2106	struct list_head *head = &conf->retry_list;
2107	struct blk_plug plug;
2108
2109	md_check_recovery(mddev);
2110
2111	blk_start_plug(&plug);
2112	for (;;) {
2113
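		/* Unless md-level plugging is holding them back, push out any
		 * writes queued for this thread before handling retries. */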
2114		if (atomic_read(&mddev->plug_cnt) == 0)
2115			flush_pending_writes(conf);
2116
2117		spin_lock_irqsave(&conf->device_lock, flags);
2118		if (list_empty(head)) {
2119			spin_unlock_irqrestore(&conf->device_lock, flags);
2120			break;
2121		}
2122		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
2123		list_del(head->prev);
2124		conf->nr_queued--;
2125		spin_unlock_irqrestore(&conf->device_lock, flags);
2126
2127		mddev = r1_bio->mddev;
2128		conf = mddev->private;
2129		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
2130			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
2131			    test_bit(R1BIO_WriteError, &r1_bio->state))
2132				handle_sync_write_finished(conf, r1_bio);
2133			else
2134				sync_request_write(mddev, r1_bio);
2135		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
2136			   test_bit(R1BIO_WriteError, &r1_bio->state))
2137			handle_write_finished(conf, r1_bio);
2138		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
2139			handle_read_error(conf, r1_bio);
2140		else
2141			/* just a partial read to be scheduled from a
2142			 * separate context
2143			 */
2144			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
2145
2146		cond_resched();
2147		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2148			md_check_recovery(mddev);
2149	}
2150	blk_finish_plug(&plug);
2151}
2152
2153
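/* Create the pool of resync buffers: enough to cover RESYNC_WINDOW. */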
2154static int init_resync(struct r1conf *conf)
2155{
2156	int buffs;
2157
2158	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2159	BUG_ON(conf->r1buf_pool);
2160	conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
2161					  conf->poolinfo);
2162	if (!conf->r1buf_pool)
2163		return -ENOMEM;
2164	conf->next_resync = 0;
2165	return 0;
2166}
2167
2168/*
2169 * perform a "sync" on one "block"
2170 *
2171 * We need to make sure that no normal I/O request - particularly write
2172 * requests - conflict with active sync requests.
2173 *
2174 * This is achieved by tracking pending requests and a 'barrier' concept
2175 * that can be installed to exclude normal IO requests.
2176 */
2177
2178static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
2179{
2180	struct r1conf *conf = mddev->private;
2181	struct r1bio *r1_bio;
2182	struct bio *bio;
2183	sector_t max_sector, nr_sectors;
2184	int disk = -1;
2185	int i;
2186	int wonly = -1;
2187	int write_targets = 0, read_targets = 0;
2188	sector_t sync_blocks;
2189	int still_degraded = 0;
2190	int good_sectors = RESYNC_SECTORS;
2191	int min_bad = 0; /* number of sectors that are bad in all devices */
2192
2193	if (!conf->r1buf_pool)
2194		if (init_resync(conf))
2195			return 0;
2196
2197	max_sector = mddev->dev_sectors;
2198	if (sector_nr >= max_sector) {
2199		/* If we aborted, we need to abort the
2200		 * sync on the 'current' bitmap chunk (there will
2201		 * only be one in raid1 resync).
2202		 * We can find the current address in mddev->curr_resync.
2203		 */
2204		if (mddev->curr_resync < max_sector) /* aborted */
2205			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2206						&sync_blocks, 1);
2207		else /* completed sync */
2208			conf->fullsync = 0;
2209
2210		bitmap_close_sync(mddev->bitmap);
2211		close_sync(conf);
2212		return 0;
2213	}
2214
2215	if (mddev->bitmap == NULL &&
2216	    mddev->recovery_cp == MaxSector &&
2217	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2218	    conf->fullsync == 0) {
2219		*skipped = 1;
2220		return max_sector - sector_nr;
2221	}
2222	/* before building a request, check if we can skip these blocks.
2223	 * This call to bitmap_start_sync doesn't actually record anything.
2224	 */
2225	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
2226	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2227		/* We can skip this block, and probably several more */
2228		*skipped = 1;
2229		return sync_blocks;
2230	}
2231	/*
2232	 * If there is non-resync activity waiting for a turn,
2233	 * and resync is going fast enough,
2234	 * then let it through before starting on this new sync request.
2235	 */
2236	if (!go_faster && conf->nr_waiting)
2237		msleep_interruptible(1000);
2238
2239	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2240	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
2241	raise_barrier(conf);
2242
2243	conf->next_resync = sector_nr;
2244
2245	rcu_read_lock();
2246	/*
2247	 * If we get a correctable read error during resync or recovery,
2248	 * we might want to read from a different device.  So we
2249	 * flag all drives that could conceivably be read from for READ,
2250	 * and any others (which will be non-In_sync devices) for WRITE.
2251	 * If a read fails, we try reading from something else for which READ
2252	 * is OK.
2253	 */
2254
2255	r1_bio->mddev = mddev;
2256	r1_bio->sector = sector_nr;
2257	r1_bio->state = 0;
2258	set_bit(R1BIO_IsSync, &r1_bio->state);
2259
2260	for (i = 0; i < conf->raid_disks * 2; i++) {
2261		struct md_rdev *rdev;
2262		bio = r1_bio->bios[i];
2263
2264		/* take from bio_init */
2265		bio->bi_next = NULL;
2266		bio->bi_flags &= ~(BIO_POOL_MASK-1);
2267		bio->bi_flags |= 1 << BIO_UPTODATE;
2268		bio->bi_rw = READ;
2269		bio->bi_vcnt = 0;
2270		bio->bi_idx = 0;
2271		bio->bi_phys_segments = 0;
2272		bio->bi_size = 0;
2273		bio->bi_end_io = NULL;
2274		bio->bi_private = NULL;
2275
2276		rdev = rcu_dereference(conf->mirrors[i].rdev);
2277		if (rdev == NULL ||
2278		    test_bit(Faulty, &rdev->flags)) {
2279			if (i < conf->raid_disks)
2280				still_degraded = 1;
2281		} else if (!test_bit(In_sync, &rdev->flags)) {
2282			bio->bi_rw = WRITE;
2283			bio->bi_end_io = end_sync_write;
2284			write_targets ++;
2285		} else {
2286			/* may need to read from here */
2287			sector_t first_bad = MaxSector;
2288			int bad_sectors;
2289
2290			if (is_badblock(rdev, sector_nr, good_sectors,
2291					&first_bad, &bad_sectors)) {
2292				if (first_bad > sector_nr)
2293					good_sectors = first_bad - sector_nr;
2294				else {
2295					bad_sectors -= (sector_nr - first_bad);
2296					if (min_bad == 0 ||
2297					    min_bad > bad_sectors)
2298						min_bad = bad_sectors;
2299				}
2300			}
2301			if (sector_nr < first_bad) {
2302				if (test_bit(WriteMostly, &rdev->flags)) {
2303					if (wonly < 0)
2304						wonly = i;
2305				} else {
2306					if (disk < 0)
2307						disk = i;
2308				}
2309				bio->bi_rw = READ;
2310				bio->bi_end_io = end_sync_read;
2311				read_targets++;
2312			}
2313		}
2314		if (bio->bi_end_io) {
2315			atomic_inc(&rdev->nr_pending);
2316			bio->bi_sector = sector_nr + rdev->data_offset;
2317			bio->bi_bdev = rdev->bdev;
2318			bio->bi_private = r1_bio;
2319		}
2320	}
2321	rcu_read_unlock();
2322	if (disk < 0)
2323		disk = wonly;
2324	r1_bio->read_disk = disk;
2325
2326	if (read_targets == 0 && min_bad > 0) {
2327		/* These sectors are bad on all InSync devices, so we
2328		 * need to mark them bad on all write targets
2329		 */
2330		int ok = 1;
2331		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
2332			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2333				struct md_rdev *rdev =
2334					rcu_dereference(conf->mirrors[i].rdev);
2335				ok = rdev_set_badblocks(rdev, sector_nr,
2336							min_bad, 0
2337					) && ok;
2338			}
2339		set_bit(MD_CHANGE_DEVS, &mddev->flags);
2340		*skipped = 1;
2341		put_buf(r1_bio);
2342
2343		if (!ok) {
2344			/* Cannot record the badblocks, so need to
2345			 * abort the resync.
2346			 * If there are multiple read targets, could just
2347			 * fail the really bad ones ???
2348			 */
2349			conf->recovery_disabled = mddev->recovery_disabled;
2350			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2351			return 0;
2352		} else
2353			return min_bad;
2354
2355	}
2356	if (min_bad > 0 && min_bad < good_sectors) {
2357		/* only resync enough to reach the next bad->good
2358		 * transition */
2359		good_sectors = min_bad;
2360	}
2361
2362	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
2363		/* extra read targets are also write targets */
2364		write_targets += read_targets-1;
2365
2366	if (write_targets == 0 || read_targets == 0) {
2367		/* There is nowhere to write, so all non-sync
2368		 * drives must be failed - so we are finished
2369		 */
2370		sector_t rv = max_sector - sector_nr;
2371		*skipped = 1;
2372		put_buf(r1_bio);
2373		return rv;
2374	}
2375
2376	if (max_sector > mddev->resync_max)
2377		max_sector = mddev->resync_max; /* Don't do IO beyond here */
2378	if (max_sector > sector_nr + good_sectors)
2379		max_sector = sector_nr + good_sectors;
2380	nr_sectors = 0;
2381	sync_blocks = 0;
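	/* Fill the bios a page at a time, stopping at max_sector, at the
	 * end of the current bitmap sync chunk, or when a bio fills up. */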
2382	do {
2383		struct page *page;
2384		int len = PAGE_SIZE;
2385		if (sector_nr + (len>>9) > max_sector)
2386			len = (max_sector - sector_nr) << 9;
2387		if (len == 0)
2388			break;
2389		if (sync_blocks == 0) {
2390			if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2391					       &sync_blocks, still_degraded) &&
2392			    !conf->fullsync &&
2393			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2394				break;
2395			BUG_ON(sync_blocks < (PAGE_SIZE>>9));
2396			if ((len >> 9) > sync_blocks)
2397				len = sync_blocks<<9;
2398		}
2399
2400		for (i = 0 ; i < conf->raid_disks * 2; i++) {
2401			bio = r1_bio->bios[i];
2402			if (bio->bi_end_io) {
2403				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2404				if (bio_add_page(bio, page, len, 0) == 0) {
2405					/* stop here */
2406					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2407					while (i > 0) {
2408						i--;
2409						bio = r1_bio->bios[i];
2410						if (bio->bi_end_io==NULL)
2411							continue;
2412						/* remove last page from this bio */
2413						bio->bi_vcnt--;
2414						bio->bi_size -= len;
2415						bio->bi_flags &= ~(1<< BIO_SEG_VALID);
2416					}
2417					goto bio_full;
2418				}
2419			}
2420		}
2421		nr_sectors += len>>9;
2422		sector_nr += len>>9;
2423		sync_blocks -= (len>>9);
2424	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
2425 bio_full:
2426	r1_bio->sectors = nr_sectors;
2427
2428	/* For a user-requested sync, we read all readable devices and do a
2429	 * compare
2430	 */
2431	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2432		atomic_set(&r1_bio->remaining, read_targets);
2433		for (i = 0; i < conf->raid_disks * 2; i++) {
2434			bio = r1_bio->bios[i];
2435			if (bio->bi_end_io == end_sync_read) {
2436				md_sync_acct(bio->bi_bdev, nr_sectors);
2437				generic_make_request(bio);
2438			}
2439		}
2440	} else {
2441		atomic_set(&r1_bio->remaining, 1);
2442		bio = r1_bio->bios[r1_bio->read_disk];
2443		md_sync_acct(bio->bi_bdev, nr_sectors);
2444		generic_make_request(bio);
2445
2446	}
2447	return nr_sectors;
2448}
2449
2450static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
2451{
2452	if (sectors)
2453		return sectors;
2454
2455	return mddev->dev_sectors;
2456}
2457
2458static struct r1conf *setup_conf(struct mddev *mddev)
2459{
2460	struct r1conf *conf;
2461	int i;
2462	struct mirror_info *disk;
2463	struct md_rdev *rdev;
2464	int err = -ENOMEM;
2465
2466	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
2467	if (!conf)
2468		goto abort;
2469
2470	conf->mirrors = kzalloc(sizeof(struct mirror_info)
2471				* mddev->raid_disks * 2,
2472				 GFP_KERNEL);
2473	if (!conf->mirrors)
2474		goto abort;
2475
2476	conf->tmppage = alloc_page(GFP_KERNEL);
2477	if (!conf->tmppage)
2478		goto abort;
2479
2480	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2481	if (!conf->poolinfo)
2482		goto abort;
2483	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
2484	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2485					  r1bio_pool_free,
2486					  conf->poolinfo);
2487	if (!conf->r1bio_pool)
2488		goto abort;
2489
2490	conf->poolinfo->mddev = mddev;
2491
2492	err = -EINVAL;
2493	spin_lock_init(&conf->device_lock);
2494	list_for_each_entry(rdev, &mddev->disks, same_set) {
2495		int disk_idx = rdev->raid_disk;
2496		if (disk_idx >= mddev->raid_disks
2497		    || disk_idx < 0)
2498			continue;
2499		if (test_bit(Replacement, &rdev->flags))
2500			disk = conf->mirrors + conf->raid_disks + disk_idx;
2501		else
2502			disk = conf->mirrors + disk_idx;
2503
2504		if (disk->rdev)
2505			goto abort;
2506		disk->rdev = rdev;
2507
2508		disk->head_position = 0;
2509	}
2510	conf->raid_disks = mddev->raid_disks;
2511	conf->mddev = mddev;
2512	INIT_LIST_HEAD(&conf->retry_list);
2513
2514	spin_lock_init(&conf->resync_lock);
2515	init_waitqueue_head(&conf->wait_barrier);
2516
2517	bio_list_init(&conf->pending_bio_list);
2518	conf->pending_count = 0;
2519	conf->recovery_disabled = mddev->recovery_disabled - 1;
2520
2521	err = -EIO;
2522	conf->last_used = -1;
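	/* Walk the slots: let a lone replacement stand in for a missing
	 * original, decide whether a full resync is needed, and pick the
	 * first fully in-sync disk as the starting point for read
	 * balancing.
	 */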
2523	for (i = 0; i < conf->raid_disks * 2; i++) {
2524
2525		disk = conf->mirrors + i;
2526
2527		if (i < conf->raid_disks &&
2528		    disk[conf->raid_disks].rdev) {
2529			/* This slot has a replacement. */
2530			if (!disk->rdev) {
2531				/* No original, just make the replacement
2532				 * a recovering spare
2533				 */
2534				disk->rdev =
2535					disk[conf->raid_disks].rdev;
2536				disk[conf->raid_disks].rdev = NULL;
2537			} else if (!test_bit(In_sync, &disk->rdev->flags))
2538				/* Original is not in_sync - bad */
2539				goto abort;
2540		}
2541
2542		if (!disk->rdev ||
2543		    !test_bit(In_sync, &disk->rdev->flags)) {
2544			disk->head_position = 0;
2545			if (disk->rdev)
2546				conf->fullsync = 1;
2547		} else if (conf->last_used < 0)
2548			/*
2549			 * The first working device is used as a
2550			 * starting point for read balancing.
2551			 */
2552			conf->last_used = i;
2553	}
2554
2555	if (conf->last_used < 0) {
2556		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2557		       mdname(mddev));
2558		goto abort;
2559	}
2560	err = -ENOMEM;
2561	conf->thread = md_register_thread(raid1d, mddev, NULL);
2562	if (!conf->thread) {
2563		printk(KERN_ERR
2564		       "md/raid1:%s: couldn't allocate thread\n",
2565		       mdname(mddev));
2566		goto abort;
2567	}
2568
2569	return conf;
2570
2571 abort:
2572	if (conf) {
2573		if (conf->r1bio_pool)
2574			mempool_destroy(conf->r1bio_pool);
2575		kfree(conf->mirrors);
2576		safe_put_page(conf->tmppage);
2577		kfree(conf->poolinfo);
2578		kfree(conf);
2579	}
2580	return ERR_PTR(err);
2581}
2582
2583static int run(struct mddev *mddev)
2584{
2585	struct r1conf *conf;
2586	int i;
2587	struct md_rdev *rdev;
2588
2589	if (mddev->level != 1) {
2590		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2591		       mdname(mddev), mddev->level);
2592		return -EIO;
2593	}
2594	if (mddev->reshape_position != MaxSector) {
2595		printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2596		       mdname(mddev));
2597		return -EIO;
2598	}
2599	/*
2600	 * copy the already verified devices into our private RAID1
2601	 * bookkeeping area. [whatever we allocate in run()
2602	 * should be freed in stop()]
2603	 */
2604	if (mddev->private == NULL)
2605		conf = setup_conf(mddev);
2606	else
2607		conf = mddev->private;
2608
2609	if (IS_ERR(conf))
2610		return PTR_ERR(conf);
2611
2612	list_for_each_entry(rdev, &mddev->disks, same_set) {
2613		if (!mddev->gendisk)
2614			continue;
2615		disk_stack_limits(mddev->gendisk, rdev->bdev,
2616				  rdev->data_offset << 9);
2617		/* as we don't honour merge_bvec_fn, we must never risk
2618		 * violating it, so limit ->max_segments to 1, lying within
2619		 * a single page, as a one-page request is never in violation.
2620		 */
2621		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2622			blk_queue_max_segments(mddev->queue, 1);
2623			blk_queue_segment_boundary(mddev->queue,
2624						   PAGE_CACHE_SIZE - 1);
2625		}
2626	}
2627
2628	mddev->degraded = 0;
2629	for (i=0; i < conf->raid_disks; i++)
2630		if (conf->mirrors[i].rdev == NULL ||
2631		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2632		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2633			mddev->degraded++;
2634
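	/* With only one working device there is nothing to resync
	 * against, so consider the array clean. */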
2635	if (conf->raid_disks - mddev->degraded == 1)
2636		mddev->recovery_cp = MaxSector;
2637
2638	if (mddev->recovery_cp != MaxSector)
2639		printk(KERN_NOTICE "md/raid1:%s: not clean"
2640		       " -- starting background reconstruction\n",
2641		       mdname(mddev));
2642	printk(KERN_INFO
2643		"md/raid1:%s: active with %d out of %d mirrors\n",
2644		mdname(mddev), mddev->raid_disks - mddev->degraded,
2645		mddev->raid_disks);
2646
2647	/*
2648	 * Ok, everything is just fine now
2649	 */
2650	mddev->thread = conf->thread;
2651	conf->thread = NULL;
2652	mddev->private = conf;
2653
2654	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2655
2656	if (mddev->queue) {
2657		mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2658		mddev->queue->backing_dev_info.congested_data = mddev;
2659	}
2660	return md_integrity_register(mddev);
2661}
2662
2663static int stop(struct mddev *mddev)
2664{
2665	struct r1conf *conf = mddev->private;
2666	struct bitmap *bitmap = mddev->bitmap;
2667
2668	/* wait for behind writes to complete */
2669	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2670		printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2671		       mdname(mddev));
2672		/* need to kick something here to make sure I/O goes? */
2673		wait_event(bitmap->behind_wait,
2674			   atomic_read(&bitmap->behind_writes) == 0);
2675	}
2676
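	/* Cycle the barrier so any in-flight normal IO completes before we
	 * tear the array down. */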
2677	raise_barrier(conf);
2678	lower_barrier(conf);
2679
2680	md_unregister_thread(&mddev->thread);
2681	if (conf->r1bio_pool)
2682		mempool_destroy(conf->r1bio_pool);
2683	kfree(conf->mirrors);
2684	kfree(conf->poolinfo);
2685	kfree(conf);
2686	mddev->private = NULL;
2687	return 0;
2688}
2689
2690static int raid1_resize(struct mddev *mddev, sector_t sectors)
2691{
2692	/* no resync is happening, and there is enough space
2693	 * on all devices, so we can resize.
2694	 * We need to make sure resync covers any new space.
2695	 * If the array is shrinking we should possibly wait until
2696	 * any io in the removed space completes, but it hardly seems
2697	 * worth it.
2698	 */
2699	md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2700	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2701		return -EINVAL;
2702	set_capacity(mddev->gendisk, mddev->array_sectors);
2703	revalidate_disk(mddev->gendisk);
2704	if (sectors > mddev->dev_sectors &&
2705	    mddev->recovery_cp > mddev->dev_sectors) {
2706		mddev->recovery_cp = mddev->dev_sectors;
2707		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2708	}
2709	mddev->dev_sectors = sectors;
2710	mddev->resync_max_sectors = sectors;
2711	return 0;
2712}
2713
2714static int raid1_reshape(struct mddev *mddev)
2715{
2716	/* We need to:
2717	 * 1/ resize the r1bio_pool
2718	 * 2/ resize conf->mirrors
2719	 *
2720	 * We allocate a new r1bio_pool if we can.
2721	 * Then raise a device barrier and wait until all IO stops.
2722	 * Then resize conf->mirrors and swap in the new r1bio pool.
2723	 *
2724	 * At the same time, we "pack" the devices so that all the missing
2725	 * devices have the higher raid_disk numbers.
2726	 */
2727	mempool_t *newpool, *oldpool;
2728	struct pool_info *newpoolinfo;
2729	struct mirror_info *newmirrors;
2730	struct r1conf *conf = mddev->private;
2731	int cnt, raid_disks;
2732	unsigned long flags;
2733	int d, d2, err;
2734
2735	/* Cannot change chunk_size, layout, or level */
2736	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
2737	    mddev->layout != mddev->new_layout ||
2738	    mddev->level != mddev->new_level) {
2739		mddev->new_chunk_sectors = mddev->chunk_sectors;
2740		mddev->new_layout = mddev->layout;
2741		mddev->new_level = mddev->level;
2742		return -EINVAL;
2743	}
2744
2745	err = md_allow_write(mddev);
2746	if (err)
2747		return err;
2748
2749	raid_disks = mddev->raid_disks + mddev->delta_disks;
2750
2751	if (raid_disks < conf->raid_disks) {
2752		cnt=0;
2753		for (d= 0; d < conf->raid_disks; d++)
2754			if (conf->mirrors[d].rdev)
2755				cnt++;
2756		if (cnt > raid_disks)
2757			return -EBUSY;
2758	}
2759
2760	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
2761	if (!newpoolinfo)
2762		return -ENOMEM;
2763	newpoolinfo->mddev = mddev;
2764	newpoolinfo->raid_disks = raid_disks * 2;
2765
2766	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2767				 r1bio_pool_free, newpoolinfo);
2768	if (!newpool) {
2769		kfree(newpoolinfo);
2770		return -ENOMEM;
2771	}
2772	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
2773			     GFP_KERNEL);
2774	if (!newmirrors) {
2775		kfree(newpoolinfo);
2776		mempool_destroy(newpool);
2777		return -ENOMEM;
2778	}
2779
2780	raise_barrier(conf);
2781
2782	/* ok, everything is stopped */
2783	oldpool = conf->r1bio_pool;
2784	conf->r1bio_pool = newpool;
2785
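	/* Pack the surviving devices into the lowest slots (d2), updating
	 * each rdev's raid_disk and its sysfs link to match. */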
2786	for (d = d2 = 0; d < conf->raid_disks; d++) {
2787		struct md_rdev *rdev = conf->mirrors[d].rdev;
2788		if (rdev && rdev->raid_disk != d2) {
2789			sysfs_unlink_rdev(mddev, rdev);
2790			rdev->raid_disk = d2;
2791			sysfs_unlink_rdev(mddev, rdev);
2792			if (sysfs_link_rdev(mddev, rdev))
2793				printk(KERN_WARNING
2794				       "md/raid1:%s: cannot register rd%d\n",
2795				       mdname(mddev), rdev->raid_disk);
2796		}
2797		if (rdev)
2798			newmirrors[d2++].rdev = rdev;
2799	}
2800	kfree(conf->mirrors);
2801	conf->mirrors = newmirrors;
2802	kfree(conf->poolinfo);
2803	conf->poolinfo = newpoolinfo;
2804
2805	spin_lock_irqsave(&conf->device_lock, flags);
2806	mddev->degraded += (raid_disks - conf->raid_disks);
2807	spin_unlock_irqrestore(&conf->device_lock, flags);
2808	conf->raid_disks = mddev->raid_disks = raid_disks;
2809	mddev->delta_disks = 0;
2810
2811	conf->last_used = 0; /* just make sure it is in-range */
2812	lower_barrier(conf);
2813
2814	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2815	md_wakeup_thread(mddev->thread);
2816
2817	mempool_destroy(oldpool);
2818	return 0;
2819}
2820
2821static void raid1_quiesce(struct mddev *mddev, int state)
2822{
2823	struct r1conf *conf = mddev->private;
2824
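	/* 1: freeze normal IO by raising the barrier; 0: thaw it again;
	 * 2: just wake up anyone waiting on the barrier so a pending
	 * suspend can make progress. */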
2825	switch(state) {
2826	case 2: /* wake for suspend */
2827		wake_up(&conf->wait_barrier);
2828		break;
2829	case 1:
2830		raise_barrier(conf);
2831		break;
2832	case 0:
2833		lower_barrier(conf);
2834		break;
2835	}
2836}
2837
2838static void *raid1_takeover(struct mddev *mddev)
2839{
2840	/* raid1 can take over:
2841	 *  raid5 with 2 devices, any layout or chunk size
2842	 */
2843	if (mddev->level == 5 && mddev->raid_disks == 2) {
2844		struct r1conf *conf;
2845		mddev->new_level = 1;
2846		mddev->new_layout = 0;
2847		mddev->new_chunk_sectors = 0;
2848		conf = setup_conf(mddev);
2849		if (!IS_ERR(conf))
2850			conf->barrier = 1;
2851		return conf;
2852	}
2853	return ERR_PTR(-EINVAL);
2854}
2855
2856static struct md_personality raid1_personality =
2857{
2858	.name		= "raid1",
2859	.level		= 1,
2860	.owner		= THIS_MODULE,
2861	.make_request	= make_request,
2862	.run		= run,
2863	.stop		= stop,
2864	.status		= status,
2865	.error_handler	= error,
2866	.hot_add_disk	= raid1_add_disk,
2867	.hot_remove_disk= raid1_remove_disk,
2868	.spare_active	= raid1_spare_active,
2869	.sync_request	= sync_request,
2870	.resize		= raid1_resize,
2871	.size		= raid1_size,
2872	.check_reshape	= raid1_reshape,
2873	.quiesce	= raid1_quiesce,
2874	.takeover	= raid1_takeover,
2875};
2876
2877static int __init raid_init(void)
2878{
2879	return register_md_personality(&raid1_personality);
2880}
2881
2882static void raid_exit(void)
2883{
2884	unregister_md_personality(&raid1_personality);
2885}
2886
2887module_init(raid_init);
2888module_exit(raid_exit);
2889MODULE_LICENSE("GPL");
2890MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2891MODULE_ALIAS("md-personality-3"); /* RAID1 */
2892MODULE_ALIAS("md-raid1");
2893MODULE_ALIAS("md-level-1");
2894
2895module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
2896