1/*
2   md.c : Multiple Devices driver for Linux
3     Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5     completely rewritten, based on the MD driver code from Marc Zyngier
6
7   Changes:
8
9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13   - kmod support by: Cyrus Durgin
14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17   - lots of fixes and improvements to the RAID1/RAID5 and generic
18     RAID code (such as request based resynchronization):
19
20     Neil Brown <neilb@cse.unsw.edu.au>.
21
22   - persistent bitmap code
23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25   This program is free software; you can redistribute it and/or modify
26   it under the terms of the GNU General Public License as published by
27   the Free Software Foundation; either version 2, or (at your option)
28   any later version.
29
30   You should have received a copy of the GNU General Public License
31   (for example /usr/src/linux/COPYING); if not, write to the Free
32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33*/
34
35#include <linux/kthread.h>
36#include <linux/blkdev.h>
37#include <linux/sysctl.h>
38#include <linux/seq_file.h>
39#include <linux/fs.h>
40#include <linux/poll.h>
41#include <linux/ctype.h>
42#include <linux/string.h>
43#include <linux/hdreg.h>
44#include <linux/proc_fs.h>
45#include <linux/random.h>
46#include <linux/module.h>
47#include <linux/reboot.h>
48#include <linux/file.h>
49#include <linux/compat.h>
50#include <linux/delay.h>
51#include <linux/raid/md_p.h>
52#include <linux/raid/md_u.h>
53#include <linux/slab.h>
54#include "md.h"
55#include "bitmap.h"
56
57#ifndef MODULE
58static void autostart_arrays(int part);
59#endif
60
61/* pers_list is a list of registered personalities protected
62 * by pers_lock.
63 * pers_lock does extra service to protect accesses to
64 * mddev->thread when the mutex cannot be held.
65 */
66static LIST_HEAD(pers_list);
67static DEFINE_SPINLOCK(pers_lock);
68
69static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
70static struct workqueue_struct *md_wq;
71static struct workqueue_struct *md_misc_wq;
72
73static int remove_and_add_spares(struct mddev *mddev,
74				 struct md_rdev *this);
75
76/*
77 * Default number of read corrections we'll attempt on an rdev
78 * before ejecting it from the array. We divide the read error
79 * count by 2 for every hour elapsed between read errors.
80 */
81#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
82/*
83 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
84 * is 1000 KB/sec, so the extra system load does not show up that much.
85 * Increase it if you want to have more _guaranteed_ speed. Note that
86 * the RAID driver will use the maximum available bandwidth if the IO
87 * subsystem is idle. There is also an 'absolute maximum' reconstruction
88 * speed limit - in case reconstruction slows down your system despite
89 * idle IO detection.
90 *
91 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
92 * or /sys/block/mdX/md/sync_speed_{min,max}
93 */
94
95static int sysctl_speed_limit_min = 1000;
96static int sysctl_speed_limit_max = 200000;
97static inline int speed_min(struct mddev *mddev)
98{
99	return mddev->sync_speed_min ?
100		mddev->sync_speed_min : sysctl_speed_limit_min;
101}
102
103static inline int speed_max(struct mddev *mddev)
104{
105	return mddev->sync_speed_max ?
106		mddev->sync_speed_max : sysctl_speed_limit_max;
107}
108
109static struct ctl_table_header *raid_table_header;
110
111static struct ctl_table raid_table[] = {
112	{
113		.procname	= "speed_limit_min",
114		.data		= &sysctl_speed_limit_min,
115		.maxlen		= sizeof(int),
116		.mode		= S_IRUGO|S_IWUSR,
117		.proc_handler	= proc_dointvec,
118	},
119	{
120		.procname	= "speed_limit_max",
121		.data		= &sysctl_speed_limit_max,
122		.maxlen		= sizeof(int),
123		.mode		= S_IRUGO|S_IWUSR,
124		.proc_handler	= proc_dointvec,
125	},
126	{ }
127};
128
129static struct ctl_table raid_dir_table[] = {
130	{
131		.procname	= "raid",
132		.maxlen		= 0,
133		.mode		= S_IRUGO|S_IXUGO,
134		.child		= raid_table,
135	},
136	{ }
137};
138
139static struct ctl_table raid_root_table[] = {
140	{
141		.procname	= "dev",
142		.maxlen		= 0,
143		.mode		= 0555,
144		.child		= raid_dir_table,
145	},
146	{  }
147};
148
149static const struct block_device_operations md_fops;
150
151static int start_readonly;
152
153/* bio_clone_mddev
154 * like bio_clone, but with a local bio set
155 */
156
157struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
158			    struct mddev *mddev)
159{
160	struct bio *b;
161
162	if (!mddev || !mddev->bio_set)
163		return bio_alloc(gfp_mask, nr_iovecs);
164
165	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
166	if (!b)
167		return NULL;
168	return b;
169}
170EXPORT_SYMBOL_GPL(bio_alloc_mddev);
171
172struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
173			    struct mddev *mddev)
174{
175	if (!mddev || !mddev->bio_set)
176		return bio_clone(bio, gfp_mask);
177
178	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
179}
180EXPORT_SYMBOL_GPL(bio_clone_mddev);
181
182/*
183 * We have a system wide 'event count' that is incremented
184 * on any 'interesting' event, and readers of /proc/mdstat
185 * can use 'poll' or 'select' to find out when the event
186 * count increases.
187 *
188 * Events are:
189 *  start array, stop array, error, add device, remove device,
190 *  start build, activate spare
191 */
192static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
193static atomic_t md_event_count;
194void md_new_event(struct mddev *mddev)
195{
196	atomic_inc(&md_event_count);
197	wake_up(&md_event_waiters);
198}
199EXPORT_SYMBOL_GPL(md_new_event);
200
201/* Alternate version that can be called from interrupts
202 * when calling sysfs_notify isn't needed.
203 */
204static void md_new_event_inintr(struct mddev *mddev)
205{
206	atomic_inc(&md_event_count);
207	wake_up(&md_event_waiters);
208}
209
210/*
211 * Enables to iterate over all existing md arrays
212 * all_mddevs_lock protects this list.
213 */
214static LIST_HEAD(all_mddevs);
215static DEFINE_SPINLOCK(all_mddevs_lock);
216
217/*
218 * iterates through all used mddevs in the system.
219 * We take care to grab the all_mddevs_lock whenever navigating
220 * the list, and to always hold a refcount when unlocked.
221 * Any code which breaks out of this loop while own
222 * a reference to the current mddev and must mddev_put it.
223 */
224#define for_each_mddev(_mddev,_tmp)					\
225									\
226	for (({ spin_lock(&all_mddevs_lock);				\
227		_tmp = all_mddevs.next;					\
228		_mddev = NULL;});					\
229	     ({ if (_tmp != &all_mddevs)				\
230			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
231		spin_unlock(&all_mddevs_lock);				\
232		if (_mddev) mddev_put(_mddev);				\
233		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
234		_tmp != &all_mddevs;});					\
235	     ({ spin_lock(&all_mddevs_lock);				\
236		_tmp = _tmp->next;})					\
237		)
238
239/* Rather than calling directly into the personality make_request function,
240 * IO requests come here first so that we can check if the device is
241 * being suspended pending a reconfiguration.
242 * We hold a refcount over the call to ->make_request.  By the time that
243 * call has finished, the bio has been linked into some internal structure
244 * and so is visible to ->quiesce(), so we don't need the refcount any more.
245 */
246static void md_make_request(struct request_queue *q, struct bio *bio)
247{
248	const int rw = bio_data_dir(bio);
249	struct mddev *mddev = q->queuedata;
250	int cpu;
251	unsigned int sectors;
252
253	if (mddev == NULL || mddev->pers == NULL
254	    || !mddev->ready) {
255		bio_io_error(bio);
256		return;
257	}
258	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
259		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
260		return;
261	}
262	smp_rmb(); /* Ensure implications of  'active' are visible */
263	rcu_read_lock();
264	if (mddev->suspended) {
265		DEFINE_WAIT(__wait);
266		for (;;) {
267			prepare_to_wait(&mddev->sb_wait, &__wait,
268					TASK_UNINTERRUPTIBLE);
269			if (!mddev->suspended)
270				break;
271			rcu_read_unlock();
272			schedule();
273			rcu_read_lock();
274		}
275		finish_wait(&mddev->sb_wait, &__wait);
276	}
277	atomic_inc(&mddev->active_io);
278	rcu_read_unlock();
279
280	/*
281	 * save the sectors now since our bio can
282	 * go away inside make_request
283	 */
284	sectors = bio_sectors(bio);
285	mddev->pers->make_request(mddev, bio);
286
287	cpu = part_stat_lock();
288	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
289	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
290	part_stat_unlock();
291
292	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
293		wake_up(&mddev->sb_wait);
294}
295
296/* mddev_suspend makes sure no new requests are submitted
297 * to the device, and that any requests that have been submitted
298 * are completely handled.
299 * Once ->stop is called and completes, the module will be completely
300 * unused.
301 */
302void mddev_suspend(struct mddev *mddev)
303{
304	BUG_ON(mddev->suspended);
305	mddev->suspended = 1;
306	synchronize_rcu();
307	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
308	mddev->pers->quiesce(mddev, 1);
309
310	del_timer_sync(&mddev->safemode_timer);
311}
312EXPORT_SYMBOL_GPL(mddev_suspend);
313
314void mddev_resume(struct mddev *mddev)
315{
316	mddev->suspended = 0;
317	wake_up(&mddev->sb_wait);
318	mddev->pers->quiesce(mddev, 0);
319
320	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
321	md_wakeup_thread(mddev->thread);
322	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
323}
324EXPORT_SYMBOL_GPL(mddev_resume);
325
326int mddev_congested(struct mddev *mddev, int bits)
327{
328	return mddev->suspended;
329}
330EXPORT_SYMBOL(mddev_congested);
331
332/*
333 * Generic flush handling for md
334 */
335
336static void md_end_flush(struct bio *bio, int err)
337{
338	struct md_rdev *rdev = bio->bi_private;
339	struct mddev *mddev = rdev->mddev;
340
341	rdev_dec_pending(rdev, mddev);
342
343	if (atomic_dec_and_test(&mddev->flush_pending)) {
344		/* The pre-request flush has finished */
345		queue_work(md_wq, &mddev->flush_work);
346	}
347	bio_put(bio);
348}
349
350static void md_submit_flush_data(struct work_struct *ws);
351
352static void submit_flushes(struct work_struct *ws)
353{
354	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
355	struct md_rdev *rdev;
356
357	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
358	atomic_set(&mddev->flush_pending, 1);
359	rcu_read_lock();
360	rdev_for_each_rcu(rdev, mddev)
361		if (rdev->raid_disk >= 0 &&
362		    !test_bit(Faulty, &rdev->flags)) {
363			/* Take two references, one is dropped
364			 * when request finishes, one after
365			 * we reclaim rcu_read_lock
366			 */
367			struct bio *bi;
368			atomic_inc(&rdev->nr_pending);
369			atomic_inc(&rdev->nr_pending);
370			rcu_read_unlock();
371			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
372			bi->bi_end_io = md_end_flush;
373			bi->bi_private = rdev;
374			bi->bi_bdev = rdev->bdev;
375			atomic_inc(&mddev->flush_pending);
376			submit_bio(WRITE_FLUSH, bi);
377			rcu_read_lock();
378			rdev_dec_pending(rdev, mddev);
379		}
380	rcu_read_unlock();
381	if (atomic_dec_and_test(&mddev->flush_pending))
382		queue_work(md_wq, &mddev->flush_work);
383}
384
385static void md_submit_flush_data(struct work_struct *ws)
386{
387	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
388	struct bio *bio = mddev->flush_bio;
389
390	if (bio->bi_iter.bi_size == 0)
391		/* an empty barrier - all done */
392		bio_endio(bio, 0);
393	else {
394		bio->bi_rw &= ~REQ_FLUSH;
395		mddev->pers->make_request(mddev, bio);
396	}
397
398	mddev->flush_bio = NULL;
399	wake_up(&mddev->sb_wait);
400}
401
402void md_flush_request(struct mddev *mddev, struct bio *bio)
403{
404	spin_lock_irq(&mddev->write_lock);
405	wait_event_lock_irq(mddev->sb_wait,
406			    !mddev->flush_bio,
407			    mddev->write_lock);
408	mddev->flush_bio = bio;
409	spin_unlock_irq(&mddev->write_lock);
410
411	INIT_WORK(&mddev->flush_work, submit_flushes);
412	queue_work(md_wq, &mddev->flush_work);
413}
414EXPORT_SYMBOL(md_flush_request);
415
416void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
417{
418	struct mddev *mddev = cb->data;
419	md_wakeup_thread(mddev->thread);
420	kfree(cb);
421}
422EXPORT_SYMBOL(md_unplug);
423
424static inline struct mddev *mddev_get(struct mddev *mddev)
425{
426	atomic_inc(&mddev->active);
427	return mddev;
428}
429
430static void mddev_delayed_delete(struct work_struct *ws);
431
432static void mddev_put(struct mddev *mddev)
433{
434	struct bio_set *bs = NULL;
435
436	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
437		return;
438	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
439	    mddev->ctime == 0 && !mddev->hold_active) {
440		/* Array is not configured at all, and not held active,
441		 * so destroy it */
442		list_del_init(&mddev->all_mddevs);
443		bs = mddev->bio_set;
444		mddev->bio_set = NULL;
445		if (mddev->gendisk) {
446			/* We did a probe so need to clean up.  Call
447			 * queue_work inside the spinlock so that
448			 * flush_workqueue() after mddev_find will
449			 * succeed in waiting for the work to be done.
450			 */
451			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
452			queue_work(md_misc_wq, &mddev->del_work);
453		} else
454			kfree(mddev);
455	}
456	spin_unlock(&all_mddevs_lock);
457	if (bs)
458		bioset_free(bs);
459}
460
461void mddev_init(struct mddev *mddev)
462{
463	mutex_init(&mddev->open_mutex);
464	mutex_init(&mddev->reconfig_mutex);
465	mutex_init(&mddev->bitmap_info.mutex);
466	INIT_LIST_HEAD(&mddev->disks);
467	INIT_LIST_HEAD(&mddev->all_mddevs);
468	init_timer(&mddev->safemode_timer);
469	atomic_set(&mddev->active, 1);
470	atomic_set(&mddev->openers, 0);
471	atomic_set(&mddev->active_io, 0);
472	spin_lock_init(&mddev->write_lock);
473	atomic_set(&mddev->flush_pending, 0);
474	init_waitqueue_head(&mddev->sb_wait);
475	init_waitqueue_head(&mddev->recovery_wait);
476	mddev->reshape_position = MaxSector;
477	mddev->reshape_backwards = 0;
478	mddev->last_sync_action = "none";
479	mddev->resync_min = 0;
480	mddev->resync_max = MaxSector;
481	mddev->level = LEVEL_NONE;
482}
483EXPORT_SYMBOL_GPL(mddev_init);
484
485static struct mddev *mddev_find(dev_t unit)
486{
487	struct mddev *mddev, *new = NULL;
488
489	if (unit && MAJOR(unit) != MD_MAJOR)
490		unit &= ~((1<<MdpMinorShift)-1);
491
492 retry:
493	spin_lock(&all_mddevs_lock);
494
495	if (unit) {
496		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
497			if (mddev->unit == unit) {
498				mddev_get(mddev);
499				spin_unlock(&all_mddevs_lock);
500				kfree(new);
501				return mddev;
502			}
503
504		if (new) {
505			list_add(&new->all_mddevs, &all_mddevs);
506			spin_unlock(&all_mddevs_lock);
507			new->hold_active = UNTIL_IOCTL;
508			return new;
509		}
510	} else if (new) {
511		/* find an unused unit number */
512		static int next_minor = 512;
513		int start = next_minor;
514		int is_free = 0;
515		int dev = 0;
516		while (!is_free) {
517			dev = MKDEV(MD_MAJOR, next_minor);
518			next_minor++;
519			if (next_minor > MINORMASK)
520				next_minor = 0;
521			if (next_minor == start) {
522				/* Oh dear, all in use. */
523				spin_unlock(&all_mddevs_lock);
524				kfree(new);
525				return NULL;
526			}
527
528			is_free = 1;
529			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
530				if (mddev->unit == dev) {
531					is_free = 0;
532					break;
533				}
534		}
535		new->unit = dev;
536		new->md_minor = MINOR(dev);
537		new->hold_active = UNTIL_STOP;
538		list_add(&new->all_mddevs, &all_mddevs);
539		spin_unlock(&all_mddevs_lock);
540		return new;
541	}
542	spin_unlock(&all_mddevs_lock);
543
544	new = kzalloc(sizeof(*new), GFP_KERNEL);
545	if (!new)
546		return NULL;
547
548	new->unit = unit;
549	if (MAJOR(unit) == MD_MAJOR)
550		new->md_minor = MINOR(unit);
551	else
552		new->md_minor = MINOR(unit) >> MdpMinorShift;
553
554	mddev_init(new);
555
556	goto retry;
557}
558
559static inline int __must_check mddev_lock(struct mddev *mddev)
560{
561	return mutex_lock_interruptible(&mddev->reconfig_mutex);
562}
563
564/* Sometimes we need to take the lock in a situation where
565 * failure due to interrupts is not acceptable.
566 */
567static inline void mddev_lock_nointr(struct mddev *mddev)
568{
569	mutex_lock(&mddev->reconfig_mutex);
570}
571
572static inline int mddev_is_locked(struct mddev *mddev)
573{
574	return mutex_is_locked(&mddev->reconfig_mutex);
575}
576
577static inline int mddev_trylock(struct mddev *mddev)
578{
579	return mutex_trylock(&mddev->reconfig_mutex);
580}
581
582static struct attribute_group md_redundancy_group;
583
584static void mddev_unlock(struct mddev *mddev)
585{
586	if (mddev->to_remove) {
587		/* These cannot be removed under reconfig_mutex as
588		 * an access to the files will try to take reconfig_mutex
589		 * while holding the file unremovable, which leads to
590		 * a deadlock.
591		 * So hold set sysfs_active while the remove in happeing,
592		 * and anything else which might set ->to_remove or my
593		 * otherwise change the sysfs namespace will fail with
594		 * -EBUSY if sysfs_active is still set.
595		 * We set sysfs_active under reconfig_mutex and elsewhere
596		 * test it under the same mutex to ensure its correct value
597		 * is seen.
598		 */
599		struct attribute_group *to_remove = mddev->to_remove;
600		mddev->to_remove = NULL;
601		mddev->sysfs_active = 1;
602		mutex_unlock(&mddev->reconfig_mutex);
603
604		if (mddev->kobj.sd) {
605			if (to_remove != &md_redundancy_group)
606				sysfs_remove_group(&mddev->kobj, to_remove);
607			if (mddev->pers == NULL ||
608			    mddev->pers->sync_request == NULL) {
609				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
610				if (mddev->sysfs_action)
611					sysfs_put(mddev->sysfs_action);
612				mddev->sysfs_action = NULL;
613			}
614		}
615		mddev->sysfs_active = 0;
616	} else
617		mutex_unlock(&mddev->reconfig_mutex);
618
619	/* As we've dropped the mutex we need a spinlock to
620	 * make sure the thread doesn't disappear
621	 */
622	spin_lock(&pers_lock);
623	md_wakeup_thread(mddev->thread);
624	spin_unlock(&pers_lock);
625}
626
627static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
628{
629	struct md_rdev *rdev;
630
631	rdev_for_each_rcu(rdev, mddev)
632		if (rdev->desc_nr == nr)
633			return rdev;
634
635	return NULL;
636}
637
638static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
639{
640	struct md_rdev *rdev;
641
642	rdev_for_each(rdev, mddev)
643		if (rdev->bdev->bd_dev == dev)
644			return rdev;
645
646	return NULL;
647}
648
649static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
650{
651	struct md_rdev *rdev;
652
653	rdev_for_each_rcu(rdev, mddev)
654		if (rdev->bdev->bd_dev == dev)
655			return rdev;
656
657	return NULL;
658}
659
660static struct md_personality *find_pers(int level, char *clevel)
661{
662	struct md_personality *pers;
663	list_for_each_entry(pers, &pers_list, list) {
664		if (level != LEVEL_NONE && pers->level == level)
665			return pers;
666		if (strcmp(pers->name, clevel)==0)
667			return pers;
668	}
669	return NULL;
670}
671
672/* return the offset of the super block in 512byte sectors */
673static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
674{
675	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
676	return MD_NEW_SIZE_SECTORS(num_sectors);
677}
678
679static int alloc_disk_sb(struct md_rdev *rdev)
680{
681	rdev->sb_page = alloc_page(GFP_KERNEL);
682	if (!rdev->sb_page) {
683		printk(KERN_ALERT "md: out of memory.\n");
684		return -ENOMEM;
685	}
686
687	return 0;
688}
689
690void md_rdev_clear(struct md_rdev *rdev)
691{
692	if (rdev->sb_page) {
693		put_page(rdev->sb_page);
694		rdev->sb_loaded = 0;
695		rdev->sb_page = NULL;
696		rdev->sb_start = 0;
697		rdev->sectors = 0;
698	}
699	if (rdev->bb_page) {
700		put_page(rdev->bb_page);
701		rdev->bb_page = NULL;
702	}
703	kfree(rdev->badblocks.page);
704	rdev->badblocks.page = NULL;
705}
706EXPORT_SYMBOL_GPL(md_rdev_clear);
707
708static void super_written(struct bio *bio, int error)
709{
710	struct md_rdev *rdev = bio->bi_private;
711	struct mddev *mddev = rdev->mddev;
712
713	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
714		printk("md: super_written gets error=%d, uptodate=%d\n",
715		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
716		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
717		md_error(mddev, rdev);
718	}
719
720	if (atomic_dec_and_test(&mddev->pending_writes))
721		wake_up(&mddev->sb_wait);
722	bio_put(bio);
723}
724
725void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
726		   sector_t sector, int size, struct page *page)
727{
728	/* write first size bytes of page to sector of rdev
729	 * Increment mddev->pending_writes before returning
730	 * and decrement it on completion, waking up sb_wait
731	 * if zero is reached.
732	 * If an error occurred, call md_error
733	 */
734	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
735
736	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
737	bio->bi_iter.bi_sector = sector;
738	bio_add_page(bio, page, size, 0);
739	bio->bi_private = rdev;
740	bio->bi_end_io = super_written;
741
742	atomic_inc(&mddev->pending_writes);
743	submit_bio(WRITE_FLUSH_FUA, bio);
744}
745
746void md_super_wait(struct mddev *mddev)
747{
748	/* wait for all superblock writes that were scheduled to complete */
749	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
750}
751
752int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
753		 struct page *page, int rw, bool metadata_op)
754{
755	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
756	int ret;
757
758	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
759		rdev->meta_bdev : rdev->bdev;
760	if (metadata_op)
761		bio->bi_iter.bi_sector = sector + rdev->sb_start;
762	else if (rdev->mddev->reshape_position != MaxSector &&
763		 (rdev->mddev->reshape_backwards ==
764		  (sector >= rdev->mddev->reshape_position)))
765		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
766	else
767		bio->bi_iter.bi_sector = sector + rdev->data_offset;
768	bio_add_page(bio, page, size, 0);
769	submit_bio_wait(rw, bio);
770
771	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
772	bio_put(bio);
773	return ret;
774}
775EXPORT_SYMBOL_GPL(sync_page_io);
776
777static int read_disk_sb(struct md_rdev *rdev, int size)
778{
779	char b[BDEVNAME_SIZE];
780
781	if (rdev->sb_loaded)
782		return 0;
783
784	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
785		goto fail;
786	rdev->sb_loaded = 1;
787	return 0;
788
789fail:
790	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
791		bdevname(rdev->bdev,b));
792	return -EINVAL;
793}
794
795static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
796{
797	return	sb1->set_uuid0 == sb2->set_uuid0 &&
798		sb1->set_uuid1 == sb2->set_uuid1 &&
799		sb1->set_uuid2 == sb2->set_uuid2 &&
800		sb1->set_uuid3 == sb2->set_uuid3;
801}
802
803static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
804{
805	int ret;
806	mdp_super_t *tmp1, *tmp2;
807
808	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
809	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
810
811	if (!tmp1 || !tmp2) {
812		ret = 0;
813		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
814		goto abort;
815	}
816
817	*tmp1 = *sb1;
818	*tmp2 = *sb2;
819
820	/*
821	 * nr_disks is not constant
822	 */
823	tmp1->nr_disks = 0;
824	tmp2->nr_disks = 0;
825
826	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
827abort:
828	kfree(tmp1);
829	kfree(tmp2);
830	return ret;
831}
832
833static u32 md_csum_fold(u32 csum)
834{
835	csum = (csum & 0xffff) + (csum >> 16);
836	return (csum & 0xffff) + (csum >> 16);
837}
838
839static unsigned int calc_sb_csum(mdp_super_t *sb)
840{
841	u64 newcsum = 0;
842	u32 *sb32 = (u32*)sb;
843	int i;
844	unsigned int disk_csum, csum;
845
846	disk_csum = sb->sb_csum;
847	sb->sb_csum = 0;
848
849	for (i = 0; i < MD_SB_BYTES/4 ; i++)
850		newcsum += sb32[i];
851	csum = (newcsum & 0xffffffff) + (newcsum>>32);
852
853#ifdef CONFIG_ALPHA
854	/* This used to use csum_partial, which was wrong for several
855	 * reasons including that different results are returned on
856	 * different architectures.  It isn't critical that we get exactly
857	 * the same return value as before (we always csum_fold before
858	 * testing, and that removes any differences).  However as we
859	 * know that csum_partial always returned a 16bit value on
860	 * alphas, do a fold to maximise conformity to previous behaviour.
861	 */
862	sb->sb_csum = md_csum_fold(disk_csum);
863#else
864	sb->sb_csum = disk_csum;
865#endif
866	return csum;
867}
868
869/*
870 * Handle superblock details.
871 * We want to be able to handle multiple superblock formats
872 * so we have a common interface to them all, and an array of
873 * different handlers.
874 * We rely on user-space to write the initial superblock, and support
875 * reading and updating of superblocks.
876 * Interface methods are:
877 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
878 *      loads and validates a superblock on dev.
879 *      if refdev != NULL, compare superblocks on both devices
880 *    Return:
881 *      0 - dev has a superblock that is compatible with refdev
882 *      1 - dev has a superblock that is compatible and newer than refdev
883 *          so dev should be used as the refdev in future
884 *     -EINVAL superblock incompatible or invalid
885 *     -othererror e.g. -EIO
886 *
887 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
888 *      Verify that dev is acceptable into mddev.
889 *       The first time, mddev->raid_disks will be 0, and data from
890 *       dev should be merged in.  Subsequent calls check that dev
891 *       is new enough.  Return 0 or -EINVAL
892 *
893 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
894 *     Update the superblock for rdev with data in mddev
895 *     This does not write to disc.
896 *
897 */
898
899struct super_type  {
900	char		    *name;
901	struct module	    *owner;
902	int		    (*load_super)(struct md_rdev *rdev,
903					  struct md_rdev *refdev,
904					  int minor_version);
905	int		    (*validate_super)(struct mddev *mddev,
906					      struct md_rdev *rdev);
907	void		    (*sync_super)(struct mddev *mddev,
908					  struct md_rdev *rdev);
909	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
910						sector_t num_sectors);
911	int		    (*allow_new_offset)(struct md_rdev *rdev,
912						unsigned long long new_offset);
913};
914
915/*
916 * Check that the given mddev has no bitmap.
917 *
918 * This function is called from the run method of all personalities that do not
919 * support bitmaps. It prints an error message and returns non-zero if mddev
920 * has a bitmap. Otherwise, it returns 0.
921 *
922 */
923int md_check_no_bitmap(struct mddev *mddev)
924{
925	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
926		return 0;
927	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
928		mdname(mddev), mddev->pers->name);
929	return 1;
930}
931EXPORT_SYMBOL(md_check_no_bitmap);
932
933/*
934 * load_super for 0.90.0
935 */
936static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
937{
938	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
939	mdp_super_t *sb;
940	int ret;
941
942	/*
943	 * Calculate the position of the superblock (512byte sectors),
944	 * it's at the end of the disk.
945	 *
946	 * It also happens to be a multiple of 4Kb.
947	 */
948	rdev->sb_start = calc_dev_sboffset(rdev);
949
950	ret = read_disk_sb(rdev, MD_SB_BYTES);
951	if (ret) return ret;
952
953	ret = -EINVAL;
954
955	bdevname(rdev->bdev, b);
956	sb = page_address(rdev->sb_page);
957
958	if (sb->md_magic != MD_SB_MAGIC) {
959		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
960		       b);
961		goto abort;
962	}
963
964	if (sb->major_version != 0 ||
965	    sb->minor_version < 90 ||
966	    sb->minor_version > 91) {
967		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
968			sb->major_version, sb->minor_version,
969			b);
970		goto abort;
971	}
972
973	if (sb->raid_disks <= 0)
974		goto abort;
975
976	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
977		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
978			b);
979		goto abort;
980	}
981
982	rdev->preferred_minor = sb->md_minor;
983	rdev->data_offset = 0;
984	rdev->new_data_offset = 0;
985	rdev->sb_size = MD_SB_BYTES;
986	rdev->badblocks.shift = -1;
987
988	if (sb->level == LEVEL_MULTIPATH)
989		rdev->desc_nr = -1;
990	else
991		rdev->desc_nr = sb->this_disk.number;
992
993	if (!refdev) {
994		ret = 1;
995	} else {
996		__u64 ev1, ev2;
997		mdp_super_t *refsb = page_address(refdev->sb_page);
998		if (!uuid_equal(refsb, sb)) {
999			printk(KERN_WARNING "md: %s has different UUID to %s\n",
1000				b, bdevname(refdev->bdev,b2));
1001			goto abort;
1002		}
1003		if (!sb_equal(refsb, sb)) {
1004			printk(KERN_WARNING "md: %s has same UUID"
1005			       " but different superblock to %s\n",
1006			       b, bdevname(refdev->bdev, b2));
1007			goto abort;
1008		}
1009		ev1 = md_event(sb);
1010		ev2 = md_event(refsb);
1011		if (ev1 > ev2)
1012			ret = 1;
1013		else
1014			ret = 0;
1015	}
1016	rdev->sectors = rdev->sb_start;
1017	/* Limit to 4TB as metadata cannot record more than that.
1018	 * (not needed for Linear and RAID0 as metadata doesn't
1019	 * record this size)
1020	 */
1021	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1022		rdev->sectors = (2ULL << 32) - 2;
1023
1024	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1025		/* "this cannot possibly happen" ... */
1026		ret = -EINVAL;
1027
1028 abort:
1029	return ret;
1030}
1031
1032/*
1033 * validate_super for 0.90.0
1034 */
1035static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1036{
1037	mdp_disk_t *desc;
1038	mdp_super_t *sb = page_address(rdev->sb_page);
1039	__u64 ev1 = md_event(sb);
1040
1041	rdev->raid_disk = -1;
1042	clear_bit(Faulty, &rdev->flags);
1043	clear_bit(In_sync, &rdev->flags);
1044	clear_bit(Bitmap_sync, &rdev->flags);
1045	clear_bit(WriteMostly, &rdev->flags);
1046
1047	if (mddev->raid_disks == 0) {
1048		mddev->major_version = 0;
1049		mddev->minor_version = sb->minor_version;
1050		mddev->patch_version = sb->patch_version;
1051		mddev->external = 0;
1052		mddev->chunk_sectors = sb->chunk_size >> 9;
1053		mddev->ctime = sb->ctime;
1054		mddev->utime = sb->utime;
1055		mddev->level = sb->level;
1056		mddev->clevel[0] = 0;
1057		mddev->layout = sb->layout;
1058		mddev->raid_disks = sb->raid_disks;
1059		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1060		mddev->events = ev1;
1061		mddev->bitmap_info.offset = 0;
1062		mddev->bitmap_info.space = 0;
1063		/* bitmap can use 60 K after the 4K superblocks */
1064		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1065		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1066		mddev->reshape_backwards = 0;
1067
1068		if (mddev->minor_version >= 91) {
1069			mddev->reshape_position = sb->reshape_position;
1070			mddev->delta_disks = sb->delta_disks;
1071			mddev->new_level = sb->new_level;
1072			mddev->new_layout = sb->new_layout;
1073			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1074			if (mddev->delta_disks < 0)
1075				mddev->reshape_backwards = 1;
1076		} else {
1077			mddev->reshape_position = MaxSector;
1078			mddev->delta_disks = 0;
1079			mddev->new_level = mddev->level;
1080			mddev->new_layout = mddev->layout;
1081			mddev->new_chunk_sectors = mddev->chunk_sectors;
1082		}
1083
1084		if (sb->state & (1<<MD_SB_CLEAN))
1085			mddev->recovery_cp = MaxSector;
1086		else {
1087			if (sb->events_hi == sb->cp_events_hi &&
1088				sb->events_lo == sb->cp_events_lo) {
1089				mddev->recovery_cp = sb->recovery_cp;
1090			} else
1091				mddev->recovery_cp = 0;
1092		}
1093
1094		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1095		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1096		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1097		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1098
1099		mddev->max_disks = MD_SB_DISKS;
1100
1101		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1102		    mddev->bitmap_info.file == NULL) {
1103			mddev->bitmap_info.offset =
1104				mddev->bitmap_info.default_offset;
1105			mddev->bitmap_info.space =
1106				mddev->bitmap_info.default_space;
1107		}
1108
1109	} else if (mddev->pers == NULL) {
1110		/* Insist on good event counter while assembling, except
1111		 * for spares (which don't need an event count) */
1112		++ev1;
1113		if (sb->disks[rdev->desc_nr].state & (
1114			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1115			if (ev1 < mddev->events)
1116				return -EINVAL;
1117	} else if (mddev->bitmap) {
1118		/* if adding to array with a bitmap, then we can accept an
1119		 * older device ... but not too old.
1120		 */
1121		if (ev1 < mddev->bitmap->events_cleared)
1122			return 0;
1123		if (ev1 < mddev->events)
1124			set_bit(Bitmap_sync, &rdev->flags);
1125	} else {
1126		if (ev1 < mddev->events)
1127			/* just a hot-add of a new device, leave raid_disk at -1 */
1128			return 0;
1129	}
1130
1131	if (mddev->level != LEVEL_MULTIPATH) {
1132		desc = sb->disks + rdev->desc_nr;
1133
1134		if (desc->state & (1<<MD_DISK_FAULTY))
1135			set_bit(Faulty, &rdev->flags);
1136		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1137			    desc->raid_disk < mddev->raid_disks */) {
1138			set_bit(In_sync, &rdev->flags);
1139			rdev->raid_disk = desc->raid_disk;
1140			rdev->saved_raid_disk = desc->raid_disk;
1141		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1142			/* active but not in sync implies recovery up to
1143			 * reshape position.  We don't know exactly where
1144			 * that is, so set to zero for now */
1145			if (mddev->minor_version >= 91) {
1146				rdev->recovery_offset = 0;
1147				rdev->raid_disk = desc->raid_disk;
1148			}
1149		}
1150		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1151			set_bit(WriteMostly, &rdev->flags);
1152	} else /* MULTIPATH are always insync */
1153		set_bit(In_sync, &rdev->flags);
1154	return 0;
1155}
1156
1157/*
1158 * sync_super for 0.90.0
1159 */
1160static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1161{
1162	mdp_super_t *sb;
1163	struct md_rdev *rdev2;
1164	int next_spare = mddev->raid_disks;
1165
1166	/* make rdev->sb match mddev data..
1167	 *
1168	 * 1/ zero out disks
1169	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1170	 * 3/ any empty disks < next_spare become removed
1171	 *
1172	 * disks[0] gets initialised to REMOVED because
1173	 * we cannot be sure from other fields if it has
1174	 * been initialised or not.
1175	 */
1176	int i;
1177	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1178
1179	rdev->sb_size = MD_SB_BYTES;
1180
1181	sb = page_address(rdev->sb_page);
1182
1183	memset(sb, 0, sizeof(*sb));
1184
1185	sb->md_magic = MD_SB_MAGIC;
1186	sb->major_version = mddev->major_version;
1187	sb->patch_version = mddev->patch_version;
1188	sb->gvalid_words  = 0; /* ignored */
1189	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1190	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1191	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1192	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1193
1194	sb->ctime = mddev->ctime;
1195	sb->level = mddev->level;
1196	sb->size = mddev->dev_sectors / 2;
1197	sb->raid_disks = mddev->raid_disks;
1198	sb->md_minor = mddev->md_minor;
1199	sb->not_persistent = 0;
1200	sb->utime = mddev->utime;
1201	sb->state = 0;
1202	sb->events_hi = (mddev->events>>32);
1203	sb->events_lo = (u32)mddev->events;
1204
1205	if (mddev->reshape_position == MaxSector)
1206		sb->minor_version = 90;
1207	else {
1208		sb->minor_version = 91;
1209		sb->reshape_position = mddev->reshape_position;
1210		sb->new_level = mddev->new_level;
1211		sb->delta_disks = mddev->delta_disks;
1212		sb->new_layout = mddev->new_layout;
1213		sb->new_chunk = mddev->new_chunk_sectors << 9;
1214	}
1215	mddev->minor_version = sb->minor_version;
1216	if (mddev->in_sync)
1217	{
1218		sb->recovery_cp = mddev->recovery_cp;
1219		sb->cp_events_hi = (mddev->events>>32);
1220		sb->cp_events_lo = (u32)mddev->events;
1221		if (mddev->recovery_cp == MaxSector)
1222			sb->state = (1<< MD_SB_CLEAN);
1223	} else
1224		sb->recovery_cp = 0;
1225
1226	sb->layout = mddev->layout;
1227	sb->chunk_size = mddev->chunk_sectors << 9;
1228
1229	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1230		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1231
1232	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1233	rdev_for_each(rdev2, mddev) {
1234		mdp_disk_t *d;
1235		int desc_nr;
1236		int is_active = test_bit(In_sync, &rdev2->flags);
1237
1238		if (rdev2->raid_disk >= 0 &&
1239		    sb->minor_version >= 91)
1240			/* we have nowhere to store the recovery_offset,
1241			 * but if it is not below the reshape_position,
1242			 * we can piggy-back on that.
1243			 */
1244			is_active = 1;
1245		if (rdev2->raid_disk < 0 ||
1246		    test_bit(Faulty, &rdev2->flags))
1247			is_active = 0;
1248		if (is_active)
1249			desc_nr = rdev2->raid_disk;
1250		else
1251			desc_nr = next_spare++;
1252		rdev2->desc_nr = desc_nr;
1253		d = &sb->disks[rdev2->desc_nr];
1254		nr_disks++;
1255		d->number = rdev2->desc_nr;
1256		d->major = MAJOR(rdev2->bdev->bd_dev);
1257		d->minor = MINOR(rdev2->bdev->bd_dev);
1258		if (is_active)
1259			d->raid_disk = rdev2->raid_disk;
1260		else
1261			d->raid_disk = rdev2->desc_nr; /* compatibility */
1262		if (test_bit(Faulty, &rdev2->flags))
1263			d->state = (1<<MD_DISK_FAULTY);
1264		else if (is_active) {
1265			d->state = (1<<MD_DISK_ACTIVE);
1266			if (test_bit(In_sync, &rdev2->flags))
1267				d->state |= (1<<MD_DISK_SYNC);
1268			active++;
1269			working++;
1270		} else {
1271			d->state = 0;
1272			spare++;
1273			working++;
1274		}
1275		if (test_bit(WriteMostly, &rdev2->flags))
1276			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1277	}
1278	/* now set the "removed" and "faulty" bits on any missing devices */
1279	for (i=0 ; i < mddev->raid_disks ; i++) {
1280		mdp_disk_t *d = &sb->disks[i];
1281		if (d->state == 0 && d->number == 0) {
1282			d->number = i;
1283			d->raid_disk = i;
1284			d->state = (1<<MD_DISK_REMOVED);
1285			d->state |= (1<<MD_DISK_FAULTY);
1286			failed++;
1287		}
1288	}
1289	sb->nr_disks = nr_disks;
1290	sb->active_disks = active;
1291	sb->working_disks = working;
1292	sb->failed_disks = failed;
1293	sb->spare_disks = spare;
1294
1295	sb->this_disk = sb->disks[rdev->desc_nr];
1296	sb->sb_csum = calc_sb_csum(sb);
1297}
1298
1299/*
1300 * rdev_size_change for 0.90.0
1301 */
1302static unsigned long long
1303super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1304{
1305	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1306		return 0; /* component must fit device */
1307	if (rdev->mddev->bitmap_info.offset)
1308		return 0; /* can't move bitmap */
1309	rdev->sb_start = calc_dev_sboffset(rdev);
1310	if (!num_sectors || num_sectors > rdev->sb_start)
1311		num_sectors = rdev->sb_start;
1312	/* Limit to 4TB as metadata cannot record more than that.
1313	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1314	 */
1315	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1316		num_sectors = (2ULL << 32) - 2;
1317	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1318		       rdev->sb_page);
1319	md_super_wait(rdev->mddev);
1320	return num_sectors;
1321}
1322
1323static int
1324super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1325{
1326	/* non-zero offset changes not possible with v0.90 */
1327	return new_offset == 0;
1328}
1329
1330/*
1331 * version 1 superblock
1332 */
1333
1334static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1335{
1336	__le32 disk_csum;
1337	u32 csum;
1338	unsigned long long newcsum;
1339	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1340	__le32 *isuper = (__le32*)sb;
1341
1342	disk_csum = sb->sb_csum;
1343	sb->sb_csum = 0;
1344	newcsum = 0;
1345	for (; size >= 4; size -= 4)
1346		newcsum += le32_to_cpu(*isuper++);
1347
1348	if (size == 2)
1349		newcsum += le16_to_cpu(*(__le16*) isuper);
1350
1351	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1352	sb->sb_csum = disk_csum;
1353	return cpu_to_le32(csum);
1354}
1355
1356static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1357			    int acknowledged);
1358static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1359{
1360	struct mdp_superblock_1 *sb;
1361	int ret;
1362	sector_t sb_start;
1363	sector_t sectors;
1364	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1365	int bmask;
1366
1367	/*
1368	 * Calculate the position of the superblock in 512byte sectors.
1369	 * It is always aligned to a 4K boundary and
1370	 * depeding on minor_version, it can be:
1371	 * 0: At least 8K, but less than 12K, from end of device
1372	 * 1: At start of device
1373	 * 2: 4K from start of device.
1374	 */
1375	switch(minor_version) {
1376	case 0:
1377		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1378		sb_start -= 8*2;
1379		sb_start &= ~(sector_t)(4*2-1);
1380		break;
1381	case 1:
1382		sb_start = 0;
1383		break;
1384	case 2:
1385		sb_start = 8;
1386		break;
1387	default:
1388		return -EINVAL;
1389	}
1390	rdev->sb_start = sb_start;
1391
1392	/* superblock is rarely larger than 1K, but it can be larger,
1393	 * and it is safe to read 4k, so we do that
1394	 */
1395	ret = read_disk_sb(rdev, 4096);
1396	if (ret) return ret;
1397
1398	sb = page_address(rdev->sb_page);
1399
1400	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1401	    sb->major_version != cpu_to_le32(1) ||
1402	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1403	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1404	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1405		return -EINVAL;
1406
1407	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1408		printk("md: invalid superblock checksum on %s\n",
1409			bdevname(rdev->bdev,b));
1410		return -EINVAL;
1411	}
1412	if (le64_to_cpu(sb->data_size) < 10) {
1413		printk("md: data_size too small on %s\n",
1414		       bdevname(rdev->bdev,b));
1415		return -EINVAL;
1416	}
1417	if (sb->pad0 ||
1418	    sb->pad3[0] ||
1419	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1420		/* Some padding is non-zero, might be a new feature */
1421		return -EINVAL;
1422
1423	rdev->preferred_minor = 0xffff;
1424	rdev->data_offset = le64_to_cpu(sb->data_offset);
1425	rdev->new_data_offset = rdev->data_offset;
1426	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1427	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1428		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1429	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1430
1431	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1432	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1433	if (rdev->sb_size & bmask)
1434		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1435
1436	if (minor_version
1437	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1438		return -EINVAL;
1439	if (minor_version
1440	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1441		return -EINVAL;
1442
1443	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1444		rdev->desc_nr = -1;
1445	else
1446		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1447
1448	if (!rdev->bb_page) {
1449		rdev->bb_page = alloc_page(GFP_KERNEL);
1450		if (!rdev->bb_page)
1451			return -ENOMEM;
1452	}
1453	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1454	    rdev->badblocks.count == 0) {
1455		/* need to load the bad block list.
1456		 * Currently we limit it to one page.
1457		 */
1458		s32 offset;
1459		sector_t bb_sector;
1460		u64 *bbp;
1461		int i;
1462		int sectors = le16_to_cpu(sb->bblog_size);
1463		if (sectors > (PAGE_SIZE / 512))
1464			return -EINVAL;
1465		offset = le32_to_cpu(sb->bblog_offset);
1466		if (offset == 0)
1467			return -EINVAL;
1468		bb_sector = (long long)offset;
1469		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1470				  rdev->bb_page, READ, true))
1471			return -EIO;
1472		bbp = (u64 *)page_address(rdev->bb_page);
1473		rdev->badblocks.shift = sb->bblog_shift;
1474		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1475			u64 bb = le64_to_cpu(*bbp);
1476			int count = bb & (0x3ff);
1477			u64 sector = bb >> 10;
1478			sector <<= sb->bblog_shift;
1479			count <<= sb->bblog_shift;
1480			if (bb + 1 == 0)
1481				break;
1482			if (md_set_badblocks(&rdev->badblocks,
1483					     sector, count, 1) == 0)
1484				return -EINVAL;
1485		}
1486	} else if (sb->bblog_offset != 0)
1487		rdev->badblocks.shift = 0;
1488
1489	if (!refdev) {
1490		ret = 1;
1491	} else {
1492		__u64 ev1, ev2;
1493		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1494
1495		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1496		    sb->level != refsb->level ||
1497		    sb->layout != refsb->layout ||
1498		    sb->chunksize != refsb->chunksize) {
1499			printk(KERN_WARNING "md: %s has strangely different"
1500				" superblock to %s\n",
1501				bdevname(rdev->bdev,b),
1502				bdevname(refdev->bdev,b2));
1503			return -EINVAL;
1504		}
1505		ev1 = le64_to_cpu(sb->events);
1506		ev2 = le64_to_cpu(refsb->events);
1507
1508		if (ev1 > ev2)
1509			ret = 1;
1510		else
1511			ret = 0;
1512	}
1513	if (minor_version) {
1514		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1515		sectors -= rdev->data_offset;
1516	} else
1517		sectors = rdev->sb_start;
1518	if (sectors < le64_to_cpu(sb->data_size))
1519		return -EINVAL;
1520	rdev->sectors = le64_to_cpu(sb->data_size);
1521	return ret;
1522}
1523
1524static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1525{
1526	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1527	__u64 ev1 = le64_to_cpu(sb->events);
1528
1529	rdev->raid_disk = -1;
1530	clear_bit(Faulty, &rdev->flags);
1531	clear_bit(In_sync, &rdev->flags);
1532	clear_bit(Bitmap_sync, &rdev->flags);
1533	clear_bit(WriteMostly, &rdev->flags);
1534
1535	if (mddev->raid_disks == 0) {
1536		mddev->major_version = 1;
1537		mddev->patch_version = 0;
1538		mddev->external = 0;
1539		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1540		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1541		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1542		mddev->level = le32_to_cpu(sb->level);
1543		mddev->clevel[0] = 0;
1544		mddev->layout = le32_to_cpu(sb->layout);
1545		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1546		mddev->dev_sectors = le64_to_cpu(sb->size);
1547		mddev->events = ev1;
1548		mddev->bitmap_info.offset = 0;
1549		mddev->bitmap_info.space = 0;
1550		/* Default location for bitmap is 1K after superblock
1551		 * using 3K - total of 4K
1552		 */
1553		mddev->bitmap_info.default_offset = 1024 >> 9;
1554		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1555		mddev->reshape_backwards = 0;
1556
1557		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1558		memcpy(mddev->uuid, sb->set_uuid, 16);
1559
1560		mddev->max_disks =  (4096-256)/2;
1561
1562		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1563		    mddev->bitmap_info.file == NULL) {
1564			mddev->bitmap_info.offset =
1565				(__s32)le32_to_cpu(sb->bitmap_offset);
1566			/* Metadata doesn't record how much space is available.
1567			 * For 1.0, we assume we can use up to the superblock
1568			 * if before, else to 4K beyond superblock.
1569			 * For others, assume no change is possible.
1570			 */
1571			if (mddev->minor_version > 0)
1572				mddev->bitmap_info.space = 0;
1573			else if (mddev->bitmap_info.offset > 0)
1574				mddev->bitmap_info.space =
1575					8 - mddev->bitmap_info.offset;
1576			else
1577				mddev->bitmap_info.space =
1578					-mddev->bitmap_info.offset;
1579		}
1580
1581		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1582			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1583			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1584			mddev->new_level = le32_to_cpu(sb->new_level);
1585			mddev->new_layout = le32_to_cpu(sb->new_layout);
1586			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1587			if (mddev->delta_disks < 0 ||
1588			    (mddev->delta_disks == 0 &&
1589			     (le32_to_cpu(sb->feature_map)
1590			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1591				mddev->reshape_backwards = 1;
1592		} else {
1593			mddev->reshape_position = MaxSector;
1594			mddev->delta_disks = 0;
1595			mddev->new_level = mddev->level;
1596			mddev->new_layout = mddev->layout;
1597			mddev->new_chunk_sectors = mddev->chunk_sectors;
1598		}
1599
1600	} else if (mddev->pers == NULL) {
1601		/* Insist of good event counter while assembling, except for
1602		 * spares (which don't need an event count) */
1603		++ev1;
1604		if (rdev->desc_nr >= 0 &&
1605		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1606		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1607			if (ev1 < mddev->events)
1608				return -EINVAL;
1609	} else if (mddev->bitmap) {
1610		/* If adding to array with a bitmap, then we can accept an
1611		 * older device, but not too old.
1612		 */
1613		if (ev1 < mddev->bitmap->events_cleared)
1614			return 0;
1615		if (ev1 < mddev->events)
1616			set_bit(Bitmap_sync, &rdev->flags);
1617	} else {
1618		if (ev1 < mddev->events)
1619			/* just a hot-add of a new device, leave raid_disk at -1 */
1620			return 0;
1621	}
1622	if (mddev->level != LEVEL_MULTIPATH) {
1623		int role;
1624		if (rdev->desc_nr < 0 ||
1625		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1626			role = 0xffff;
1627			rdev->desc_nr = -1;
1628		} else
1629			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1630		switch(role) {
1631		case 0xffff: /* spare */
1632			break;
1633		case 0xfffe: /* faulty */
1634			set_bit(Faulty, &rdev->flags);
1635			break;
1636		default:
1637			rdev->saved_raid_disk = role;
1638			if ((le32_to_cpu(sb->feature_map) &
1639			     MD_FEATURE_RECOVERY_OFFSET)) {
1640				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1641				if (!(le32_to_cpu(sb->feature_map) &
1642				      MD_FEATURE_RECOVERY_BITMAP))
1643					rdev->saved_raid_disk = -1;
1644			} else
1645				set_bit(In_sync, &rdev->flags);
1646			rdev->raid_disk = role;
1647			break;
1648		}
1649		if (sb->devflags & WriteMostly1)
1650			set_bit(WriteMostly, &rdev->flags);
1651		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1652			set_bit(Replacement, &rdev->flags);
1653	} else /* MULTIPATH are always insync */
1654		set_bit(In_sync, &rdev->flags);
1655
1656	return 0;
1657}
1658
1659static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1660{
1661	struct mdp_superblock_1 *sb;
1662	struct md_rdev *rdev2;
1663	int max_dev, i;
1664	/* make rdev->sb match mddev and rdev data. */
1665
1666	sb = page_address(rdev->sb_page);
1667
1668	sb->feature_map = 0;
1669	sb->pad0 = 0;
1670	sb->recovery_offset = cpu_to_le64(0);
1671	memset(sb->pad3, 0, sizeof(sb->pad3));
1672
1673	sb->utime = cpu_to_le64((__u64)mddev->utime);
1674	sb->events = cpu_to_le64(mddev->events);
1675	if (mddev->in_sync)
1676		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1677	else
1678		sb->resync_offset = cpu_to_le64(0);
1679
1680	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1681
1682	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1683	sb->size = cpu_to_le64(mddev->dev_sectors);
1684	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1685	sb->level = cpu_to_le32(mddev->level);
1686	sb->layout = cpu_to_le32(mddev->layout);
1687
1688	if (test_bit(WriteMostly, &rdev->flags))
1689		sb->devflags |= WriteMostly1;
1690	else
1691		sb->devflags &= ~WriteMostly1;
1692	sb->data_offset = cpu_to_le64(rdev->data_offset);
1693	sb->data_size = cpu_to_le64(rdev->sectors);
1694
1695	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1696		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1697		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1698	}
1699
1700	if (rdev->raid_disk >= 0 &&
1701	    !test_bit(In_sync, &rdev->flags)) {
1702		sb->feature_map |=
1703			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1704		sb->recovery_offset =
1705			cpu_to_le64(rdev->recovery_offset);
1706		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1707			sb->feature_map |=
1708				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1709	}
1710	if (test_bit(Replacement, &rdev->flags))
1711		sb->feature_map |=
1712			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1713
1714	if (mddev->reshape_position != MaxSector) {
1715		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1716		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1717		sb->new_layout = cpu_to_le32(mddev->new_layout);
1718		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1719		sb->new_level = cpu_to_le32(mddev->new_level);
1720		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1721		if (mddev->delta_disks == 0 &&
1722		    mddev->reshape_backwards)
1723			sb->feature_map
1724				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1725		if (rdev->new_data_offset != rdev->data_offset) {
1726			sb->feature_map
1727				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1728			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1729							     - rdev->data_offset));
1730		}
1731	}
1732
1733	if (rdev->badblocks.count == 0)
1734		/* Nothing to do for bad blocks*/ ;
1735	else if (sb->bblog_offset == 0)
1736		/* Cannot record bad blocks on this device */
1737		md_error(mddev, rdev);
1738	else {
1739		struct badblocks *bb = &rdev->badblocks;
1740		u64 *bbp = (u64 *)page_address(rdev->bb_page);
1741		u64 *p = bb->page;
1742		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1743		if (bb->changed) {
1744			unsigned seq;
1745
1746retry:
1747			seq = read_seqbegin(&bb->lock);
1748
1749			memset(bbp, 0xff, PAGE_SIZE);
1750
1751			for (i = 0 ; i < bb->count ; i++) {
1752				u64 internal_bb = p[i];
1753				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1754						| BB_LEN(internal_bb));
1755				bbp[i] = cpu_to_le64(store_bb);
1756			}
1757			bb->changed = 0;
1758			if (read_seqretry(&bb->lock, seq))
1759				goto retry;
1760
1761			bb->sector = (rdev->sb_start +
1762				      (int)le32_to_cpu(sb->bblog_offset));
1763			bb->size = le16_to_cpu(sb->bblog_size);
1764		}
1765	}
1766
1767	max_dev = 0;
1768	rdev_for_each(rdev2, mddev)
1769		if (rdev2->desc_nr+1 > max_dev)
1770			max_dev = rdev2->desc_nr+1;
1771
1772	if (max_dev > le32_to_cpu(sb->max_dev)) {
1773		int bmask;
1774		sb->max_dev = cpu_to_le32(max_dev);
1775		rdev->sb_size = max_dev * 2 + 256;
1776		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1777		if (rdev->sb_size & bmask)
1778			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1779	} else
1780		max_dev = le32_to_cpu(sb->max_dev);
1781
1782	for (i=0; i<max_dev;i++)
1783		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1784
1785	rdev_for_each(rdev2, mddev) {
1786		i = rdev2->desc_nr;
1787		if (test_bit(Faulty, &rdev2->flags))
1788			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1789		else if (test_bit(In_sync, &rdev2->flags))
1790			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1791		else if (rdev2->raid_disk >= 0)
1792			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1793		else
1794			sb->dev_roles[i] = cpu_to_le16(0xffff);
1795	}
1796
1797	sb->sb_csum = calc_sb_1_csum(sb);
1798}
1799
1800static unsigned long long
1801super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1802{
1803	struct mdp_superblock_1 *sb;
1804	sector_t max_sectors;
1805	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1806		return 0; /* component must fit device */
1807	if (rdev->data_offset != rdev->new_data_offset)
1808		return 0; /* too confusing */
1809	if (rdev->sb_start < rdev->data_offset) {
1810		/* minor versions 1 and 2; superblock before data */
1811		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1812		max_sectors -= rdev->data_offset;
1813		if (!num_sectors || num_sectors > max_sectors)
1814			num_sectors = max_sectors;
1815	} else if (rdev->mddev->bitmap_info.offset) {
1816		/* minor version 0 with bitmap we can't move */
1817		return 0;
1818	} else {
1819		/* minor version 0; superblock after data */
1820		sector_t sb_start;
1821		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1822		sb_start &= ~(sector_t)(4*2 - 1);
1823		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1824		if (!num_sectors || num_sectors > max_sectors)
1825			num_sectors = max_sectors;
1826		rdev->sb_start = sb_start;
1827	}
1828	sb = page_address(rdev->sb_page);
1829	sb->data_size = cpu_to_le64(num_sectors);
1830	sb->super_offset = rdev->sb_start;
1831	sb->sb_csum = calc_sb_1_csum(sb);
1832	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1833		       rdev->sb_page);
1834	md_super_wait(rdev->mddev);
1835	return num_sectors;
1836
1837}
1838
1839static int
1840super_1_allow_new_offset(struct md_rdev *rdev,
1841			 unsigned long long new_offset)
1842{
1843	/* All necessary checks on new >= old have been done */
1844	struct bitmap *bitmap;
1845	if (new_offset >= rdev->data_offset)
1846		return 1;
1847
1848	/* with 1.0 metadata, there is no metadata to tread on
1849	 * so we can always move back */
1850	if (rdev->mddev->minor_version == 0)
1851		return 1;
1852
1853	/* otherwise we must be sure not to step on
1854	 * any metadata, so stay:
1855	 * 36K beyond start of superblock
1856	 * beyond end of badblocks
1857	 * beyond write-intent bitmap
1858	 */
1859	if (rdev->sb_start + (32+4)*2 > new_offset)
1860		return 0;
1861	bitmap = rdev->mddev->bitmap;
1862	if (bitmap && !rdev->mddev->bitmap_info.file &&
1863	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
1864	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1865		return 0;
1866	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1867		return 0;
1868
1869	return 1;
1870}
1871
1872static struct super_type super_types[] = {
1873	[0] = {
1874		.name	= "0.90.0",
1875		.owner	= THIS_MODULE,
1876		.load_super	    = super_90_load,
1877		.validate_super	    = super_90_validate,
1878		.sync_super	    = super_90_sync,
1879		.rdev_size_change   = super_90_rdev_size_change,
1880		.allow_new_offset   = super_90_allow_new_offset,
1881	},
1882	[1] = {
1883		.name	= "md-1",
1884		.owner	= THIS_MODULE,
1885		.load_super	    = super_1_load,
1886		.validate_super	    = super_1_validate,
1887		.sync_super	    = super_1_sync,
1888		.rdev_size_change   = super_1_rdev_size_change,
1889		.allow_new_offset   = super_1_allow_new_offset,
1890	},
1891};
1892
1893static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1894{
1895	if (mddev->sync_super) {
1896		mddev->sync_super(mddev, rdev);
1897		return;
1898	}
1899
1900	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1901
1902	super_types[mddev->major_version].sync_super(mddev, rdev);
1903}
1904
1905static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1906{
1907	struct md_rdev *rdev, *rdev2;
1908
1909	rcu_read_lock();
1910	rdev_for_each_rcu(rdev, mddev1)
1911		rdev_for_each_rcu(rdev2, mddev2)
1912			if (rdev->bdev->bd_contains ==
1913			    rdev2->bdev->bd_contains) {
1914				rcu_read_unlock();
1915				return 1;
1916			}
1917	rcu_read_unlock();
1918	return 0;
1919}
1920
1921static LIST_HEAD(pending_raid_disks);
1922
1923/*
1924 * Try to register data integrity profile for an mddev
1925 *
1926 * This is called when an array is started and after a disk has been kicked
1927 * from the array. It only succeeds if all working and active component devices
1928 * are integrity capable with matching profiles.
1929 */
1930int md_integrity_register(struct mddev *mddev)
1931{
1932	struct md_rdev *rdev, *reference = NULL;
1933
1934	if (list_empty(&mddev->disks))
1935		return 0; /* nothing to do */
1936	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1937		return 0; /* shouldn't register, or already is */
1938	rdev_for_each(rdev, mddev) {
1939		/* skip spares and non-functional disks */
1940		if (test_bit(Faulty, &rdev->flags))
1941			continue;
1942		if (rdev->raid_disk < 0)
1943			continue;
1944		if (!reference) {
1945			/* Use the first rdev as the reference */
1946			reference = rdev;
1947			continue;
1948		}
1949		/* does this rdev's profile match the reference profile? */
1950		if (blk_integrity_compare(reference->bdev->bd_disk,
1951				rdev->bdev->bd_disk) < 0)
1952			return -EINVAL;
1953	}
1954	if (!reference || !bdev_get_integrity(reference->bdev))
1955		return 0;
1956	/*
1957	 * All component devices are integrity capable and have matching
1958	 * profiles, register the common profile for the md device.
1959	 */
1960	if (blk_integrity_register(mddev->gendisk,
1961			bdev_get_integrity(reference->bdev)) != 0) {
1962		printk(KERN_ERR "md: failed to register integrity for %s\n",
1963			mdname(mddev));
1964		return -EINVAL;
1965	}
1966	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1967	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1968		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1969		       mdname(mddev));
1970		return -EINVAL;
1971	}
1972	return 0;
1973}
1974EXPORT_SYMBOL(md_integrity_register);
1975
1976/* Disable data integrity if non-capable/non-matching disk is being added */
1977void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
1978{
1979	struct blk_integrity *bi_rdev;
1980	struct blk_integrity *bi_mddev;
1981
1982	if (!mddev->gendisk)
1983		return;
1984
1985	bi_rdev = bdev_get_integrity(rdev->bdev);
1986	bi_mddev = blk_get_integrity(mddev->gendisk);
1987
1988	if (!bi_mddev) /* nothing to do */
1989		return;
1990	if (rdev->raid_disk < 0) /* skip spares */
1991		return;
1992	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1993					     rdev->bdev->bd_disk) >= 0)
1994		return;
1995	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1996	blk_integrity_unregister(mddev->gendisk);
1997}
1998EXPORT_SYMBOL(md_integrity_add_rdev);
1999
2000static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2001{
2002	char b[BDEVNAME_SIZE];
2003	struct kobject *ko;
2004	char *s;
2005	int err;
2006
2007	/* prevent duplicates */
2008	if (find_rdev(mddev, rdev->bdev->bd_dev))
2009		return -EEXIST;
2010
2011	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2012	if (rdev->sectors && (mddev->dev_sectors == 0 ||
2013			rdev->sectors < mddev->dev_sectors)) {
2014		if (mddev->pers) {
2015			/* Cannot change size, so fail
2016			 * If mddev->level <= 0, then we don't care
2017			 * about aligning sizes (e.g. linear)
2018			 */
2019			if (mddev->level > 0)
2020				return -ENOSPC;
2021		} else
2022			mddev->dev_sectors = rdev->sectors;
2023	}
2024
2025	/* Verify rdev->desc_nr is unique.
2026	 * If it is -1, assign a free number, else
2027	 * check number is not in use
2028	 */
2029	rcu_read_lock();
2030	if (rdev->desc_nr < 0) {
2031		int choice = 0;
2032		if (mddev->pers)
2033			choice = mddev->raid_disks;
2034		while (find_rdev_nr_rcu(mddev, choice))
2035			choice++;
2036		rdev->desc_nr = choice;
2037	} else {
2038		if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2039			rcu_read_unlock();
2040			return -EBUSY;
2041		}
2042	}
2043	rcu_read_unlock();
2044	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2045		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2046		       mdname(mddev), mddev->max_disks);
2047		return -EBUSY;
2048	}
2049	bdevname(rdev->bdev,b);
2050	while ( (s=strchr(b, '/')) != NULL)
2051		*s = '!';
2052
2053	rdev->mddev = mddev;
2054	printk(KERN_INFO "md: bind<%s>\n", b);
2055
2056	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2057		goto fail;
2058
2059	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2060	if (sysfs_create_link(&rdev->kobj, ko, "block"))
2061		/* failure here is OK */;
2062	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2063
2064	list_add_rcu(&rdev->same_set, &mddev->disks);
2065	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2066
2067	/* May as well allow recovery to be retried once */
2068	mddev->recovery_disabled++;
2069
2070	return 0;
2071
2072 fail:
2073	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2074	       b, mdname(mddev));
2075	return err;
2076}
2077
2078static void md_delayed_delete(struct work_struct *ws)
2079{
2080	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2081	kobject_del(&rdev->kobj);
2082	kobject_put(&rdev->kobj);
2083}
2084
2085static void unbind_rdev_from_array(struct md_rdev *rdev)
2086{
2087	char b[BDEVNAME_SIZE];
2088
2089	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2090	list_del_rcu(&rdev->same_set);
2091	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2092	rdev->mddev = NULL;
2093	sysfs_remove_link(&rdev->kobj, "block");
2094	sysfs_put(rdev->sysfs_state);
2095	rdev->sysfs_state = NULL;
2096	rdev->badblocks.count = 0;
2097	/* We need to delay this, otherwise we can deadlock when
2098	 * writing to 'remove' to "dev/state".  We also need
2099	 * to delay it due to rcu usage.
2100	 */
2101	synchronize_rcu();
2102	INIT_WORK(&rdev->del_work, md_delayed_delete);
2103	kobject_get(&rdev->kobj);
2104	queue_work(md_misc_wq, &rdev->del_work);
2105}
2106
2107/*
2108 * prevent the device from being mounted, repartitioned or
2109 * otherwise reused by a RAID array (or any other kernel
2110 * subsystem), by bd_claiming the device.
2111 */
2112static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2113{
2114	int err = 0;
2115	struct block_device *bdev;
2116	char b[BDEVNAME_SIZE];
2117
2118	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2119				 shared ? (struct md_rdev *)lock_rdev : rdev);
2120	if (IS_ERR(bdev)) {
2121		printk(KERN_ERR "md: could not open %s.\n",
2122			__bdevname(dev, b));
2123		return PTR_ERR(bdev);
2124	}
2125	rdev->bdev = bdev;
2126	return err;
2127}
2128
2129static void unlock_rdev(struct md_rdev *rdev)
2130{
2131	struct block_device *bdev = rdev->bdev;
2132	rdev->bdev = NULL;
2133	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2134}
2135
2136void md_autodetect_dev(dev_t dev);
2137
2138static void export_rdev(struct md_rdev *rdev)
2139{
2140	char b[BDEVNAME_SIZE];
2141
2142	printk(KERN_INFO "md: export_rdev(%s)\n",
2143		bdevname(rdev->bdev,b));
2144	md_rdev_clear(rdev);
2145#ifndef MODULE
2146	if (test_bit(AutoDetected, &rdev->flags))
2147		md_autodetect_dev(rdev->bdev->bd_dev);
2148#endif
2149	unlock_rdev(rdev);
2150	kobject_put(&rdev->kobj);
2151}
2152
2153static void kick_rdev_from_array(struct md_rdev *rdev)
2154{
2155	unbind_rdev_from_array(rdev);
2156	export_rdev(rdev);
2157}
2158
2159static void export_array(struct mddev *mddev)
2160{
2161	struct md_rdev *rdev;
2162
2163	while (!list_empty(&mddev->disks)) {
2164		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2165					same_set);
2166		kick_rdev_from_array(rdev);
2167	}
2168	mddev->raid_disks = 0;
2169	mddev->major_version = 0;
2170}
2171
2172static void sync_sbs(struct mddev *mddev, int nospares)
2173{
2174	/* Update each superblock (in-memory image), but
2175	 * if we are allowed to, skip spares which already
2176	 * have the right event counter, or have one earlier
2177	 * (which would mean they aren't being marked as dirty
2178	 * with the rest of the array)
2179	 */
2180	struct md_rdev *rdev;
2181	rdev_for_each(rdev, mddev) {
2182		if (rdev->sb_events == mddev->events ||
2183		    (nospares &&
2184		     rdev->raid_disk < 0 &&
2185		     rdev->sb_events+1 == mddev->events)) {
2186			/* Don't update this superblock */
2187			rdev->sb_loaded = 2;
2188		} else {
2189			sync_super(mddev, rdev);
2190			rdev->sb_loaded = 1;
2191		}
2192	}
2193}
2194
2195static void md_update_sb(struct mddev *mddev, int force_change)
2196{
2197	struct md_rdev *rdev;
2198	int sync_req;
2199	int nospares = 0;
2200	int any_badblocks_changed = 0;
2201
2202	if (mddev->ro) {
2203		if (force_change)
2204			set_bit(MD_CHANGE_DEVS, &mddev->flags);
2205		return;
2206	}
2207repeat:
2208	/* First make sure individual recovery_offsets are correct */
2209	rdev_for_each(rdev, mddev) {
2210		if (rdev->raid_disk >= 0 &&
2211		    mddev->delta_disks >= 0 &&
2212		    !test_bit(In_sync, &rdev->flags) &&
2213		    mddev->curr_resync_completed > rdev->recovery_offset)
2214				rdev->recovery_offset = mddev->curr_resync_completed;
2215
2216	}
2217	if (!mddev->persistent) {
2218		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2219		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2220		if (!mddev->external) {
2221			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2222			rdev_for_each(rdev, mddev) {
2223				if (rdev->badblocks.changed) {
2224					rdev->badblocks.changed = 0;
2225					md_ack_all_badblocks(&rdev->badblocks);
2226					md_error(mddev, rdev);
2227				}
2228				clear_bit(Blocked, &rdev->flags);
2229				clear_bit(BlockedBadBlocks, &rdev->flags);
2230				wake_up(&rdev->blocked_wait);
2231			}
2232		}
2233		wake_up(&mddev->sb_wait);
2234		return;
2235	}
2236
2237	spin_lock_irq(&mddev->write_lock);
2238
2239	mddev->utime = get_seconds();
2240
2241	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2242		force_change = 1;
2243	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2244		/* just a clean<-> dirty transition, possibly leave spares alone,
2245		 * though if events isn't the right even/odd, we will have to do
2246		 * spares after all
2247		 */
2248		nospares = 1;
2249	if (force_change)
2250		nospares = 0;
2251	if (mddev->degraded)
2252		/* If the array is degraded, then skipping spares is both
2253		 * dangerous and fairly pointless.
2254		 * Dangerous because a device that was removed from the array
2255		 * might have a event_count that still looks up-to-date,
2256		 * so it can be re-added without a resync.
2257		 * Pointless because if there are any spares to skip,
2258		 * then a recovery will happen and soon that array won't
2259		 * be degraded any more and the spare can go back to sleep then.
2260		 */
2261		nospares = 0;
2262
2263	sync_req = mddev->in_sync;
2264
2265	/* If this is just a dirty<->clean transition, and the array is clean
2266	 * and 'events' is odd, we can roll back to the previous clean state */
2267	if (nospares
2268	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2269	    && mddev->can_decrease_events
2270	    && mddev->events != 1) {
2271		mddev->events--;
2272		mddev->can_decrease_events = 0;
2273	} else {
2274		/* otherwise we have to go forward and ... */
2275		mddev->events ++;
2276		mddev->can_decrease_events = nospares;
2277	}
2278
2279	/*
2280	 * This 64-bit counter should never wrap.
2281	 * Either we are in around ~1 trillion A.C., assuming
2282	 * 1 reboot per second, or we have a bug...
2283	 */
2284	WARN_ON(mddev->events == 0);
2285
2286	rdev_for_each(rdev, mddev) {
2287		if (rdev->badblocks.changed)
2288			any_badblocks_changed++;
2289		if (test_bit(Faulty, &rdev->flags))
2290			set_bit(FaultRecorded, &rdev->flags);
2291	}
2292
2293	sync_sbs(mddev, nospares);
2294	spin_unlock_irq(&mddev->write_lock);
2295
2296	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2297		 mdname(mddev), mddev->in_sync);
2298
2299	bitmap_update_sb(mddev->bitmap);
2300	rdev_for_each(rdev, mddev) {
2301		char b[BDEVNAME_SIZE];
2302
2303		if (rdev->sb_loaded != 1)
2304			continue; /* no noise on spare devices */
2305
2306		if (!test_bit(Faulty, &rdev->flags)) {
2307			md_super_write(mddev,rdev,
2308				       rdev->sb_start, rdev->sb_size,
2309				       rdev->sb_page);
2310			pr_debug("md: (write) %s's sb offset: %llu\n",
2311				 bdevname(rdev->bdev, b),
2312				 (unsigned long long)rdev->sb_start);
2313			rdev->sb_events = mddev->events;
2314			if (rdev->badblocks.size) {
2315				md_super_write(mddev, rdev,
2316					       rdev->badblocks.sector,
2317					       rdev->badblocks.size << 9,
2318					       rdev->bb_page);
2319				rdev->badblocks.size = 0;
2320			}
2321
2322		} else
2323			pr_debug("md: %s (skipping faulty)\n",
2324				 bdevname(rdev->bdev, b));
2325
2326		if (mddev->level == LEVEL_MULTIPATH)
2327			/* only need to write one superblock... */
2328			break;
2329	}
2330	md_super_wait(mddev);
2331	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2332
2333	spin_lock_irq(&mddev->write_lock);
2334	if (mddev->in_sync != sync_req ||
2335	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2336		/* have to write it out again */
2337		spin_unlock_irq(&mddev->write_lock);
2338		goto repeat;
2339	}
2340	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2341	spin_unlock_irq(&mddev->write_lock);
2342	wake_up(&mddev->sb_wait);
2343	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2344		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2345
2346	rdev_for_each(rdev, mddev) {
2347		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2348			clear_bit(Blocked, &rdev->flags);
2349
2350		if (any_badblocks_changed)
2351			md_ack_all_badblocks(&rdev->badblocks);
2352		clear_bit(BlockedBadBlocks, &rdev->flags);
2353		wake_up(&rdev->blocked_wait);
2354	}
2355}
2356
2357/* words written to sysfs files may, or may not, be \n terminated.
2358 * We want to accept with case. For this we use cmd_match.
2359 */
2360static int cmd_match(const char *cmd, const char *str)
2361{
2362	/* See if cmd, written into a sysfs file, matches
2363	 * str.  They must either be the same, or cmd can
2364	 * have a trailing newline
2365	 */
2366	while (*cmd && *str && *cmd == *str) {
2367		cmd++;
2368		str++;
2369	}
2370	if (*cmd == '\n')
2371		cmd++;
2372	if (*str || *cmd)
2373		return 0;
2374	return 1;
2375}
2376
2377struct rdev_sysfs_entry {
2378	struct attribute attr;
2379	ssize_t (*show)(struct md_rdev *, char *);
2380	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2381};
2382
2383static ssize_t
2384state_show(struct md_rdev *rdev, char *page)
2385{
2386	char *sep = "";
2387	size_t len = 0;
2388
2389	if (test_bit(Faulty, &rdev->flags) ||
2390	    rdev->badblocks.unacked_exist) {
2391		len+= sprintf(page+len, "%sfaulty",sep);
2392		sep = ",";
2393	}
2394	if (test_bit(In_sync, &rdev->flags)) {
2395		len += sprintf(page+len, "%sin_sync",sep);
2396		sep = ",";
2397	}
2398	if (test_bit(WriteMostly, &rdev->flags)) {
2399		len += sprintf(page+len, "%swrite_mostly",sep);
2400		sep = ",";
2401	}
2402	if (test_bit(Blocked, &rdev->flags) ||
2403	    (rdev->badblocks.unacked_exist
2404	     && !test_bit(Faulty, &rdev->flags))) {
2405		len += sprintf(page+len, "%sblocked", sep);
2406		sep = ",";
2407	}
2408	if (!test_bit(Faulty, &rdev->flags) &&
2409	    !test_bit(In_sync, &rdev->flags)) {
2410		len += sprintf(page+len, "%sspare", sep);
2411		sep = ",";
2412	}
2413	if (test_bit(WriteErrorSeen, &rdev->flags)) {
2414		len += sprintf(page+len, "%swrite_error", sep);
2415		sep = ",";
2416	}
2417	if (test_bit(WantReplacement, &rdev->flags)) {
2418		len += sprintf(page+len, "%swant_replacement", sep);
2419		sep = ",";
2420	}
2421	if (test_bit(Replacement, &rdev->flags)) {
2422		len += sprintf(page+len, "%sreplacement", sep);
2423		sep = ",";
2424	}
2425
2426	return len+sprintf(page+len, "\n");
2427}
2428
2429static ssize_t
2430state_store(struct md_rdev *rdev, const char *buf, size_t len)
2431{
2432	/* can write
2433	 *  faulty  - simulates an error
2434	 *  remove  - disconnects the device
2435	 *  writemostly - sets write_mostly
2436	 *  -writemostly - clears write_mostly
2437	 *  blocked - sets the Blocked flags
2438	 *  -blocked - clears the Blocked and possibly simulates an error
2439	 *  insync - sets Insync providing device isn't active
2440	 *  -insync - clear Insync for a device with a slot assigned,
2441	 *            so that it gets rebuilt based on bitmap
2442	 *  write_error - sets WriteErrorSeen
2443	 *  -write_error - clears WriteErrorSeen
2444	 */
2445	int err = -EINVAL;
2446	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2447		md_error(rdev->mddev, rdev);
2448		if (test_bit(Faulty, &rdev->flags))
2449			err = 0;
2450		else
2451			err = -EBUSY;
2452	} else if (cmd_match(buf, "remove")) {
2453		if (rdev->raid_disk >= 0)
2454			err = -EBUSY;
2455		else {
2456			struct mddev *mddev = rdev->mddev;
2457			kick_rdev_from_array(rdev);
2458			if (mddev->pers)
2459				md_update_sb(mddev, 1);
2460			md_new_event(mddev);
2461			err = 0;
2462		}
2463	} else if (cmd_match(buf, "writemostly")) {
2464		set_bit(WriteMostly, &rdev->flags);
2465		err = 0;
2466	} else if (cmd_match(buf, "-writemostly")) {
2467		clear_bit(WriteMostly, &rdev->flags);
2468		err = 0;
2469	} else if (cmd_match(buf, "blocked")) {
2470		set_bit(Blocked, &rdev->flags);
2471		err = 0;
2472	} else if (cmd_match(buf, "-blocked")) {
2473		if (!test_bit(Faulty, &rdev->flags) &&
2474		    rdev->badblocks.unacked_exist) {
2475			/* metadata handler doesn't understand badblocks,
2476			 * so we need to fail the device
2477			 */
2478			md_error(rdev->mddev, rdev);
2479		}
2480		clear_bit(Blocked, &rdev->flags);
2481		clear_bit(BlockedBadBlocks, &rdev->flags);
2482		wake_up(&rdev->blocked_wait);
2483		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2484		md_wakeup_thread(rdev->mddev->thread);
2485
2486		err = 0;
2487	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2488		set_bit(In_sync, &rdev->flags);
2489		err = 0;
2490	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2491		if (rdev->mddev->pers == NULL) {
2492			clear_bit(In_sync, &rdev->flags);
2493			rdev->saved_raid_disk = rdev->raid_disk;
2494			rdev->raid_disk = -1;
2495			err = 0;
2496		}
2497	} else if (cmd_match(buf, "write_error")) {
2498		set_bit(WriteErrorSeen, &rdev->flags);
2499		err = 0;
2500	} else if (cmd_match(buf, "-write_error")) {
2501		clear_bit(WriteErrorSeen, &rdev->flags);
2502		err = 0;
2503	} else if (cmd_match(buf, "want_replacement")) {
2504		/* Any non-spare device that is not a replacement can
2505		 * become want_replacement at any time, but we then need to
2506		 * check if recovery is needed.
2507		 */
2508		if (rdev->raid_disk >= 0 &&
2509		    !test_bit(Replacement, &rdev->flags))
2510			set_bit(WantReplacement, &rdev->flags);
2511		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2512		md_wakeup_thread(rdev->mddev->thread);
2513		err = 0;
2514	} else if (cmd_match(buf, "-want_replacement")) {
2515		/* Clearing 'want_replacement' is always allowed.
2516		 * Once replacements starts it is too late though.
2517		 */
2518		err = 0;
2519		clear_bit(WantReplacement, &rdev->flags);
2520	} else if (cmd_match(buf, "replacement")) {
2521		/* Can only set a device as a replacement when array has not
2522		 * yet been started.  Once running, replacement is automatic
2523		 * from spares, or by assigning 'slot'.
2524		 */
2525		if (rdev->mddev->pers)
2526			err = -EBUSY;
2527		else {
2528			set_bit(Replacement, &rdev->flags);
2529			err = 0;
2530		}
2531	} else if (cmd_match(buf, "-replacement")) {
2532		/* Similarly, can only clear Replacement before start */
2533		if (rdev->mddev->pers)
2534			err = -EBUSY;
2535		else {
2536			clear_bit(Replacement, &rdev->flags);
2537			err = 0;
2538		}
2539	}
2540	if (!err)
2541		sysfs_notify_dirent_safe(rdev->sysfs_state);
2542	return err ? err : len;
2543}
2544static struct rdev_sysfs_entry rdev_state =
2545__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2546
2547static ssize_t
2548errors_show(struct md_rdev *rdev, char *page)
2549{
2550	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2551}
2552
2553static ssize_t
2554errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2555{
2556	char *e;
2557	unsigned long n = simple_strtoul(buf, &e, 10);
2558	if (*buf && (*e == 0 || *e == '\n')) {
2559		atomic_set(&rdev->corrected_errors, n);
2560		return len;
2561	}
2562	return -EINVAL;
2563}
2564static struct rdev_sysfs_entry rdev_errors =
2565__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2566
2567static ssize_t
2568slot_show(struct md_rdev *rdev, char *page)
2569{
2570	if (rdev->raid_disk < 0)
2571		return sprintf(page, "none\n");
2572	else
2573		return sprintf(page, "%d\n", rdev->raid_disk);
2574}
2575
2576static ssize_t
2577slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2578{
2579	char *e;
2580	int err;
2581	int slot = simple_strtoul(buf, &e, 10);
2582	if (strncmp(buf, "none", 4)==0)
2583		slot = -1;
2584	else if (e==buf || (*e && *e!= '\n'))
2585		return -EINVAL;
2586	if (rdev->mddev->pers && slot == -1) {
2587		/* Setting 'slot' on an active array requires also
2588		 * updating the 'rd%d' link, and communicating
2589		 * with the personality with ->hot_*_disk.
2590		 * For now we only support removing
2591		 * failed/spare devices.  This normally happens automatically,
2592		 * but not when the metadata is externally managed.
2593		 */
2594		if (rdev->raid_disk == -1)
2595			return -EEXIST;
2596		/* personality does all needed checks */
2597		if (rdev->mddev->pers->hot_remove_disk == NULL)
2598			return -EINVAL;
2599		clear_bit(Blocked, &rdev->flags);
2600		remove_and_add_spares(rdev->mddev, rdev);
2601		if (rdev->raid_disk >= 0)
2602			return -EBUSY;
2603		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2604		md_wakeup_thread(rdev->mddev->thread);
2605	} else if (rdev->mddev->pers) {
2606		/* Activating a spare .. or possibly reactivating
2607		 * if we ever get bitmaps working here.
2608		 */
2609
2610		if (rdev->raid_disk != -1)
2611			return -EBUSY;
2612
2613		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2614			return -EBUSY;
2615
2616		if (rdev->mddev->pers->hot_add_disk == NULL)
2617			return -EINVAL;
2618
2619		if (slot >= rdev->mddev->raid_disks &&
2620		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2621			return -ENOSPC;
2622
2623		rdev->raid_disk = slot;
2624		if (test_bit(In_sync, &rdev->flags))
2625			rdev->saved_raid_disk = slot;
2626		else
2627			rdev->saved_raid_disk = -1;
2628		clear_bit(In_sync, &rdev->flags);
2629		clear_bit(Bitmap_sync, &rdev->flags);
2630		err = rdev->mddev->pers->
2631			hot_add_disk(rdev->mddev, rdev);
2632		if (err) {
2633			rdev->raid_disk = -1;
2634			return err;
2635		} else
2636			sysfs_notify_dirent_safe(rdev->sysfs_state);
2637		if (sysfs_link_rdev(rdev->mddev, rdev))
2638			/* failure here is OK */;
2639		/* don't wakeup anyone, leave that to userspace. */
2640	} else {
2641		if (slot >= rdev->mddev->raid_disks &&
2642		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2643			return -ENOSPC;
2644		rdev->raid_disk = slot;
2645		/* assume it is working */
2646		clear_bit(Faulty, &rdev->flags);
2647		clear_bit(WriteMostly, &rdev->flags);
2648		set_bit(In_sync, &rdev->flags);
2649		sysfs_notify_dirent_safe(rdev->sysfs_state);
2650	}
2651	return len;
2652}
2653
2654static struct rdev_sysfs_entry rdev_slot =
2655__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2656
2657static ssize_t
2658offset_show(struct md_rdev *rdev, char *page)
2659{
2660	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2661}
2662
2663static ssize_t
2664offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2665{
2666	unsigned long long offset;
2667	if (kstrtoull(buf, 10, &offset) < 0)
2668		return -EINVAL;
2669	if (rdev->mddev->pers && rdev->raid_disk >= 0)
2670		return -EBUSY;
2671	if (rdev->sectors && rdev->mddev->external)
2672		/* Must set offset before size, so overlap checks
2673		 * can be sane */
2674		return -EBUSY;
2675	rdev->data_offset = offset;
2676	rdev->new_data_offset = offset;
2677	return len;
2678}
2679
2680static struct rdev_sysfs_entry rdev_offset =
2681__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2682
2683static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2684{
2685	return sprintf(page, "%llu\n",
2686		       (unsigned long long)rdev->new_data_offset);
2687}
2688
2689static ssize_t new_offset_store(struct md_rdev *rdev,
2690				const char *buf, size_t len)
2691{
2692	unsigned long long new_offset;
2693	struct mddev *mddev = rdev->mddev;
2694
2695	if (kstrtoull(buf, 10, &new_offset) < 0)
2696		return -EINVAL;
2697
2698	if (mddev->sync_thread)
2699		return -EBUSY;
2700	if (new_offset == rdev->data_offset)
2701		/* reset is always permitted */
2702		;
2703	else if (new_offset > rdev->data_offset) {
2704		/* must not push array size beyond rdev_sectors */
2705		if (new_offset - rdev->data_offset
2706		    + mddev->dev_sectors > rdev->sectors)
2707				return -E2BIG;
2708	}
2709	/* Metadata worries about other space details. */
2710
2711	/* decreasing the offset is inconsistent with a backwards
2712	 * reshape.
2713	 */
2714	if (new_offset < rdev->data_offset &&
2715	    mddev->reshape_backwards)
2716		return -EINVAL;
2717	/* Increasing offset is inconsistent with forwards
2718	 * reshape.  reshape_direction should be set to
2719	 * 'backwards' first.
2720	 */
2721	if (new_offset > rdev->data_offset &&
2722	    !mddev->reshape_backwards)
2723		return -EINVAL;
2724
2725	if (mddev->pers && mddev->persistent &&
2726	    !super_types[mddev->major_version]
2727	    .allow_new_offset(rdev, new_offset))
2728		return -E2BIG;
2729	rdev->new_data_offset = new_offset;
2730	if (new_offset > rdev->data_offset)
2731		mddev->reshape_backwards = 1;
2732	else if (new_offset < rdev->data_offset)
2733		mddev->reshape_backwards = 0;
2734
2735	return len;
2736}
2737static struct rdev_sysfs_entry rdev_new_offset =
2738__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2739
2740static ssize_t
2741rdev_size_show(struct md_rdev *rdev, char *page)
2742{
2743	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2744}
2745
2746static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2747{
2748	/* check if two start/length pairs overlap */
2749	if (s1+l1 <= s2)
2750		return 0;
2751	if (s2+l2 <= s1)
2752		return 0;
2753	return 1;
2754}
2755
2756static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2757{
2758	unsigned long long blocks;
2759	sector_t new;
2760
2761	if (kstrtoull(buf, 10, &blocks) < 0)
2762		return -EINVAL;
2763
2764	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2765		return -EINVAL; /* sector conversion overflow */
2766
2767	new = blocks * 2;
2768	if (new != blocks * 2)
2769		return -EINVAL; /* unsigned long long to sector_t overflow */
2770
2771	*sectors = new;
2772	return 0;
2773}
2774
2775static ssize_t
2776rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2777{
2778	struct mddev *my_mddev = rdev->mddev;
2779	sector_t oldsectors = rdev->sectors;
2780	sector_t sectors;
2781
2782	if (strict_blocks_to_sectors(buf, &sectors) < 0)
2783		return -EINVAL;
2784	if (rdev->data_offset != rdev->new_data_offset)
2785		return -EINVAL; /* too confusing */
2786	if (my_mddev->pers && rdev->raid_disk >= 0) {
2787		if (my_mddev->persistent) {
2788			sectors = super_types[my_mddev->major_version].
2789				rdev_size_change(rdev, sectors);
2790			if (!sectors)
2791				return -EBUSY;
2792		} else if (!sectors)
2793			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2794				rdev->data_offset;
2795		if (!my_mddev->pers->resize)
2796			/* Cannot change size for RAID0 or Linear etc */
2797			return -EINVAL;
2798	}
2799	if (sectors < my_mddev->dev_sectors)
2800		return -EINVAL; /* component must fit device */
2801
2802	rdev->sectors = sectors;
2803	if (sectors > oldsectors && my_mddev->external) {
2804		/* Need to check that all other rdevs with the same
2805		 * ->bdev do not overlap.  'rcu' is sufficient to walk
2806		 * the rdev lists safely.
2807		 * This check does not provide a hard guarantee, it
2808		 * just helps avoid dangerous mistakes.
2809		 */
2810		struct mddev *mddev;
2811		int overlap = 0;
2812		struct list_head *tmp;
2813
2814		rcu_read_lock();
2815		for_each_mddev(mddev, tmp) {
2816			struct md_rdev *rdev2;
2817
2818			rdev_for_each(rdev2, mddev)
2819				if (rdev->bdev == rdev2->bdev &&
2820				    rdev != rdev2 &&
2821				    overlaps(rdev->data_offset, rdev->sectors,
2822					     rdev2->data_offset,
2823					     rdev2->sectors)) {
2824					overlap = 1;
2825					break;
2826				}
2827			if (overlap) {
2828				mddev_put(mddev);
2829				break;
2830			}
2831		}
2832		rcu_read_unlock();
2833		if (overlap) {
2834			/* Someone else could have slipped in a size
2835			 * change here, but doing so is just silly.
2836			 * We put oldsectors back because we *know* it is
2837			 * safe, and trust userspace not to race with
2838			 * itself
2839			 */
2840			rdev->sectors = oldsectors;
2841			return -EBUSY;
2842		}
2843	}
2844	return len;
2845}
2846
2847static struct rdev_sysfs_entry rdev_size =
2848__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2849
2850static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
2851{
2852	unsigned long long recovery_start = rdev->recovery_offset;
2853
2854	if (test_bit(In_sync, &rdev->flags) ||
2855	    recovery_start == MaxSector)
2856		return sprintf(page, "none\n");
2857
2858	return sprintf(page, "%llu\n", recovery_start);
2859}
2860
2861static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
2862{
2863	unsigned long long recovery_start;
2864
2865	if (cmd_match(buf, "none"))
2866		recovery_start = MaxSector;
2867	else if (kstrtoull(buf, 10, &recovery_start))
2868		return -EINVAL;
2869
2870	if (rdev->mddev->pers &&
2871	    rdev->raid_disk >= 0)
2872		return -EBUSY;
2873
2874	rdev->recovery_offset = recovery_start;
2875	if (recovery_start == MaxSector)
2876		set_bit(In_sync, &rdev->flags);
2877	else
2878		clear_bit(In_sync, &rdev->flags);
2879	return len;
2880}
2881
2882static struct rdev_sysfs_entry rdev_recovery_start =
2883__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2884
2885static ssize_t
2886badblocks_show(struct badblocks *bb, char *page, int unack);
2887static ssize_t
2888badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2889
2890static ssize_t bb_show(struct md_rdev *rdev, char *page)
2891{
2892	return badblocks_show(&rdev->badblocks, page, 0);
2893}
2894static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
2895{
2896	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2897	/* Maybe that ack was all we needed */
2898	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2899		wake_up(&rdev->blocked_wait);
2900	return rv;
2901}
2902static struct rdev_sysfs_entry rdev_bad_blocks =
2903__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2904
2905static ssize_t ubb_show(struct md_rdev *rdev, char *page)
2906{
2907	return badblocks_show(&rdev->badblocks, page, 1);
2908}
2909static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
2910{
2911	return badblocks_store(&rdev->badblocks, page, len, 1);
2912}
2913static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2914__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2915
2916static struct attribute *rdev_default_attrs[] = {
2917	&rdev_state.attr,
2918	&rdev_errors.attr,
2919	&rdev_slot.attr,
2920	&rdev_offset.attr,
2921	&rdev_new_offset.attr,
2922	&rdev_size.attr,
2923	&rdev_recovery_start.attr,
2924	&rdev_bad_blocks.attr,
2925	&rdev_unack_bad_blocks.attr,
2926	NULL,
2927};
2928static ssize_t
2929rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2930{
2931	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2932	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
2933	struct mddev *mddev = rdev->mddev;
2934	ssize_t rv;
2935
2936	if (!entry->show)
2937		return -EIO;
2938
2939	rv = mddev ? mddev_lock(mddev) : -EBUSY;
2940	if (!rv) {
2941		if (rdev->mddev == NULL)
2942			rv = -EBUSY;
2943		else
2944			rv = entry->show(rdev, page);
2945		mddev_unlock(mddev);
2946	}
2947	return rv;
2948}
2949
2950static ssize_t
2951rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2952	      const char *page, size_t length)
2953{
2954	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2955	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
2956	ssize_t rv;
2957	struct mddev *mddev = rdev->mddev;
2958
2959	if (!entry->store)
2960		return -EIO;
2961	if (!capable(CAP_SYS_ADMIN))
2962		return -EACCES;
2963	rv = mddev ? mddev_lock(mddev): -EBUSY;
2964	if (!rv) {
2965		if (rdev->mddev == NULL)
2966			rv = -EBUSY;
2967		else
2968			rv = entry->store(rdev, page, length);
2969		mddev_unlock(mddev);
2970	}
2971	return rv;
2972}
2973
2974static void rdev_free(struct kobject *ko)
2975{
2976	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
2977	kfree(rdev);
2978}
2979static const struct sysfs_ops rdev_sysfs_ops = {
2980	.show		= rdev_attr_show,
2981	.store		= rdev_attr_store,
2982};
2983static struct kobj_type rdev_ktype = {
2984	.release	= rdev_free,
2985	.sysfs_ops	= &rdev_sysfs_ops,
2986	.default_attrs	= rdev_default_attrs,
2987};
2988
2989int md_rdev_init(struct md_rdev *rdev)
2990{
2991	rdev->desc_nr = -1;
2992	rdev->saved_raid_disk = -1;
2993	rdev->raid_disk = -1;
2994	rdev->flags = 0;
2995	rdev->data_offset = 0;
2996	rdev->new_data_offset = 0;
2997	rdev->sb_events = 0;
2998	rdev->last_read_error.tv_sec  = 0;
2999	rdev->last_read_error.tv_nsec = 0;
3000	rdev->sb_loaded = 0;
3001	rdev->bb_page = NULL;
3002	atomic_set(&rdev->nr_pending, 0);
3003	atomic_set(&rdev->read_errors, 0);
3004	atomic_set(&rdev->corrected_errors, 0);
3005
3006	INIT_LIST_HEAD(&rdev->same_set);
3007	init_waitqueue_head(&rdev->blocked_wait);
3008
3009	/* Add space to store bad block list.
3010	 * This reserves the space even on arrays where it cannot
3011	 * be used - I wonder if that matters
3012	 */
3013	rdev->badblocks.count = 0;
3014	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3015	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3016	seqlock_init(&rdev->badblocks.lock);
3017	if (rdev->badblocks.page == NULL)
3018		return -ENOMEM;
3019
3020	return 0;
3021}
3022EXPORT_SYMBOL_GPL(md_rdev_init);
3023/*
3024 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3025 *
3026 * mark the device faulty if:
3027 *
3028 *   - the device is nonexistent (zero size)
3029 *   - the device has no valid superblock
3030 *
3031 * a faulty rdev _never_ has rdev->sb set.
3032 */
3033static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3034{
3035	char b[BDEVNAME_SIZE];
3036	int err;
3037	struct md_rdev *rdev;
3038	sector_t size;
3039
3040	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3041	if (!rdev) {
3042		printk(KERN_ERR "md: could not alloc mem for new device!\n");
3043		return ERR_PTR(-ENOMEM);
3044	}
3045
3046	err = md_rdev_init(rdev);
3047	if (err)
3048		goto abort_free;
3049	err = alloc_disk_sb(rdev);
3050	if (err)
3051		goto abort_free;
3052
3053	err = lock_rdev(rdev, newdev, super_format == -2);
3054	if (err)
3055		goto abort_free;
3056
3057	kobject_init(&rdev->kobj, &rdev_ktype);
3058
3059	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3060	if (!size) {
3061		printk(KERN_WARNING
3062			"md: %s has zero or unknown size, marking faulty!\n",
3063			bdevname(rdev->bdev,b));
3064		err = -EINVAL;
3065		goto abort_free;
3066	}
3067
3068	if (super_format >= 0) {
3069		err = super_types[super_format].
3070			load_super(rdev, NULL, super_minor);
3071		if (err == -EINVAL) {
3072			printk(KERN_WARNING
3073				"md: %s does not have a valid v%d.%d "
3074			       "superblock, not importing!\n",
3075				bdevname(rdev->bdev,b),
3076			       super_format, super_minor);
3077			goto abort_free;
3078		}
3079		if (err < 0) {
3080			printk(KERN_WARNING
3081				"md: could not read %s's sb, not importing!\n",
3082				bdevname(rdev->bdev,b));
3083			goto abort_free;
3084		}
3085	}
3086
3087	return rdev;
3088
3089abort_free:
3090	if (rdev->bdev)
3091		unlock_rdev(rdev);
3092	md_rdev_clear(rdev);
3093	kfree(rdev);
3094	return ERR_PTR(err);
3095}
3096
3097/*
3098 * Check a full RAID array for plausibility
3099 */
3100
3101static void analyze_sbs(struct mddev *mddev)
3102{
3103	int i;
3104	struct md_rdev *rdev, *freshest, *tmp;
3105	char b[BDEVNAME_SIZE];
3106
3107	freshest = NULL;
3108	rdev_for_each_safe(rdev, tmp, mddev)
3109		switch (super_types[mddev->major_version].
3110			load_super(rdev, freshest, mddev->minor_version)) {
3111		case 1:
3112			freshest = rdev;
3113			break;
3114		case 0:
3115			break;
3116		default:
3117			printk( KERN_ERR \
3118				"md: fatal superblock inconsistency in %s"
3119				" -- removing from array\n",
3120				bdevname(rdev->bdev,b));
3121			kick_rdev_from_array(rdev);
3122		}
3123
3124	super_types[mddev->major_version].
3125		validate_super(mddev, freshest);
3126
3127	i = 0;
3128	rdev_for_each_safe(rdev, tmp, mddev) {
3129		if (mddev->max_disks &&
3130		    (rdev->desc_nr >= mddev->max_disks ||
3131		     i > mddev->max_disks)) {
3132			printk(KERN_WARNING
3133			       "md: %s: %s: only %d devices permitted\n",
3134			       mdname(mddev), bdevname(rdev->bdev, b),
3135			       mddev->max_disks);
3136			kick_rdev_from_array(rdev);
3137			continue;
3138		}
3139		if (rdev != freshest)
3140			if (super_types[mddev->major_version].
3141			    validate_super(mddev, rdev)) {
3142				printk(KERN_WARNING "md: kicking non-fresh %s"
3143					" from array!\n",
3144					bdevname(rdev->bdev,b));
3145				kick_rdev_from_array(rdev);
3146				continue;
3147			}
3148		if (mddev->level == LEVEL_MULTIPATH) {
3149			rdev->desc_nr = i++;
3150			rdev->raid_disk = rdev->desc_nr;
3151			set_bit(In_sync, &rdev->flags);
3152		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3153			rdev->raid_disk = -1;
3154			clear_bit(In_sync, &rdev->flags);
3155		}
3156	}
3157}
3158
3159/* Read a fixed-point number.
3160 * Numbers in sysfs attributes should be in "standard" units where
3161 * possible, so time should be in seconds.
3162 * However we internally use a a much smaller unit such as
3163 * milliseconds or jiffies.
3164 * This function takes a decimal number with a possible fractional
3165 * component, and produces an integer which is the result of
3166 * multiplying that number by 10^'scale'.
3167 * all without any floating-point arithmetic.
3168 */
3169int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3170{
3171	unsigned long result = 0;
3172	long decimals = -1;
3173	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3174		if (*cp == '.')
3175			decimals = 0;
3176		else if (decimals < scale) {
3177			unsigned int value;
3178			value = *cp - '0';
3179			result = result * 10 + value;
3180			if (decimals >= 0)
3181				decimals++;
3182		}
3183		cp++;
3184	}
3185	if (*cp == '\n')
3186		cp++;
3187	if (*cp)
3188		return -EINVAL;
3189	if (decimals < 0)
3190		decimals = 0;
3191	while (decimals < scale) {
3192		result *= 10;
3193		decimals ++;
3194	}
3195	*res = result;
3196	return 0;
3197}
3198
3199static void md_safemode_timeout(unsigned long data);
3200
3201static ssize_t
3202safe_delay_show(struct mddev *mddev, char *page)
3203{
3204	int msec = (mddev->safemode_delay*1000)/HZ;
3205	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3206}
3207static ssize_t
3208safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3209{
3210	unsigned long msec;
3211
3212	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3213		return -EINVAL;
3214	if (msec == 0)
3215		mddev->safemode_delay = 0;
3216	else {
3217		unsigned long old_delay = mddev->safemode_delay;
3218		mddev->safemode_delay = (msec*HZ)/1000;
3219		if (mddev->safemode_delay == 0)
3220			mddev->safemode_delay = 1;
3221		if (mddev->safemode_delay < old_delay || old_delay == 0)
3222			md_safemode_timeout((unsigned long)mddev);
3223	}
3224	return len;
3225}
3226static struct md_sysfs_entry md_safe_delay =
3227__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3228
3229static ssize_t
3230level_show(struct mddev *mddev, char *page)
3231{
3232	struct md_personality *p = mddev->pers;
3233	if (p)
3234		return sprintf(page, "%s\n", p->name);
3235	else if (mddev->clevel[0])
3236		return sprintf(page, "%s\n", mddev->clevel);
3237	else if (mddev->level != LEVEL_NONE)
3238		return sprintf(page, "%d\n", mddev->level);
3239	else
3240		return 0;
3241}
3242
3243static ssize_t
3244level_store(struct mddev *mddev, const char *buf, size_t len)
3245{
3246	char clevel[16];
3247	ssize_t rv = len;
3248	struct md_personality *pers;
3249	long level;
3250	void *priv;
3251	struct md_rdev *rdev;
3252
3253	if (mddev->pers == NULL) {
3254		if (len == 0)
3255			return 0;
3256		if (len >= sizeof(mddev->clevel))
3257			return -ENOSPC;
3258		strncpy(mddev->clevel, buf, len);
3259		if (mddev->clevel[len-1] == '\n')
3260			len--;
3261		mddev->clevel[len] = 0;
3262		mddev->level = LEVEL_NONE;
3263		return rv;
3264	}
3265	if (mddev->ro)
3266		return  -EROFS;
3267
3268	/* request to change the personality.  Need to ensure:
3269	 *  - array is not engaged in resync/recovery/reshape
3270	 *  - old personality can be suspended
3271	 *  - new personality will access other array.
3272	 */
3273
3274	if (mddev->sync_thread ||
3275	    mddev->reshape_position != MaxSector ||
3276	    mddev->sysfs_active)
3277		return -EBUSY;
3278
3279	if (!mddev->pers->quiesce) {
3280		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3281		       mdname(mddev), mddev->pers->name);
3282		return -EINVAL;
3283	}
3284
3285	/* Now find the new personality */
3286	if (len == 0 || len >= sizeof(clevel))
3287		return -EINVAL;
3288	strncpy(clevel, buf, len);
3289	if (clevel[len-1] == '\n')
3290		len--;
3291	clevel[len] = 0;
3292	if (kstrtol(clevel, 10, &level))
3293		level = LEVEL_NONE;
3294
3295	if (request_module("md-%s", clevel) != 0)
3296		request_module("md-level-%s", clevel);
3297	spin_lock(&pers_lock);
3298	pers = find_pers(level, clevel);
3299	if (!pers || !try_module_get(pers->owner)) {
3300		spin_unlock(&pers_lock);
3301		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3302		return -EINVAL;
3303	}
3304	spin_unlock(&pers_lock);
3305
3306	if (pers == mddev->pers) {
3307		/* Nothing to do! */
3308		module_put(pers->owner);
3309		return rv;
3310	}
3311	if (!pers->takeover) {
3312		module_put(pers->owner);
3313		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3314		       mdname(mddev), clevel);
3315		return -EINVAL;
3316	}
3317
3318	rdev_for_each(rdev, mddev)
3319		rdev->new_raid_disk = rdev->raid_disk;
3320
3321	/* ->takeover must set new_* and/or delta_disks
3322	 * if it succeeds, and may set them when it fails.
3323	 */
3324	priv = pers->takeover(mddev);
3325	if (IS_ERR(priv)) {
3326		mddev->new_level = mddev->level;
3327		mddev->new_layout = mddev->layout;
3328		mddev->new_chunk_sectors = mddev->chunk_sectors;
3329		mddev->raid_disks -= mddev->delta_disks;
3330		mddev->delta_disks = 0;
3331		mddev->reshape_backwards = 0;
3332		module_put(pers->owner);
3333		printk(KERN_WARNING "md: %s: %s would not accept array\n",
3334		       mdname(mddev), clevel);
3335		return PTR_ERR(priv);
3336	}
3337
3338	/* Looks like we have a winner */
3339	mddev_suspend(mddev);
3340	mddev->pers->stop(mddev);
3341
3342	if (mddev->pers->sync_request == NULL &&
3343	    pers->sync_request != NULL) {
3344		/* need to add the md_redundancy_group */
3345		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3346			printk(KERN_WARNING
3347			       "md: cannot register extra attributes for %s\n",
3348			       mdname(mddev));
3349		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3350	}
3351	if (mddev->pers->sync_request != NULL &&
3352	    pers->sync_request == NULL) {
3353		/* need to remove the md_redundancy_group */
3354		if (mddev->to_remove == NULL)
3355			mddev->to_remove = &md_redundancy_group;
3356	}
3357
3358	if (mddev->pers->sync_request == NULL &&
3359	    mddev->external) {
3360		/* We are converting from a no-redundancy array
3361		 * to a redundancy array and metadata is managed
3362		 * externally so we need to be sure that writes
3363		 * won't block due to a need to transition
3364		 *      clean->dirty
3365		 * until external management is started.
3366		 */
3367		mddev->in_sync = 0;
3368		mddev->safemode_delay = 0;
3369		mddev->safemode = 0;
3370	}
3371
3372	rdev_for_each(rdev, mddev) {
3373		if (rdev->raid_disk < 0)
3374			continue;
3375		if (rdev->new_raid_disk >= mddev->raid_disks)
3376			rdev->new_raid_disk = -1;
3377		if (rdev->new_raid_disk == rdev->raid_disk)
3378			continue;
3379		sysfs_unlink_rdev(mddev, rdev);
3380	}
3381	rdev_for_each(rdev, mddev) {
3382		if (rdev->raid_disk < 0)
3383			continue;
3384		if (rdev->new_raid_disk == rdev->raid_disk)
3385			continue;
3386		rdev->raid_disk = rdev->new_raid_disk;
3387		if (rdev->raid_disk < 0)
3388			clear_bit(In_sync, &rdev->flags);
3389		else {
3390			if (sysfs_link_rdev(mddev, rdev))
3391				printk(KERN_WARNING "md: cannot register rd%d"
3392				       " for %s after level change\n",
3393				       rdev->raid_disk, mdname(mddev));
3394		}
3395	}
3396
3397	module_put(mddev->pers->owner);
3398	mddev->pers = pers;
3399	mddev->private = priv;
3400	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3401	mddev->level = mddev->new_level;
3402	mddev->layout = mddev->new_layout;
3403	mddev->chunk_sectors = mddev->new_chunk_sectors;
3404	mddev->delta_disks = 0;
3405	mddev->reshape_backwards = 0;
3406	mddev->degraded = 0;
3407	if (mddev->pers->sync_request == NULL) {
3408		/* this is now an array without redundancy, so
3409		 * it must always be in_sync
3410		 */
3411		mddev->in_sync = 1;
3412		del_timer_sync(&mddev->safemode_timer);
3413	}
3414	blk_set_stacking_limits(&mddev->queue->limits);
3415	pers->run(mddev);
3416	set_bit(MD_CHANGE_DEVS, &mddev->flags);
3417	mddev_resume(mddev);
3418	if (!mddev->thread)
3419		md_update_sb(mddev, 1);
3420	sysfs_notify(&mddev->kobj, NULL, "level");
3421	md_new_event(mddev);
3422	return rv;
3423}
3424
3425static struct md_sysfs_entry md_level =
3426__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3427
3428static ssize_t
3429layout_show(struct mddev *mddev, char *page)
3430{
3431	/* just a number, not meaningful for all levels */
3432	if (mddev->reshape_position != MaxSector &&
3433	    mddev->layout != mddev->new_layout)
3434		return sprintf(page, "%d (%d)\n",
3435			       mddev->new_layout, mddev->layout);
3436	return sprintf(page, "%d\n", mddev->layout);
3437}
3438
3439static ssize_t
3440layout_store(struct mddev *mddev, const char *buf, size_t len)
3441{
3442	char *e;
3443	unsigned long n = simple_strtoul(buf, &e, 10);
3444
3445	if (!*buf || (*e && *e != '\n'))
3446		return -EINVAL;
3447
3448	if (mddev->pers) {
3449		int err;
3450		if (mddev->pers->check_reshape == NULL)
3451			return -EBUSY;
3452		if (mddev->ro)
3453			return -EROFS;
3454		mddev->new_layout = n;
3455		err = mddev->pers->check_reshape(mddev);
3456		if (err) {
3457			mddev->new_layout = mddev->layout;
3458			return err;
3459		}
3460	} else {
3461		mddev->new_layout = n;
3462		if (mddev->reshape_position == MaxSector)
3463			mddev->layout = n;
3464	}
3465	return len;
3466}
3467static struct md_sysfs_entry md_layout =
3468__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3469
3470static ssize_t
3471raid_disks_show(struct mddev *mddev, char *page)
3472{
3473	if (mddev->raid_disks == 0)
3474		return 0;
3475	if (mddev->reshape_position != MaxSector &&
3476	    mddev->delta_disks != 0)
3477		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3478			       mddev->raid_disks - mddev->delta_disks);
3479	return sprintf(page, "%d\n", mddev->raid_disks);
3480}
3481
3482static int update_raid_disks(struct mddev *mddev, int raid_disks);
3483
3484static ssize_t
3485raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3486{
3487	char *e;
3488	int rv = 0;
3489	unsigned long n = simple_strtoul(buf, &e, 10);
3490
3491	if (!*buf || (*e && *e != '\n'))
3492		return -EINVAL;
3493
3494	if (mddev->pers)
3495		rv = update_raid_disks(mddev, n);
3496	else if (mddev->reshape_position != MaxSector) {
3497		struct md_rdev *rdev;
3498		int olddisks = mddev->raid_disks - mddev->delta_disks;
3499
3500		rdev_for_each(rdev, mddev) {
3501			if (olddisks < n &&
3502			    rdev->data_offset < rdev->new_data_offset)
3503				return -EINVAL;
3504			if (olddisks > n &&
3505			    rdev->data_offset > rdev->new_data_offset)
3506				return -EINVAL;
3507		}
3508		mddev->delta_disks = n - olddisks;
3509		mddev->raid_disks = n;
3510		mddev->reshape_backwards = (mddev->delta_disks < 0);
3511	} else
3512		mddev->raid_disks = n;
3513	return rv ? rv : len;
3514}
3515static struct md_sysfs_entry md_raid_disks =
3516__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3517
3518static ssize_t
3519chunk_size_show(struct mddev *mddev, char *page)
3520{
3521	if (mddev->reshape_position != MaxSector &&
3522	    mddev->chunk_sectors != mddev->new_chunk_sectors)
3523		return sprintf(page, "%d (%d)\n",
3524			       mddev->new_chunk_sectors << 9,
3525			       mddev->chunk_sectors << 9);
3526	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3527}
3528
3529static ssize_t
3530chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3531{
3532	char *e;
3533	unsigned long n = simple_strtoul(buf, &e, 10);
3534
3535	if (!*buf || (*e && *e != '\n'))
3536		return -EINVAL;
3537
3538	if (mddev->pers) {
3539		int err;
3540		if (mddev->pers->check_reshape == NULL)
3541			return -EBUSY;
3542		if (mddev->ro)
3543			return -EROFS;
3544		mddev->new_chunk_sectors = n >> 9;
3545		err = mddev->pers->check_reshape(mddev);
3546		if (err) {
3547			mddev->new_chunk_sectors = mddev->chunk_sectors;
3548			return err;
3549		}
3550	} else {
3551		mddev->new_chunk_sectors = n >> 9;
3552		if (mddev->reshape_position == MaxSector)
3553			mddev->chunk_sectors = n >> 9;
3554	}
3555	return len;
3556}
3557static struct md_sysfs_entry md_chunk_size =
3558__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3559
3560static ssize_t
3561resync_start_show(struct mddev *mddev, char *page)
3562{
3563	if (mddev->recovery_cp == MaxSector)
3564		return sprintf(page, "none\n");
3565	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3566}
3567
3568static ssize_t
3569resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3570{
3571	char *e;
3572	unsigned long long n = simple_strtoull(buf, &e, 10);
3573
3574	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3575		return -EBUSY;
3576	if (cmd_match(buf, "none"))
3577		n = MaxSector;
3578	else if (!*buf || (*e && *e != '\n'))
3579		return -EINVAL;
3580
3581	mddev->recovery_cp = n;
3582	if (mddev->pers)
3583		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3584	return len;
3585}
3586static struct md_sysfs_entry md_resync_start =
3587__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3588
3589/*
3590 * The array state can be:
3591 *
3592 * clear
3593 *     No devices, no size, no level
3594 *     Equivalent to STOP_ARRAY ioctl
3595 * inactive
3596 *     May have some settings, but array is not active
3597 *        all IO results in error
3598 *     When written, doesn't tear down array, but just stops it
3599 * suspended (not supported yet)
3600 *     All IO requests will block. The array can be reconfigured.
3601 *     Writing this, if accepted, will block until array is quiescent
3602 * readonly
3603 *     no resync can happen.  no superblocks get written.
3604 *     write requests fail
3605 * read-auto
3606 *     like readonly, but behaves like 'clean' on a write request.
3607 *
3608 * clean - no pending writes, but otherwise active.
3609 *     When written to inactive array, starts without resync
3610 *     If a write request arrives then
3611 *       if metadata is known, mark 'dirty' and switch to 'active'.
3612 *       if not known, block and switch to write-pending
3613 *     If written to an active array that has pending writes, then fails.
3614 * active
3615 *     fully active: IO and resync can be happening.
3616 *     When written to inactive array, starts with resync
3617 *
3618 * write-pending
3619 *     clean, but writes are blocked waiting for 'active' to be written.
3620 *
3621 * active-idle
3622 *     like active, but no writes have been seen for a while (100msec).
3623 *
3624 */
3625enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3626		   write_pending, active_idle, bad_word};
3627static char *array_states[] = {
3628	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3629	"write-pending", "active-idle", NULL };
3630
3631static int match_word(const char *word, char **list)
3632{
3633	int n;
3634	for (n=0; list[n]; n++)
3635		if (cmd_match(word, list[n]))
3636			break;
3637	return n;
3638}
3639
3640static ssize_t
3641array_state_show(struct mddev *mddev, char *page)
3642{
3643	enum array_state st = inactive;
3644
3645	if (mddev->pers)
3646		switch(mddev->ro) {
3647		case 1:
3648			st = readonly;
3649			break;
3650		case 2:
3651			st = read_auto;
3652			break;
3653		case 0:
3654			if (mddev->in_sync)
3655				st = clean;
3656			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3657				st = write_pending;
3658			else if (mddev->safemode)
3659				st = active_idle;
3660			else
3661				st = active;
3662		}
3663	else {
3664		if (list_empty(&mddev->disks) &&
3665		    mddev->raid_disks == 0 &&
3666		    mddev->dev_sectors == 0)
3667			st = clear;
3668		else
3669			st = inactive;
3670	}
3671	return sprintf(page, "%s\n", array_states[st]);
3672}
3673
3674static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
3675static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
3676static int do_md_run(struct mddev *mddev);
3677static int restart_array(struct mddev *mddev);
3678
3679static ssize_t
3680array_state_store(struct mddev *mddev, const char *buf, size_t len)
3681{
3682	int err = -EINVAL;
3683	enum array_state st = match_word(buf, array_states);
3684	switch(st) {
3685	case bad_word:
3686		break;
3687	case clear:
3688		/* stopping an active array */
3689		err = do_md_stop(mddev, 0, NULL);
3690		break;
3691	case inactive:
3692		/* stopping an active array */
3693		if (mddev->pers)
3694			err = do_md_stop(mddev, 2, NULL);
3695		else
3696			err = 0; /* already inactive */
3697		break;
3698	case suspended:
3699		break; /* not supported yet */
3700	case readonly:
3701		if (mddev->pers)
3702			err = md_set_readonly(mddev, NULL);
3703		else {
3704			mddev->ro = 1;
3705			set_disk_ro(mddev->gendisk, 1);
3706			err = do_md_run(mddev);
3707		}
3708		break;
3709	case read_auto:
3710		if (mddev->pers) {
3711			if (mddev->ro == 0)
3712				err = md_set_readonly(mddev, NULL);
3713			else if (mddev->ro == 1)
3714				err = restart_array(mddev);
3715			if (err == 0) {
3716				mddev->ro = 2;
3717				set_disk_ro(mddev->gendisk, 0);
3718			}
3719		} else {
3720			mddev->ro = 2;
3721			err = do_md_run(mddev);
3722		}
3723		break;
3724	case clean:
3725		if (mddev->pers) {
3726			restart_array(mddev);
3727			spin_lock_irq(&mddev->write_lock);
3728			if (atomic_read(&mddev->writes_pending) == 0) {
3729				if (mddev->in_sync == 0) {
3730					mddev->in_sync = 1;
3731					if (mddev->safemode == 1)
3732						mddev->safemode = 0;
3733					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3734				}
3735				err = 0;
3736			} else
3737				err = -EBUSY;
3738			spin_unlock_irq(&mddev->write_lock);
3739		} else
3740			err = -EINVAL;
3741		break;
3742	case active:
3743		if (mddev->pers) {
3744			restart_array(mddev);
3745			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3746			wake_up(&mddev->sb_wait);
3747			err = 0;
3748		} else {
3749			mddev->ro = 0;
3750			set_disk_ro(mddev->gendisk, 0);
3751			err = do_md_run(mddev);
3752		}
3753		break;
3754	case write_pending:
3755	case active_idle:
3756		/* these cannot be set */
3757		break;
3758	}
3759	if (err)
3760		return err;
3761	else {
3762		if (mddev->hold_active == UNTIL_IOCTL)
3763			mddev->hold_active = 0;
3764		sysfs_notify_dirent_safe(mddev->sysfs_state);
3765		return len;
3766	}
3767}
3768static struct md_sysfs_entry md_array_state =
3769__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3770
3771static ssize_t
3772max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3773	return sprintf(page, "%d\n",
3774		       atomic_read(&mddev->max_corr_read_errors));
3775}
3776
3777static ssize_t
3778max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3779{
3780	char *e;
3781	unsigned long n = simple_strtoul(buf, &e, 10);
3782
3783	if (*buf && (*e == 0 || *e == '\n')) {
3784		atomic_set(&mddev->max_corr_read_errors, n);
3785		return len;
3786	}
3787	return -EINVAL;
3788}
3789
3790static struct md_sysfs_entry max_corr_read_errors =
3791__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3792	max_corrected_read_errors_store);
3793
3794static ssize_t
3795null_show(struct mddev *mddev, char *page)
3796{
3797	return -EINVAL;
3798}
3799
3800static ssize_t
3801new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3802{
3803	/* buf must be %d:%d\n? giving major and minor numbers */
3804	/* The new device is added to the array.
3805	 * If the array has a persistent superblock, we read the
3806	 * superblock to initialise info and check validity.
3807	 * Otherwise, only checking done is that in bind_rdev_to_array,
3808	 * which mainly checks size.
3809	 */
3810	char *e;
3811	int major = simple_strtoul(buf, &e, 10);
3812	int minor;
3813	dev_t dev;
3814	struct md_rdev *rdev;
3815	int err;
3816
3817	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3818		return -EINVAL;
3819	minor = simple_strtoul(e+1, &e, 10);
3820	if (*e && *e != '\n')
3821		return -EINVAL;
3822	dev = MKDEV(major, minor);
3823	if (major != MAJOR(dev) ||
3824	    minor != MINOR(dev))
3825		return -EOVERFLOW;
3826
3827	if (mddev->persistent) {
3828		rdev = md_import_device(dev, mddev->major_version,
3829					mddev->minor_version);
3830		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3831			struct md_rdev *rdev0
3832				= list_entry(mddev->disks.next,
3833					     struct md_rdev, same_set);
3834			err = super_types[mddev->major_version]
3835				.load_super(rdev, rdev0, mddev->minor_version);
3836			if (err < 0)
3837				goto out;
3838		}
3839	} else if (mddev->external)
3840		rdev = md_import_device(dev, -2, -1);
3841	else
3842		rdev = md_import_device(dev, -1, -1);
3843
3844	if (IS_ERR(rdev))
3845		return PTR_ERR(rdev);
3846	err = bind_rdev_to_array(rdev, mddev);
3847 out:
3848	if (err)
3849		export_rdev(rdev);
3850	return err ? err : len;
3851}
3852
3853static struct md_sysfs_entry md_new_device =
3854__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3855
3856static ssize_t
3857bitmap_store(struct mddev *mddev, const char *buf, size_t len)
3858{
3859	char *end;
3860	unsigned long chunk, end_chunk;
3861
3862	if (!mddev->bitmap)
3863		goto out;
3864	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3865	while (*buf) {
3866		chunk = end_chunk = simple_strtoul(buf, &end, 0);
3867		if (buf == end) break;
3868		if (*end == '-') { /* range */
3869			buf = end + 1;
3870			end_chunk = simple_strtoul(buf, &end, 0);
3871			if (buf == end) break;
3872		}
3873		if (*end && !isspace(*end)) break;
3874		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3875		buf = skip_spaces(end);
3876	}
3877	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3878out:
3879	return len;
3880}
3881
3882static struct md_sysfs_entry md_bitmap =
3883__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3884
3885static ssize_t
3886size_show(struct mddev *mddev, char *page)
3887{
3888	return sprintf(page, "%llu\n",
3889		(unsigned long long)mddev->dev_sectors / 2);
3890}
3891
3892static int update_size(struct mddev *mddev, sector_t num_sectors);
3893
3894static ssize_t
3895size_store(struct mddev *mddev, const char *buf, size_t len)
3896{
3897	/* If array is inactive, we can reduce the component size, but
3898	 * not increase it (except from 0).
3899	 * If array is active, we can try an on-line resize
3900	 */
3901	sector_t sectors;
3902	int err = strict_blocks_to_sectors(buf, &sectors);
3903
3904	if (err < 0)
3905		return err;
3906	if (mddev->pers) {
3907		err = update_size(mddev, sectors);
3908		md_update_sb(mddev, 1);
3909	} else {
3910		if (mddev->dev_sectors == 0 ||
3911		    mddev->dev_sectors > sectors)
3912			mddev->dev_sectors = sectors;
3913		else
3914			err = -ENOSPC;
3915	}
3916	return err ? err : len;
3917}
3918
3919static struct md_sysfs_entry md_size =
3920__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3921
3922/* Metadata version.
3923 * This is one of
3924 *   'none' for arrays with no metadata (good luck...)
3925 *   'external' for arrays with externally managed metadata,
3926 * or N.M for internally known formats
3927 */
3928static ssize_t
3929metadata_show(struct mddev *mddev, char *page)
3930{
3931	if (mddev->persistent)
3932		return sprintf(page, "%d.%d\n",
3933			       mddev->major_version, mddev->minor_version);
3934	else if (mddev->external)
3935		return sprintf(page, "external:%s\n", mddev->metadata_type);
3936	else
3937		return sprintf(page, "none\n");
3938}
3939
3940static ssize_t
3941metadata_store(struct mddev *mddev, const char *buf, size_t len)
3942{
3943	int major, minor;
3944	char *e;
3945	/* Changing the details of 'external' metadata is
3946	 * always permitted.  Otherwise there must be
3947	 * no devices attached to the array.
3948	 */
3949	if (mddev->external && strncmp(buf, "external:", 9) == 0)
3950		;
3951	else if (!list_empty(&mddev->disks))
3952		return -EBUSY;
3953
3954	if (cmd_match(buf, "none")) {
3955		mddev->persistent = 0;
3956		mddev->external = 0;
3957		mddev->major_version = 0;
3958		mddev->minor_version = 90;
3959		return len;
3960	}
3961	if (strncmp(buf, "external:", 9) == 0) {
3962		size_t namelen = len-9;
3963		if (namelen >= sizeof(mddev->metadata_type))
3964			namelen = sizeof(mddev->metadata_type)-1;
3965		strncpy(mddev->metadata_type, buf+9, namelen);
3966		mddev->metadata_type[namelen] = 0;
3967		if (namelen && mddev->metadata_type[namelen-1] == '\n')
3968			mddev->metadata_type[--namelen] = 0;
3969		mddev->persistent = 0;
3970		mddev->external = 1;
3971		mddev->major_version = 0;
3972		mddev->minor_version = 90;
3973		return len;
3974	}
3975	major = simple_strtoul(buf, &e, 10);
3976	if (e==buf || *e != '.')
3977		return -EINVAL;
3978	buf = e+1;
3979	minor = simple_strtoul(buf, &e, 10);
3980	if (e==buf || (*e && *e != '\n') )
3981		return -EINVAL;
3982	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3983		return -ENOENT;
3984	mddev->major_version = major;
3985	mddev->minor_version = minor;
3986	mddev->persistent = 1;
3987	mddev->external = 0;
3988	return len;
3989}
3990
3991static struct md_sysfs_entry md_metadata =
3992__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3993
3994static ssize_t
3995action_show(struct mddev *mddev, char *page)
3996{
3997	char *type = "idle";
3998	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3999		type = "frozen";
4000	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4001	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
4002		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4003			type = "reshape";
4004		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4005			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4006				type = "resync";
4007			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4008				type = "check";
4009			else
4010				type = "repair";
4011		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4012			type = "recover";
4013	}
4014	return sprintf(page, "%s\n", type);
4015}
4016
4017static ssize_t
4018action_store(struct mddev *mddev, const char *page, size_t len)
4019{
4020	if (!mddev->pers || !mddev->pers->sync_request)
4021		return -EINVAL;
4022
4023	if (cmd_match(page, "frozen"))
4024		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4025	else
4026		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4027
4028	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4029		if (mddev->sync_thread) {
4030			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4031			md_reap_sync_thread(mddev);
4032		}
4033	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4034		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4035		return -EBUSY;
4036	else if (cmd_match(page, "resync"))
4037		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4038	else if (cmd_match(page, "recover")) {
4039		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4040		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4041	} else if (cmd_match(page, "reshape")) {
4042		int err;
4043		if (mddev->pers->start_reshape == NULL)
4044			return -EINVAL;
4045		err = mddev->pers->start_reshape(mddev);
4046		if (err)
4047			return err;
4048		sysfs_notify(&mddev->kobj, NULL, "degraded");
4049	} else {
4050		if (cmd_match(page, "check"))
4051			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4052		else if (!cmd_match(page, "repair"))
4053			return -EINVAL;
4054		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4055		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4056	}
4057	if (mddev->ro == 2) {
4058		/* A write to sync_action is enough to justify
4059		 * canceling read-auto mode
4060		 */
4061		mddev->ro = 0;
4062		md_wakeup_thread(mddev->sync_thread);
4063	}
4064	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4065	md_wakeup_thread(mddev->thread);
4066	sysfs_notify_dirent_safe(mddev->sysfs_action);
4067	return len;
4068}
4069
4070static struct md_sysfs_entry md_scan_mode =
4071__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4072
4073static ssize_t
4074last_sync_action_show(struct mddev *mddev, char *page)
4075{
4076	return sprintf(page, "%s\n", mddev->last_sync_action);
4077}
4078
4079static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4080
4081static ssize_t
4082mismatch_cnt_show(struct mddev *mddev, char *page)
4083{
4084	return sprintf(page, "%llu\n",
4085		       (unsigned long long)
4086		       atomic64_read(&mddev->resync_mismatches));
4087}
4088
4089static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4090
4091static ssize_t
4092sync_min_show(struct mddev *mddev, char *page)
4093{
4094	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4095		       mddev->sync_speed_min ? "local": "system");
4096}
4097
4098static ssize_t
4099sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4100{
4101	int min;
4102	char *e;
4103	if (strncmp(buf, "system", 6)==0) {
4104		mddev->sync_speed_min = 0;
4105		return len;
4106	}
4107	min = simple_strtoul(buf, &e, 10);
4108	if (buf == e || (*e && *e != '\n') || min <= 0)
4109		return -EINVAL;
4110	mddev->sync_speed_min = min;
4111	return len;
4112}
4113
4114static struct md_sysfs_entry md_sync_min =
4115__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4116
4117static ssize_t
4118sync_max_show(struct mddev *mddev, char *page)
4119{
4120	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4121		       mddev->sync_speed_max ? "local": "system");
4122}
4123
4124static ssize_t
4125sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4126{
4127	int max;
4128	char *e;
4129	if (strncmp(buf, "system", 6)==0) {
4130		mddev->sync_speed_max = 0;
4131		return len;
4132	}
4133	max = simple_strtoul(buf, &e, 10);
4134	if (buf == e || (*e && *e != '\n') || max <= 0)
4135		return -EINVAL;
4136	mddev->sync_speed_max = max;
4137	return len;
4138}
4139
4140static struct md_sysfs_entry md_sync_max =
4141__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4142
4143static ssize_t
4144degraded_show(struct mddev *mddev, char *page)
4145{
4146	return sprintf(page, "%d\n", mddev->degraded);
4147}
4148static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4149
4150static ssize_t
4151sync_force_parallel_show(struct mddev *mddev, char *page)
4152{
4153	return sprintf(page, "%d\n", mddev->parallel_resync);
4154}
4155
4156static ssize_t
4157sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4158{
4159	long n;
4160
4161	if (kstrtol(buf, 10, &n))
4162		return -EINVAL;
4163
4164	if (n != 0 && n != 1)
4165		return -EINVAL;
4166
4167	mddev->parallel_resync = n;
4168
4169	if (mddev->sync_thread)
4170		wake_up(&resync_wait);
4171
4172	return len;
4173}
4174
4175/* force parallel resync, even with shared block devices */
4176static struct md_sysfs_entry md_sync_force_parallel =
4177__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4178       sync_force_parallel_show, sync_force_parallel_store);
4179
4180static ssize_t
4181sync_speed_show(struct mddev *mddev, char *page)
4182{
4183	unsigned long resync, dt, db;
4184	if (mddev->curr_resync == 0)
4185		return sprintf(page, "none\n");
4186	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4187	dt = (jiffies - mddev->resync_mark) / HZ;
4188	if (!dt) dt++;
4189	db = resync - mddev->resync_mark_cnt;
4190	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4191}
4192
4193static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4194
4195static ssize_t
4196sync_completed_show(struct mddev *mddev, char *page)
4197{
4198	unsigned long long max_sectors, resync;
4199
4200	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4201		return sprintf(page, "none\n");
4202
4203	if (mddev->curr_resync == 1 ||
4204	    mddev->curr_resync == 2)
4205		return sprintf(page, "delayed\n");
4206
4207	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4208	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4209		max_sectors = mddev->resync_max_sectors;
4210	else
4211		max_sectors = mddev->dev_sectors;
4212
4213	resync = mddev->curr_resync_completed;
4214	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4215}
4216
4217static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4218
4219static ssize_t
4220min_sync_show(struct mddev *mddev, char *page)
4221{
4222	return sprintf(page, "%llu\n",
4223		       (unsigned long long)mddev->resync_min);
4224}
4225static ssize_t
4226min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4227{
4228	unsigned long long min;
4229	if (kstrtoull(buf, 10, &min))
4230		return -EINVAL;
4231	if (min > mddev->resync_max)
4232		return -EINVAL;
4233	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4234		return -EBUSY;
4235
4236	/* Must be a multiple of chunk_size */
4237	if (mddev->chunk_sectors) {
4238		sector_t temp = min;
4239		if (sector_div(temp, mddev->chunk_sectors))
4240			return -EINVAL;
4241	}
4242	mddev->resync_min = min;
4243
4244	return len;
4245}
4246
4247static struct md_sysfs_entry md_min_sync =
4248__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4249
4250static ssize_t
4251max_sync_show(struct mddev *mddev, char *page)
4252{
4253	if (mddev->resync_max == MaxSector)
4254		return sprintf(page, "max\n");
4255	else
4256		return sprintf(page, "%llu\n",
4257			       (unsigned long long)mddev->resync_max);
4258}
4259static ssize_t
4260max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4261{
4262	if (strncmp(buf, "max", 3) == 0)
4263		mddev->resync_max = MaxSector;
4264	else {
4265		unsigned long long max;
4266		if (kstrtoull(buf, 10, &max))
4267			return -EINVAL;
4268		if (max < mddev->resync_min)
4269			return -EINVAL;
4270		if (max < mddev->resync_max &&
4271		    mddev->ro == 0 &&
4272		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4273			return -EBUSY;
4274
4275		/* Must be a multiple of chunk_size */
4276		if (mddev->chunk_sectors) {
4277			sector_t temp = max;
4278			if (sector_div(temp, mddev->chunk_sectors))
4279				return -EINVAL;
4280		}
4281		mddev->resync_max = max;
4282	}
4283	wake_up(&mddev->recovery_wait);
4284	return len;
4285}
4286
4287static struct md_sysfs_entry md_max_sync =
4288__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4289
4290static ssize_t
4291suspend_lo_show(struct mddev *mddev, char *page)
4292{
4293	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4294}
4295
4296static ssize_t
4297suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4298{
4299	char *e;
4300	unsigned long long new = simple_strtoull(buf, &e, 10);
4301	unsigned long long old = mddev->suspend_lo;
4302
4303	if (mddev->pers == NULL ||
4304	    mddev->pers->quiesce == NULL)
4305		return -EINVAL;
4306	if (buf == e || (*e && *e != '\n'))
4307		return -EINVAL;
4308
4309	mddev->suspend_lo = new;
4310	if (new >= old)
4311		/* Shrinking suspended region */
4312		mddev->pers->quiesce(mddev, 2);
4313	else {
4314		/* Expanding suspended region - need to wait */
4315		mddev->pers->quiesce(mddev, 1);
4316		mddev->pers->quiesce(mddev, 0);
4317	}
4318	return len;
4319}
4320static struct md_sysfs_entry md_suspend_lo =
4321__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4322
4323static ssize_t
4324suspend_hi_show(struct mddev *mddev, char *page)
4325{
4326	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4327}
4328
4329static ssize_t
4330suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4331{
4332	char *e;
4333	unsigned long long new = simple_strtoull(buf, &e, 10);
4334	unsigned long long old = mddev->suspend_hi;
4335
4336	if (mddev->pers == NULL ||
4337	    mddev->pers->quiesce == NULL)
4338		return -EINVAL;
4339	if (buf == e || (*e && *e != '\n'))
4340		return -EINVAL;
4341
4342	mddev->suspend_hi = new;
4343	if (new <= old)
4344		/* Shrinking suspended region */
4345		mddev->pers->quiesce(mddev, 2);
4346	else {
4347		/* Expanding suspended region - need to wait */
4348		mddev->pers->quiesce(mddev, 1);
4349		mddev->pers->quiesce(mddev, 0);
4350	}
4351	return len;
4352}
4353static struct md_sysfs_entry md_suspend_hi =
4354__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4355
4356static ssize_t
4357reshape_position_show(struct mddev *mddev, char *page)
4358{
4359	if (mddev->reshape_position != MaxSector)
4360		return sprintf(page, "%llu\n",
4361			       (unsigned long long)mddev->reshape_position);
4362	strcpy(page, "none\n");
4363	return 5;
4364}
4365
4366static ssize_t
4367reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4368{
4369	struct md_rdev *rdev;
4370	char *e;
4371	unsigned long long new = simple_strtoull(buf, &e, 10);
4372	if (mddev->pers)
4373		return -EBUSY;
4374	if (buf == e || (*e && *e != '\n'))
4375		return -EINVAL;
4376	mddev->reshape_position = new;
4377	mddev->delta_disks = 0;
4378	mddev->reshape_backwards = 0;
4379	mddev->new_level = mddev->level;
4380	mddev->new_layout = mddev->layout;
4381	mddev->new_chunk_sectors = mddev->chunk_sectors;
4382	rdev_for_each(rdev, mddev)
4383		rdev->new_data_offset = rdev->data_offset;
4384	return len;
4385}
4386
4387static struct md_sysfs_entry md_reshape_position =
4388__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4389       reshape_position_store);
4390
4391static ssize_t
4392reshape_direction_show(struct mddev *mddev, char *page)
4393{
4394	return sprintf(page, "%s\n",
4395		       mddev->reshape_backwards ? "backwards" : "forwards");
4396}
4397
4398static ssize_t
4399reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4400{
4401	int backwards = 0;
4402	if (cmd_match(buf, "forwards"))
4403		backwards = 0;
4404	else if (cmd_match(buf, "backwards"))
4405		backwards = 1;
4406	else
4407		return -EINVAL;
4408	if (mddev->reshape_backwards == backwards)
4409		return len;
4410
4411	/* check if we are allowed to change */
4412	if (mddev->delta_disks)
4413		return -EBUSY;
4414
4415	if (mddev->persistent &&
4416	    mddev->major_version == 0)
4417		return -EINVAL;
4418
4419	mddev->reshape_backwards = backwards;
4420	return len;
4421}
4422
4423static struct md_sysfs_entry md_reshape_direction =
4424__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4425       reshape_direction_store);
4426
4427static ssize_t
4428array_size_show(struct mddev *mddev, char *page)
4429{
4430	if (mddev->external_size)
4431		return sprintf(page, "%llu\n",
4432			       (unsigned long long)mddev->array_sectors/2);
4433	else
4434		return sprintf(page, "default\n");
4435}
4436
4437static ssize_t
4438array_size_store(struct mddev *mddev, const char *buf, size_t len)
4439{
4440	sector_t sectors;
4441
4442	if (strncmp(buf, "default", 7) == 0) {
4443		if (mddev->pers)
4444			sectors = mddev->pers->size(mddev, 0, 0);
4445		else
4446			sectors = mddev->array_sectors;
4447
4448		mddev->external_size = 0;
4449	} else {
4450		if (strict_blocks_to_sectors(buf, &sectors) < 0)
4451			return -EINVAL;
4452		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4453			return -E2BIG;
4454
4455		mddev->external_size = 1;
4456	}
4457
4458	mddev->array_sectors = sectors;
4459	if (mddev->pers) {
4460		set_capacity(mddev->gendisk, mddev->array_sectors);
4461		revalidate_disk(mddev->gendisk);
4462	}
4463	return len;
4464}
4465
4466static struct md_sysfs_entry md_array_size =
4467__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4468       array_size_store);
4469
4470static struct attribute *md_default_attrs[] = {
4471	&md_level.attr,
4472	&md_layout.attr,
4473	&md_raid_disks.attr,
4474	&md_chunk_size.attr,
4475	&md_size.attr,
4476	&md_resync_start.attr,
4477	&md_metadata.attr,
4478	&md_new_device.attr,
4479	&md_safe_delay.attr,
4480	&md_array_state.attr,
4481	&md_reshape_position.attr,
4482	&md_reshape_direction.attr,
4483	&md_array_size.attr,
4484	&max_corr_read_errors.attr,
4485	NULL,
4486};
4487
4488static struct attribute *md_redundancy_attrs[] = {
4489	&md_scan_mode.attr,
4490	&md_last_scan_mode.attr,
4491	&md_mismatches.attr,
4492	&md_sync_min.attr,
4493	&md_sync_max.attr,
4494	&md_sync_speed.attr,
4495	&md_sync_force_parallel.attr,
4496	&md_sync_completed.attr,
4497	&md_min_sync.attr,
4498	&md_max_sync.attr,
4499	&md_suspend_lo.attr,
4500	&md_suspend_hi.attr,
4501	&md_bitmap.attr,
4502	&md_degraded.attr,
4503	NULL,
4504};
4505static struct attribute_group md_redundancy_group = {
4506	.name = NULL,
4507	.attrs = md_redundancy_attrs,
4508};
4509
4510static ssize_t
4511md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4512{
4513	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4514	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4515	ssize_t rv;
4516
4517	if (!entry->show)
4518		return -EIO;
4519	spin_lock(&all_mddevs_lock);
4520	if (list_empty(&mddev->all_mddevs)) {
4521		spin_unlock(&all_mddevs_lock);
4522		return -EBUSY;
4523	}
4524	mddev_get(mddev);
4525	spin_unlock(&all_mddevs_lock);
4526
4527	rv = mddev_lock(mddev);
4528	if (!rv) {
4529		rv = entry->show(mddev, page);
4530		mddev_unlock(mddev);
4531	}
4532	mddev_put(mddev);
4533	return rv;
4534}
4535
4536static ssize_t
4537md_attr_store(struct kobject *kobj, struct attribute *attr,
4538	      const char *page, size_t length)
4539{
4540	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4541	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4542	ssize_t rv;
4543
4544	if (!entry->store)
4545		return -EIO;
4546	if (!capable(CAP_SYS_ADMIN))
4547		return -EACCES;
4548	spin_lock(&all_mddevs_lock);
4549	if (list_empty(&mddev->all_mddevs)) {
4550		spin_unlock(&all_mddevs_lock);
4551		return -EBUSY;
4552	}
4553	mddev_get(mddev);
4554	spin_unlock(&all_mddevs_lock);
4555	if (entry->store == new_dev_store)
4556		flush_workqueue(md_misc_wq);
4557	rv = mddev_lock(mddev);
4558	if (!rv) {
4559		rv = entry->store(mddev, page, length);
4560		mddev_unlock(mddev);
4561	}
4562	mddev_put(mddev);
4563	return rv;
4564}
4565
4566static void md_free(struct kobject *ko)
4567{
4568	struct mddev *mddev = container_of(ko, struct mddev, kobj);
4569
4570	if (mddev->sysfs_state)
4571		sysfs_put(mddev->sysfs_state);
4572
4573	if (mddev->gendisk) {
4574		del_gendisk(mddev->gendisk);
4575		put_disk(mddev->gendisk);
4576	}
4577	if (mddev->queue)
4578		blk_cleanup_queue(mddev->queue);
4579
4580	kfree(mddev);
4581}
4582
4583static const struct sysfs_ops md_sysfs_ops = {
4584	.show	= md_attr_show,
4585	.store	= md_attr_store,
4586};
4587static struct kobj_type md_ktype = {
4588	.release	= md_free,
4589	.sysfs_ops	= &md_sysfs_ops,
4590	.default_attrs	= md_default_attrs,
4591};
4592
4593int mdp_major = 0;
4594
4595static void mddev_delayed_delete(struct work_struct *ws)
4596{
4597	struct mddev *mddev = container_of(ws, struct mddev, del_work);
4598
4599	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4600	kobject_del(&mddev->kobj);
4601	kobject_put(&mddev->kobj);
4602}
4603
4604static int md_alloc(dev_t dev, char *name)
4605{
4606	static DEFINE_MUTEX(disks_mutex);
4607	struct mddev *mddev = mddev_find(dev);
4608	struct gendisk *disk;
4609	int partitioned;
4610	int shift;
4611	int unit;
4612	int error;
4613
4614	if (!mddev)
4615		return -ENODEV;
4616
4617	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4618	shift = partitioned ? MdpMinorShift : 0;
4619	unit = MINOR(mddev->unit) >> shift;
4620
4621	/* wait for any previous instance of this device to be
4622	 * completely removed (mddev_delayed_delete).
4623	 */
4624	flush_workqueue(md_misc_wq);
4625
4626	mutex_lock(&disks_mutex);
4627	error = -EEXIST;
4628	if (mddev->gendisk)
4629		goto abort;
4630
4631	if (name) {
4632		/* Need to ensure that 'name' is not a duplicate.
4633		 */
4634		struct mddev *mddev2;
4635		spin_lock(&all_mddevs_lock);
4636
4637		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4638			if (mddev2->gendisk &&
4639			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
4640				spin_unlock(&all_mddevs_lock);
4641				goto abort;
4642			}
4643		spin_unlock(&all_mddevs_lock);
4644	}
4645
4646	error = -ENOMEM;
4647	mddev->queue = blk_alloc_queue(GFP_KERNEL);
4648	if (!mddev->queue)
4649		goto abort;
4650	mddev->queue->queuedata = mddev;
4651
4652	blk_queue_make_request(mddev->queue, md_make_request);
4653	blk_set_stacking_limits(&mddev->queue->limits);
4654
4655	disk = alloc_disk(1 << shift);
4656	if (!disk) {
4657		blk_cleanup_queue(mddev->queue);
4658		mddev->queue = NULL;
4659		goto abort;
4660	}
4661	disk->major = MAJOR(mddev->unit);
4662	disk->first_minor = unit << shift;
4663	if (name)
4664		strcpy(disk->disk_name, name);
4665	else if (partitioned)
4666		sprintf(disk->disk_name, "md_d%d", unit);
4667	else
4668		sprintf(disk->disk_name, "md%d", unit);
4669	disk->fops = &md_fops;
4670	disk->private_data = mddev;
4671	disk->queue = mddev->queue;
4672	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4673	/* Allow extended partitions.  This makes the
4674	 * 'mdp' device redundant, but we can't really
4675	 * remove it now.
4676	 */
4677	disk->flags |= GENHD_FL_EXT_DEVT;
4678	mddev->gendisk = disk;
4679	/* As soon as we call add_disk(), another thread could get
4680	 * through to md_open, so make sure it doesn't get too far
4681	 */
4682	mutex_lock(&mddev->open_mutex);
4683	add_disk(disk);
4684
4685	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4686				     &disk_to_dev(disk)->kobj, "%s", "md");
4687	if (error) {
4688		/* This isn't possible, but as kobject_init_and_add is marked
4689		 * __must_check, we must do something with the result
4690		 */
4691		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4692		       disk->disk_name);
4693		error = 0;
4694	}
4695	if (mddev->kobj.sd &&
4696	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4697		printk(KERN_DEBUG "pointless warning\n");
4698	mutex_unlock(&mddev->open_mutex);
4699 abort:
4700	mutex_unlock(&disks_mutex);
4701	if (!error && mddev->kobj.sd) {
4702		kobject_uevent(&mddev->kobj, KOBJ_ADD);
4703		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4704	}
4705	mddev_put(mddev);
4706	return error;
4707}
4708
4709static struct kobject *md_probe(dev_t dev, int *part, void *data)
4710{
4711	md_alloc(dev, NULL);
4712	return NULL;
4713}
4714
4715static int add_named_array(const char *val, struct kernel_param *kp)
4716{
4717	/* val must be "md_*" where * is not all digits.
4718	 * We allocate an array with a large free minor number, and
4719	 * set the name to val.  val must not already be an active name.
4720	 */
4721	int len = strlen(val);
4722	char buf[DISK_NAME_LEN];
4723
4724	while (len && val[len-1] == '\n')
4725		len--;
4726	if (len >= DISK_NAME_LEN)
4727		return -E2BIG;
4728	strlcpy(buf, val, len+1);
4729	if (strncmp(buf, "md_", 3) != 0)
4730		return -EINVAL;
4731	return md_alloc(0, buf);
4732}
4733
4734static void md_safemode_timeout(unsigned long data)
4735{
4736	struct mddev *mddev = (struct mddev *) data;
4737
4738	if (!atomic_read(&mddev->writes_pending)) {
4739		mddev->safemode = 1;
4740		if (mddev->external)
4741			sysfs_notify_dirent_safe(mddev->sysfs_state);
4742	}
4743	md_wakeup_thread(mddev->thread);
4744}
4745
4746static int start_dirty_degraded;
4747
4748int md_run(struct mddev *mddev)
4749{
4750	int err;
4751	struct md_rdev *rdev;
4752	struct md_personality *pers;
4753
4754	if (list_empty(&mddev->disks))
4755		/* cannot run an array with no devices.. */
4756		return -EINVAL;
4757
4758	if (mddev->pers)
4759		return -EBUSY;
4760	/* Cannot run until previous stop completes properly */
4761	if (mddev->sysfs_active)
4762		return -EBUSY;
4763
4764	/*
4765	 * Analyze all RAID superblock(s)
4766	 */
4767	if (!mddev->raid_disks) {
4768		if (!mddev->persistent)
4769			return -EINVAL;
4770		analyze_sbs(mddev);
4771	}
4772
4773	if (mddev->level != LEVEL_NONE)
4774		request_module("md-level-%d", mddev->level);
4775	else if (mddev->clevel[0])
4776		request_module("md-%s", mddev->clevel);
4777
4778	/*
4779	 * Drop all container device buffers, from now on
4780	 * the only valid external interface is through the md
4781	 * device.
4782	 */
4783	rdev_for_each(rdev, mddev) {
4784		if (test_bit(Faulty, &rdev->flags))
4785			continue;
4786		sync_blockdev(rdev->bdev);
4787		invalidate_bdev(rdev->bdev);
4788
4789		/* perform some consistency tests on the device.
4790		 * We don't want the data to overlap the metadata,
4791		 * Internal Bitmap issues have been handled elsewhere.
4792		 */
4793		if (rdev->meta_bdev) {
4794			/* Nothing to check */;
4795		} else if (rdev->data_offset < rdev->sb_start) {
4796			if (mddev->dev_sectors &&
4797			    rdev->data_offset + mddev->dev_sectors
4798			    > rdev->sb_start) {
4799				printk("md: %s: data overlaps metadata\n",
4800				       mdname(mddev));
4801				return -EINVAL;
4802			}
4803		} else {
4804			if (rdev->sb_start + rdev->sb_size/512
4805			    > rdev->data_offset) {
4806				printk("md: %s: metadata overlaps data\n",
4807				       mdname(mddev));
4808				return -EINVAL;
4809			}
4810		}
4811		sysfs_notify_dirent_safe(rdev->sysfs_state);
4812	}
4813
4814	if (mddev->bio_set == NULL)
4815		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
4816
4817	spin_lock(&pers_lock);
4818	pers = find_pers(mddev->level, mddev->clevel);
4819	if (!pers || !try_module_get(pers->owner)) {
4820		spin_unlock(&pers_lock);
4821		if (mddev->level != LEVEL_NONE)
4822			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4823			       mddev->level);
4824		else
4825			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4826			       mddev->clevel);
4827		return -EINVAL;
4828	}
4829	mddev->pers = pers;
4830	spin_unlock(&pers_lock);
4831	if (mddev->level != pers->level) {
4832		mddev->level = pers->level;
4833		mddev->new_level = pers->level;
4834	}
4835	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4836
4837	if (mddev->reshape_position != MaxSector &&
4838	    pers->start_reshape == NULL) {
4839		/* This personality cannot handle reshaping... */
4840		mddev->pers = NULL;
4841		module_put(pers->owner);
4842		return -EINVAL;
4843	}
4844
4845	if (pers->sync_request) {
4846		/* Warn if this is a potentially silly
4847		 * configuration.
4848		 */
4849		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4850		struct md_rdev *rdev2;
4851		int warned = 0;
4852
4853		rdev_for_each(rdev, mddev)
4854			rdev_for_each(rdev2, mddev) {
4855				if (rdev < rdev2 &&
4856				    rdev->bdev->bd_contains ==
4857				    rdev2->bdev->bd_contains) {
4858					printk(KERN_WARNING
4859					       "%s: WARNING: %s appears to be"
4860					       " on the same physical disk as"
4861					       " %s.\n",
4862					       mdname(mddev),
4863					       bdevname(rdev->bdev,b),
4864					       bdevname(rdev2->bdev,b2));
4865					warned = 1;
4866				}
4867			}
4868
4869		if (warned)
4870			printk(KERN_WARNING
4871			       "True protection against single-disk"
4872			       " failure might be compromised.\n");
4873	}
4874
4875	mddev->recovery = 0;
4876	/* may be over-ridden by personality */
4877	mddev->resync_max_sectors = mddev->dev_sectors;
4878
4879	mddev->ok_start_degraded = start_dirty_degraded;
4880
4881	if (start_readonly && mddev->ro == 0)
4882		mddev->ro = 2; /* read-only, but switch on first write */
4883
4884	err = mddev->pers->run(mddev);
4885	if (err)
4886		printk(KERN_ERR "md: pers->run() failed ...\n");
4887	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4888		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4889			  " but 'external_size' not in effect?\n", __func__);
4890		printk(KERN_ERR
4891		       "md: invalid array_size %llu > default size %llu\n",
4892		       (unsigned long long)mddev->array_sectors / 2,
4893		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4894		err = -EINVAL;
4895		mddev->pers->stop(mddev);
4896	}
4897	if (err == 0 && mddev->pers->sync_request &&
4898	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
4899		err = bitmap_create(mddev);
4900		if (err) {
4901			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4902			       mdname(mddev), err);
4903			mddev->pers->stop(mddev);
4904		}
4905	}
4906	if (err) {
4907		module_put(mddev->pers->owner);
4908		mddev->pers = NULL;
4909		bitmap_destroy(mddev);
4910		return err;
4911	}
4912	if (mddev->pers->sync_request) {
4913		if (mddev->kobj.sd &&
4914		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4915			printk(KERN_WARNING
4916			       "md: cannot register extra attributes for %s\n",
4917			       mdname(mddev));
4918		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
4919	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
4920		mddev->ro = 0;
4921
4922	atomic_set(&mddev->writes_pending,0);
4923	atomic_set(&mddev->max_corr_read_errors,
4924		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4925	mddev->safemode = 0;
4926	mddev->safemode_timer.function = md_safemode_timeout;
4927	mddev->safemode_timer.data = (unsigned long) mddev;
4928	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4929	mddev->in_sync = 1;
4930	smp_wmb();
4931	mddev->ready = 1;
4932	rdev_for_each(rdev, mddev)
4933		if (rdev->raid_disk >= 0)
4934			if (sysfs_link_rdev(mddev, rdev))
4935				/* failure here is OK */;
4936
4937	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4938
4939	if (mddev->flags & MD_UPDATE_SB_FLAGS)
4940		md_update_sb(mddev, 0);
4941
4942	md_new_event(mddev);
4943	sysfs_notify_dirent_safe(mddev->sysfs_state);
4944	sysfs_notify_dirent_safe(mddev->sysfs_action);
4945	sysfs_notify(&mddev->kobj, NULL, "degraded");
4946	return 0;
4947}
4948EXPORT_SYMBOL_GPL(md_run);
4949
4950static int do_md_run(struct mddev *mddev)
4951{
4952	int err;
4953
4954	err = md_run(mddev);
4955	if (err)
4956		goto out;
4957	err = bitmap_load(mddev);
4958	if (err) {
4959		bitmap_destroy(mddev);
4960		goto out;
4961	}
4962
4963	md_wakeup_thread(mddev->thread);
4964	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4965
4966	set_capacity(mddev->gendisk, mddev->array_sectors);
4967	revalidate_disk(mddev->gendisk);
4968	mddev->changed = 1;
4969	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4970out:
4971	return err;
4972}
4973
4974static int restart_array(struct mddev *mddev)
4975{
4976	struct gendisk *disk = mddev->gendisk;
4977
4978	/* Complain if it has no devices */
4979	if (list_empty(&mddev->disks))
4980		return -ENXIO;
4981	if (!mddev->pers)
4982		return -EINVAL;
4983	if (!mddev->ro)
4984		return -EBUSY;
4985	mddev->safemode = 0;
4986	mddev->ro = 0;
4987	set_disk_ro(disk, 0);
4988	printk(KERN_INFO "md: %s switched to read-write mode.\n",
4989		mdname(mddev));
4990	/* Kick recovery or resync if necessary */
4991	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4992	md_wakeup_thread(mddev->thread);
4993	md_wakeup_thread(mddev->sync_thread);
4994	sysfs_notify_dirent_safe(mddev->sysfs_state);
4995	return 0;
4996}
4997
4998static void md_clean(struct mddev *mddev)
4999{
5000	mddev->array_sectors = 0;
5001	mddev->external_size = 0;
5002	mddev->dev_sectors = 0;
5003	mddev->raid_disks = 0;
5004	mddev->recovery_cp = 0;
5005	mddev->resync_min = 0;
5006	mddev->resync_max = MaxSector;
5007	mddev->reshape_position = MaxSector;
5008	mddev->external = 0;
5009	mddev->persistent = 0;
5010	mddev->level = LEVEL_NONE;
5011	mddev->clevel[0] = 0;
5012	mddev->flags = 0;
5013	mddev->ro = 0;
5014	mddev->metadata_type[0] = 0;
5015	mddev->chunk_sectors = 0;
5016	mddev->ctime = mddev->utime = 0;
5017	mddev->layout = 0;
5018	mddev->max_disks = 0;
5019	mddev->events = 0;
5020	mddev->can_decrease_events = 0;
5021	mddev->delta_disks = 0;
5022	mddev->reshape_backwards = 0;
5023	mddev->new_level = LEVEL_NONE;
5024	mddev->new_layout = 0;
5025	mddev->new_chunk_sectors = 0;
5026	mddev->curr_resync = 0;
5027	atomic64_set(&mddev->resync_mismatches, 0);
5028	mddev->suspend_lo = mddev->suspend_hi = 0;
5029	mddev->sync_speed_min = mddev->sync_speed_max = 0;
5030	mddev->recovery = 0;
5031	mddev->in_sync = 0;
5032	mddev->changed = 0;
5033	mddev->degraded = 0;
5034	mddev->safemode = 0;
5035	mddev->merge_check_needed = 0;
5036	mddev->bitmap_info.offset = 0;
5037	mddev->bitmap_info.default_offset = 0;
5038	mddev->bitmap_info.default_space = 0;
5039	mddev->bitmap_info.chunksize = 0;
5040	mddev->bitmap_info.daemon_sleep = 0;
5041	mddev->bitmap_info.max_write_behind = 0;
5042}
5043
5044static void __md_stop_writes(struct mddev *mddev)
5045{
5046	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5047	if (mddev->sync_thread) {
5048		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5049		md_reap_sync_thread(mddev);
5050	}
5051
5052	del_timer_sync(&mddev->safemode_timer);
5053
5054	bitmap_flush(mddev);
5055	md_super_wait(mddev);
5056
5057	if (mddev->ro == 0 &&
5058	    (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
5059		/* mark array as shutdown cleanly */
5060		mddev->in_sync = 1;
5061		md_update_sb(mddev, 1);
5062	}
5063}
5064
5065void md_stop_writes(struct mddev *mddev)
5066{
5067	mddev_lock_nointr(mddev);
5068	__md_stop_writes(mddev);
5069	mddev_unlock(mddev);
5070}
5071EXPORT_SYMBOL_GPL(md_stop_writes);
5072
5073static void __md_stop(struct mddev *mddev)
5074{
5075	mddev->ready = 0;
5076	mddev->pers->stop(mddev);
5077	if (mddev->pers->sync_request && mddev->to_remove == NULL)
5078		mddev->to_remove = &md_redundancy_group;
5079	module_put(mddev->pers->owner);
5080	mddev->pers = NULL;
5081	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5082}
5083
5084void md_stop(struct mddev *mddev)
5085{
5086	/* stop the array and free an attached data structures.
5087	 * This is called from dm-raid
5088	 */
5089	__md_stop(mddev);
5090	bitmap_destroy(mddev);
5091	if (mddev->bio_set)
5092		bioset_free(mddev->bio_set);
5093}
5094
5095EXPORT_SYMBOL_GPL(md_stop);
5096
5097static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5098{
5099	int err = 0;
5100	int did_freeze = 0;
5101
5102	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5103		did_freeze = 1;
5104		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5105		md_wakeup_thread(mddev->thread);
5106	}
5107	if (mddev->sync_thread) {
5108		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5109		/* Thread might be blocked waiting for metadata update
5110		 * which will now never happen */
5111		wake_up_process(mddev->sync_thread->tsk);
5112	}
5113	mddev_unlock(mddev);
5114	wait_event(resync_wait, mddev->sync_thread == NULL);
5115	mddev_lock_nointr(mddev);
5116
5117	mutex_lock(&mddev->open_mutex);
5118	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5119	    mddev->sync_thread ||
5120	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5121		printk("md: %s still in use.\n",mdname(mddev));
5122		if (did_freeze) {
5123			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5124			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5125			md_wakeup_thread(mddev->thread);
5126		}
5127		err = -EBUSY;
5128		goto out;
5129	}
5130	if (mddev->pers) {
5131		__md_stop_writes(mddev);
5132
5133		err  = -ENXIO;
5134		if (mddev->ro==1)
5135			goto out;
5136		mddev->ro = 1;
5137		set_disk_ro(mddev->gendisk, 1);
5138		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5139		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5140		md_wakeup_thread(mddev->thread);
5141		sysfs_notify_dirent_safe(mddev->sysfs_state);
5142		err = 0;
5143	}
5144out:
5145	mutex_unlock(&mddev->open_mutex);
5146	return err;
5147}
5148
5149/* mode:
5150 *   0 - completely stop and dis-assemble array
5151 *   2 - stop but do not disassemble array
5152 */
5153static int do_md_stop(struct mddev *mddev, int mode,
5154		      struct block_device *bdev)
5155{
5156	struct gendisk *disk = mddev->gendisk;
5157	struct md_rdev *rdev;
5158	int did_freeze = 0;
5159
5160	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5161		did_freeze = 1;
5162		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5163		md_wakeup_thread(mddev->thread);
5164	}
5165	if (mddev->sync_thread) {
5166		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5167		/* Thread might be blocked waiting for metadata update
5168		 * which will now never happen */
5169		wake_up_process(mddev->sync_thread->tsk);
5170	}
5171	mddev_unlock(mddev);
5172	wait_event(resync_wait, mddev->sync_thread == NULL);
5173	mddev_lock_nointr(mddev);
5174
5175	mutex_lock(&mddev->open_mutex);
5176	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5177	    mddev->sysfs_active ||
5178	    mddev->sync_thread ||
5179	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5180		printk("md: %s still in use.\n",mdname(mddev));
5181		mutex_unlock(&mddev->open_mutex);
5182		if (did_freeze) {
5183			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5184			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5185			md_wakeup_thread(mddev->thread);
5186		}
5187		return -EBUSY;
5188	}
5189	if (mddev->pers) {
5190		if (mddev->ro)
5191			set_disk_ro(disk, 0);
5192
5193		__md_stop_writes(mddev);
5194		__md_stop(mddev);
5195		mddev->queue->merge_bvec_fn = NULL;
5196		mddev->queue->backing_dev_info.congested_fn = NULL;
5197
5198		/* tell userspace to handle 'inactive' */
5199		sysfs_notify_dirent_safe(mddev->sysfs_state);
5200
5201		rdev_for_each(rdev, mddev)
5202			if (rdev->raid_disk >= 0)
5203				sysfs_unlink_rdev(mddev, rdev);
5204
5205		set_capacity(disk, 0);
5206		mutex_unlock(&mddev->open_mutex);
5207		mddev->changed = 1;
5208		revalidate_disk(disk);
5209
5210		if (mddev->ro)
5211			mddev->ro = 0;
5212	} else
5213		mutex_unlock(&mddev->open_mutex);
5214	/*
5215	 * Free resources if final stop
5216	 */
5217	if (mode == 0) {
5218		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5219
5220		bitmap_destroy(mddev);
5221		if (mddev->bitmap_info.file) {
5222			fput(mddev->bitmap_info.file);
5223			mddev->bitmap_info.file = NULL;
5224		}
5225		mddev->bitmap_info.offset = 0;
5226
5227		export_array(mddev);
5228
5229		md_clean(mddev);
5230		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5231		if (mddev->hold_active == UNTIL_STOP)
5232			mddev->hold_active = 0;
5233	}
5234	blk_integrity_unregister(disk);
5235	md_new_event(mddev);
5236	sysfs_notify_dirent_safe(mddev->sysfs_state);
5237	return 0;
5238}
5239
5240#ifndef MODULE
5241static void autorun_array(struct mddev *mddev)
5242{
5243	struct md_rdev *rdev;
5244	int err;
5245
5246	if (list_empty(&mddev->disks))
5247		return;
5248
5249	printk(KERN_INFO "md: running: ");
5250
5251	rdev_for_each(rdev, mddev) {
5252		char b[BDEVNAME_SIZE];
5253		printk("<%s>", bdevname(rdev->bdev,b));
5254	}
5255	printk("\n");
5256
5257	err = do_md_run(mddev);
5258	if (err) {
5259		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5260		do_md_stop(mddev, 0, NULL);
5261	}
5262}
5263
5264/*
5265 * lets try to run arrays based on all disks that have arrived
5266 * until now. (those are in pending_raid_disks)
5267 *
5268 * the method: pick the first pending disk, collect all disks with
5269 * the same UUID, remove all from the pending list and put them into
5270 * the 'same_array' list. Then order this list based on superblock
5271 * update time (freshest comes first), kick out 'old' disks and
5272 * compare superblocks. If everything's fine then run it.
5273 *
5274 * If "unit" is allocated, then bump its reference count
5275 */
5276static void autorun_devices(int part)
5277{
5278	struct md_rdev *rdev0, *rdev, *tmp;
5279	struct mddev *mddev;
5280	char b[BDEVNAME_SIZE];
5281
5282	printk(KERN_INFO "md: autorun ...\n");
5283	while (!list_empty(&pending_raid_disks)) {
5284		int unit;
5285		dev_t dev;
5286		LIST_HEAD(candidates);
5287		rdev0 = list_entry(pending_raid_disks.next,
5288					 struct md_rdev, same_set);
5289
5290		printk(KERN_INFO "md: considering %s ...\n",
5291			bdevname(rdev0->bdev,b));
5292		INIT_LIST_HEAD(&candidates);
5293		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5294			if (super_90_load(rdev, rdev0, 0) >= 0) {
5295				printk(KERN_INFO "md:  adding %s ...\n",
5296					bdevname(rdev->bdev,b));
5297				list_move(&rdev->same_set, &candidates);
5298			}
5299		/*
5300		 * now we have a set of devices, with all of them having
5301		 * mostly sane superblocks. It's time to allocate the
5302		 * mddev.
5303		 */
5304		if (part) {
5305			dev = MKDEV(mdp_major,
5306				    rdev0->preferred_minor << MdpMinorShift);
5307			unit = MINOR(dev) >> MdpMinorShift;
5308		} else {
5309			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5310			unit = MINOR(dev);
5311		}
5312		if (rdev0->preferred_minor != unit) {
5313			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5314			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5315			break;
5316		}
5317
5318		md_probe(dev, NULL, NULL);
5319		mddev = mddev_find(dev);
5320		if (!mddev || !mddev->gendisk) {
5321			if (mddev)
5322				mddev_put(mddev);
5323			printk(KERN_ERR
5324				"md: cannot allocate memory for md drive.\n");
5325			break;
5326		}
5327		if (mddev_lock(mddev))
5328			printk(KERN_WARNING "md: %s locked, cannot run\n",
5329			       mdname(mddev));
5330		else if (mddev->raid_disks || mddev->major_version
5331			 || !list_empty(&mddev->disks)) {
5332			printk(KERN_WARNING
5333				"md: %s already running, cannot run %s\n",
5334				mdname(mddev), bdevname(rdev0->bdev,b));
5335			mddev_unlock(mddev);
5336		} else {
5337			printk(KERN_INFO "md: created %s\n", mdname(mddev));
5338			mddev->persistent = 1;
5339			rdev_for_each_list(rdev, tmp, &candidates) {
5340				list_del_init(&rdev->same_set);
5341				if (bind_rdev_to_array(rdev, mddev))
5342					export_rdev(rdev);
5343			}
5344			autorun_array(mddev);
5345			mddev_unlock(mddev);
5346		}
5347		/* on success, candidates will be empty, on error
5348		 * it won't...
5349		 */
5350		rdev_for_each_list(rdev, tmp, &candidates) {
5351			list_del_init(&rdev->same_set);
5352			export_rdev(rdev);
5353		}
5354		mddev_put(mddev);
5355	}
5356	printk(KERN_INFO "md: ... autorun DONE.\n");
5357}
5358#endif /* !MODULE */
5359
5360static int get_version(void __user *arg)
5361{
5362	mdu_version_t ver;
5363
5364	ver.major = MD_MAJOR_VERSION;
5365	ver.minor = MD_MINOR_VERSION;
5366	ver.patchlevel = MD_PATCHLEVEL_VERSION;
5367
5368	if (copy_to_user(arg, &ver, sizeof(ver)))
5369		return -EFAULT;
5370
5371	return 0;
5372}
5373
5374static int get_array_info(struct mddev *mddev, void __user *arg)
5375{
5376	mdu_array_info_t info;
5377	int nr,working,insync,failed,spare;
5378	struct md_rdev *rdev;
5379
5380	nr = working = insync = failed = spare = 0;
5381	rcu_read_lock();
5382	rdev_for_each_rcu(rdev, mddev) {
5383		nr++;
5384		if (test_bit(Faulty, &rdev->flags))
5385			failed++;
5386		else {
5387			working++;
5388			if (test_bit(In_sync, &rdev->flags))
5389				insync++;
5390			else
5391				spare++;
5392		}
5393	}
5394	rcu_read_unlock();
5395
5396	info.major_version = mddev->major_version;
5397	info.minor_version = mddev->minor_version;
5398	info.patch_version = MD_PATCHLEVEL_VERSION;
5399	info.ctime         = mddev->ctime;
5400	info.level         = mddev->level;
5401	info.size          = mddev->dev_sectors / 2;
5402	if (info.size != mddev->dev_sectors / 2) /* overflow */
5403		info.size = -1;
5404	info.nr_disks      = nr;
5405	info.raid_disks    = mddev->raid_disks;
5406	info.md_minor      = mddev->md_minor;
5407	info.not_persistent= !mddev->persistent;
5408
5409	info.utime         = mddev->utime;
5410	info.state         = 0;
5411	if (mddev->in_sync)
5412		info.state = (1<<MD_SB_CLEAN);
5413	if (mddev->bitmap && mddev->bitmap_info.offset)
5414		info.state |= (1<<MD_SB_BITMAP_PRESENT);
5415	info.active_disks  = insync;
5416	info.working_disks = working;
5417	info.failed_disks  = failed;
5418	info.spare_disks   = spare;
5419
5420	info.layout        = mddev->layout;
5421	info.chunk_size    = mddev->chunk_sectors << 9;
5422
5423	if (copy_to_user(arg, &info, sizeof(info)))
5424		return -EFAULT;
5425
5426	return 0;
5427}
5428
5429static int get_bitmap_file(struct mddev *mddev, void __user * arg)
5430{
5431	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5432	char *ptr, *buf = NULL;
5433	int err = -ENOMEM;
5434
5435	file = kmalloc(sizeof(*file), GFP_NOIO);
5436
5437	if (!file)
5438		goto out;
5439
5440	/* bitmap disabled, zero the first byte and copy out */
5441	if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5442		file->pathname[0] = '\0';
5443		goto copy_out;
5444	}
5445
5446	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5447	if (!buf)
5448		goto out;
5449
5450	ptr = d_path(&mddev->bitmap->storage.file->f_path,
5451		     buf, sizeof(file->pathname));
5452	if (IS_ERR(ptr))
5453		goto out;
5454
5455	strcpy(file->pathname, ptr);
5456
5457copy_out:
5458	err = 0;
5459	if (copy_to_user(arg, file, sizeof(*file)))
5460		err = -EFAULT;
5461out:
5462	kfree(buf);
5463	kfree(file);
5464	return err;
5465}
5466
5467static int get_disk_info(struct mddev *mddev, void __user * arg)
5468{
5469	mdu_disk_info_t info;
5470	struct md_rdev *rdev;
5471
5472	if (copy_from_user(&info, arg, sizeof(info)))
5473		return -EFAULT;
5474
5475	rcu_read_lock();
5476	rdev = find_rdev_nr_rcu(mddev, info.number);
5477	if (rdev) {
5478		info.major = MAJOR(rdev->bdev->bd_dev);
5479		info.minor = MINOR(rdev->bdev->bd_dev);
5480		info.raid_disk = rdev->raid_disk;
5481		info.state = 0;
5482		if (test_bit(Faulty, &rdev->flags))
5483			info.state |= (1<<MD_DISK_FAULTY);
5484		else if (test_bit(In_sync, &rdev->flags)) {
5485			info.state |= (1<<MD_DISK_ACTIVE);
5486			info.state |= (1<<MD_DISK_SYNC);
5487		}
5488		if (test_bit(WriteMostly, &rdev->flags))
5489			info.state |= (1<<MD_DISK_WRITEMOSTLY);
5490	} else {
5491		info.major = info.minor = 0;
5492		info.raid_disk = -1;
5493		info.state = (1<<MD_DISK_REMOVED);
5494	}
5495	rcu_read_unlock();
5496
5497	if (copy_to_user(arg, &info, sizeof(info)))
5498		return -EFAULT;
5499
5500	return 0;
5501}
5502
5503static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5504{
5505	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5506	struct md_rdev *rdev;
5507	dev_t dev = MKDEV(info->major,info->minor);
5508
5509	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5510		return -EOVERFLOW;
5511
5512	if (!mddev->raid_disks) {
5513		int err;
5514		/* expecting a device which has a superblock */
5515		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5516		if (IS_ERR(rdev)) {
5517			printk(KERN_WARNING
5518				"md: md_import_device returned %ld\n",
5519				PTR_ERR(rdev));
5520			return PTR_ERR(rdev);
5521		}
5522		if (!list_empty(&mddev->disks)) {
5523			struct md_rdev *rdev0
5524				= list_entry(mddev->disks.next,
5525					     struct md_rdev, same_set);
5526			err = super_types[mddev->major_version]
5527				.load_super(rdev, rdev0, mddev->minor_version);
5528			if (err < 0) {
5529				printk(KERN_WARNING
5530					"md: %s has different UUID to %s\n",
5531					bdevname(rdev->bdev,b),
5532					bdevname(rdev0->bdev,b2));
5533				export_rdev(rdev);
5534				return -EINVAL;
5535			}
5536		}
5537		err = bind_rdev_to_array(rdev, mddev);
5538		if (err)
5539			export_rdev(rdev);
5540		return err;
5541	}
5542
5543	/*
5544	 * add_new_disk can be used once the array is assembled
5545	 * to add "hot spares".  They must already have a superblock
5546	 * written
5547	 */
5548	if (mddev->pers) {
5549		int err;
5550		if (!mddev->pers->hot_add_disk) {
5551			printk(KERN_WARNING
5552				"%s: personality does not support diskops!\n",
5553			       mdname(mddev));
5554			return -EINVAL;
5555		}
5556		if (mddev->persistent)
5557			rdev = md_import_device(dev, mddev->major_version,
5558						mddev->minor_version);
5559		else
5560			rdev = md_import_device(dev, -1, -1);
5561		if (IS_ERR(rdev)) {
5562			printk(KERN_WARNING
5563				"md: md_import_device returned %ld\n",
5564				PTR_ERR(rdev));
5565			return PTR_ERR(rdev);
5566		}
5567		/* set saved_raid_disk if appropriate */
5568		if (!mddev->persistent) {
5569			if (info->state & (1<<MD_DISK_SYNC)  &&
5570			    info->raid_disk < mddev->raid_disks) {
5571				rdev->raid_disk = info->raid_disk;
5572				set_bit(In_sync, &rdev->flags);
5573				clear_bit(Bitmap_sync, &rdev->flags);
5574			} else
5575				rdev->raid_disk = -1;
5576			rdev->saved_raid_disk = rdev->raid_disk;
5577		} else
5578			super_types[mddev->major_version].
5579				validate_super(mddev, rdev);
5580		if ((info->state & (1<<MD_DISK_SYNC)) &&
5581		     rdev->raid_disk != info->raid_disk) {
5582			/* This was a hot-add request, but events doesn't
5583			 * match, so reject it.
5584			 */
5585			export_rdev(rdev);
5586			return -EINVAL;
5587		}
5588
5589		clear_bit(In_sync, &rdev->flags); /* just to be sure */
5590		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5591			set_bit(WriteMostly, &rdev->flags);
5592		else
5593			clear_bit(WriteMostly, &rdev->flags);
5594
5595		rdev->raid_disk = -1;
5596		err = bind_rdev_to_array(rdev, mddev);
5597		if (!err && !mddev->pers->hot_remove_disk) {
5598			/* If there is hot_add_disk but no hot_remove_disk
5599			 * then added disks for geometry changes,
5600			 * and should be added immediately.
5601			 */
5602			super_types[mddev->major_version].
5603				validate_super(mddev, rdev);
5604			err = mddev->pers->hot_add_disk(mddev, rdev);
5605			if (err)
5606				unbind_rdev_from_array(rdev);
5607		}
5608		if (err)
5609			export_rdev(rdev);
5610		else
5611			sysfs_notify_dirent_safe(rdev->sysfs_state);
5612
5613		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5614		if (mddev->degraded)
5615			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5616		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5617		if (!err)
5618			md_new_event(mddev);
5619		md_wakeup_thread(mddev->thread);
5620		return err;
5621	}
5622
5623	/* otherwise, add_new_disk is only allowed
5624	 * for major_version==0 superblocks
5625	 */
5626	if (mddev->major_version != 0) {
5627		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5628		       mdname(mddev));
5629		return -EINVAL;
5630	}
5631
5632	if (!(info->state & (1<<MD_DISK_FAULTY))) {
5633		int err;
5634		rdev = md_import_device(dev, -1, 0);
5635		if (IS_ERR(rdev)) {
5636			printk(KERN_WARNING
5637				"md: error, md_import_device() returned %ld\n",
5638				PTR_ERR(rdev));
5639			return PTR_ERR(rdev);
5640		}
5641		rdev->desc_nr = info->number;
5642		if (info->raid_disk < mddev->raid_disks)
5643			rdev->raid_disk = info->raid_disk;
5644		else
5645			rdev->raid_disk = -1;
5646
5647		if (rdev->raid_disk < mddev->raid_disks)
5648			if (info->state & (1<<MD_DISK_SYNC))
5649				set_bit(In_sync, &rdev->flags);
5650
5651		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5652			set_bit(WriteMostly, &rdev->flags);
5653
5654		if (!mddev->persistent) {
5655			printk(KERN_INFO "md: nonpersistent superblock ...\n");
5656			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5657		} else
5658			rdev->sb_start = calc_dev_sboffset(rdev);
5659		rdev->sectors = rdev->sb_start;
5660
5661		err = bind_rdev_to_array(rdev, mddev);
5662		if (err) {
5663			export_rdev(rdev);
5664			return err;
5665		}
5666	}
5667
5668	return 0;
5669}
5670
5671static int hot_remove_disk(struct mddev *mddev, dev_t dev)
5672{
5673	char b[BDEVNAME_SIZE];
5674	struct md_rdev *rdev;
5675
5676	rdev = find_rdev(mddev, dev);
5677	if (!rdev)
5678		return -ENXIO;
5679
5680	clear_bit(Blocked, &rdev->flags);
5681	remove_and_add_spares(mddev, rdev);
5682
5683	if (rdev->raid_disk >= 0)
5684		goto busy;
5685
5686	kick_rdev_from_array(rdev);
5687	md_update_sb(mddev, 1);
5688	md_new_event(mddev);
5689
5690	return 0;
5691busy:
5692	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5693		bdevname(rdev->bdev,b), mdname(mddev));
5694	return -EBUSY;
5695}
5696
5697static int hot_add_disk(struct mddev *mddev, dev_t dev)
5698{
5699	char b[BDEVNAME_SIZE];
5700	int err;
5701	struct md_rdev *rdev;
5702
5703	if (!mddev->pers)
5704		return -ENODEV;
5705
5706	if (mddev->major_version != 0) {
5707		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5708			" version-0 superblocks.\n",
5709			mdname(mddev));
5710		return -EINVAL;
5711	}
5712	if (!mddev->pers->hot_add_disk) {
5713		printk(KERN_WARNING
5714			"%s: personality does not support diskops!\n",
5715			mdname(mddev));
5716		return -EINVAL;
5717	}
5718
5719	rdev = md_import_device(dev, -1, 0);
5720	if (IS_ERR(rdev)) {
5721		printk(KERN_WARNING
5722			"md: error, md_import_device() returned %ld\n",
5723			PTR_ERR(rdev));
5724		return -EINVAL;
5725	}
5726
5727	if (mddev->persistent)
5728		rdev->sb_start = calc_dev_sboffset(rdev);
5729	else
5730		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5731
5732	rdev->sectors = rdev->sb_start;
5733
5734	if (test_bit(Faulty, &rdev->flags)) {
5735		printk(KERN_WARNING
5736			"md: can not hot-add faulty %s disk to %s!\n",
5737			bdevname(rdev->bdev,b), mdname(mddev));
5738		err = -EINVAL;
5739		goto abort_export;
5740	}
5741	clear_bit(In_sync, &rdev->flags);
5742	rdev->desc_nr = -1;
5743	rdev->saved_raid_disk = -1;
5744	err = bind_rdev_to_array(rdev, mddev);
5745	if (err)
5746		goto abort_export;
5747
5748	/*
5749	 * The rest should better be atomic, we can have disk failures
5750	 * noticed in interrupt contexts ...
5751	 */
5752
5753	rdev->raid_disk = -1;
5754
5755	md_update_sb(mddev, 1);
5756
5757	/*
5758	 * Kick recovery, maybe this spare has to be added to the
5759	 * array immediately.
5760	 */
5761	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5762	md_wakeup_thread(mddev->thread);
5763	md_new_event(mddev);
5764	return 0;
5765
5766abort_export:
5767	export_rdev(rdev);
5768	return err;
5769}
5770
5771static int set_bitmap_file(struct mddev *mddev, int fd)
5772{
5773	int err = 0;
5774
5775	if (mddev->pers) {
5776		if (!mddev->pers->quiesce || !mddev->thread)
5777			return -EBUSY;
5778		if (mddev->recovery || mddev->sync_thread)
5779			return -EBUSY;
5780		/* we should be able to change the bitmap.. */
5781	}
5782
5783	if (fd >= 0) {
5784		struct inode *inode;
5785		if (mddev->bitmap)
5786			return -EEXIST; /* cannot add when bitmap is present */
5787		mddev->bitmap_info.file = fget(fd);
5788
5789		if (mddev->bitmap_info.file == NULL) {
5790			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5791			       mdname(mddev));
5792			return -EBADF;
5793		}
5794
5795		inode = mddev->bitmap_info.file->f_mapping->host;
5796		if (!S_ISREG(inode->i_mode)) {
5797			printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
5798			       mdname(mddev));
5799			err = -EBADF;
5800		} else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
5801			printk(KERN_ERR "%s: error: bitmap file must open for write\n",
5802			       mdname(mddev));
5803			err = -EBADF;
5804		} else if (atomic_read(&inode->i_writecount) != 1) {
5805			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5806			       mdname(mddev));
5807			err = -EBUSY;
5808		}
5809		if (err) {
5810			fput(mddev->bitmap_info.file);
5811			mddev->bitmap_info.file = NULL;
5812			return err;
5813		}
5814		mddev->bitmap_info.offset = 0; /* file overrides offset */
5815	} else if (mddev->bitmap == NULL)
5816		return -ENOENT; /* cannot remove what isn't there */
5817	err = 0;
5818	if (mddev->pers) {
5819		mddev->pers->quiesce(mddev, 1);
5820		if (fd >= 0) {
5821			err = bitmap_create(mddev);
5822			if (!err)
5823				err = bitmap_load(mddev);
5824		}
5825		if (fd < 0 || err) {
5826			bitmap_destroy(mddev);
5827			fd = -1; /* make sure to put the file */
5828		}
5829		mddev->pers->quiesce(mddev, 0);
5830	}
5831	if (fd < 0) {
5832		if (mddev->bitmap_info.file)
5833			fput(mddev->bitmap_info.file);
5834		mddev->bitmap_info.file = NULL;
5835	}
5836
5837	return err;
5838}
5839
5840/*
5841 * set_array_info is used two different ways
5842 * The original usage is when creating a new array.
5843 * In this usage, raid_disks is > 0 and it together with
5844 *  level, size, not_persistent,layout,chunksize determine the
5845 *  shape of the array.
5846 *  This will always create an array with a type-0.90.0 superblock.
5847 * The newer usage is when assembling an array.
5848 *  In this case raid_disks will be 0, and the major_version field is
5849 *  use to determine which style super-blocks are to be found on the devices.
5850 *  The minor and patch _version numbers are also kept incase the
5851 *  super_block handler wishes to interpret them.
5852 */
5853static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
5854{
5855
5856	if (info->raid_disks == 0) {
5857		/* just setting version number for superblock loading */
5858		if (info->major_version < 0 ||
5859		    info->major_version >= ARRAY_SIZE(super_types) ||
5860		    super_types[info->major_version].name == NULL) {
5861			/* maybe try to auto-load a module? */
5862			printk(KERN_INFO
5863				"md: superblock version %d not known\n",
5864				info->major_version);
5865			return -EINVAL;
5866		}
5867		mddev->major_version = info->major_version;
5868		mddev->minor_version = info->minor_version;
5869		mddev->patch_version = info->patch_version;
5870		mddev->persistent = !info->not_persistent;
5871		/* ensure mddev_put doesn't delete this now that there
5872		 * is some minimal configuration.
5873		 */
5874		mddev->ctime         = get_seconds();
5875		return 0;
5876	}
5877	mddev->major_version = MD_MAJOR_VERSION;
5878	mddev->minor_version = MD_MINOR_VERSION;
5879	mddev->patch_version = MD_PATCHLEVEL_VERSION;
5880	mddev->ctime         = get_seconds();
5881
5882	mddev->level         = info->level;
5883	mddev->clevel[0]     = 0;
5884	mddev->dev_sectors   = 2 * (sector_t)info->size;
5885	mddev->raid_disks    = info->raid_disks;
5886	/* don't set md_minor, it is determined by which /dev/md* was
5887	 * openned
5888	 */
5889	if (info->state & (1<<MD_SB_CLEAN))
5890		mddev->recovery_cp = MaxSector;
5891	else
5892		mddev->recovery_cp = 0;
5893	mddev->persistent    = ! info->not_persistent;
5894	mddev->external	     = 0;
5895
5896	mddev->layout        = info->layout;
5897	mddev->chunk_sectors = info->chunk_size >> 9;
5898
5899	mddev->max_disks     = MD_SB_DISKS;
5900
5901	if (mddev->persistent)
5902		mddev->flags         = 0;
5903	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5904
5905	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5906	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
5907	mddev->bitmap_info.offset = 0;
5908
5909	mddev->reshape_position = MaxSector;
5910
5911	/*
5912	 * Generate a 128 bit UUID
5913	 */
5914	get_random_bytes(mddev->uuid, 16);
5915
5916	mddev->new_level = mddev->level;
5917	mddev->new_chunk_sectors = mddev->chunk_sectors;
5918	mddev->new_layout = mddev->layout;
5919	mddev->delta_disks = 0;
5920	mddev->reshape_backwards = 0;
5921
5922	return 0;
5923}
5924
5925void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
5926{
5927	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5928
5929	if (mddev->external_size)
5930		return;
5931
5932	mddev->array_sectors = array_sectors;
5933}
5934EXPORT_SYMBOL(md_set_array_sectors);
5935
5936static int update_size(struct mddev *mddev, sector_t num_sectors)
5937{
5938	struct md_rdev *rdev;
5939	int rv;
5940	int fit = (num_sectors == 0);
5941
5942	if (mddev->pers->resize == NULL)
5943		return -EINVAL;
5944	/* The "num_sectors" is the number of sectors of each device that
5945	 * is used.  This can only make sense for arrays with redundancy.
5946	 * linear and raid0 always use whatever space is available. We can only
5947	 * consider changing this number if no resync or reconstruction is
5948	 * happening, and if the new size is acceptable. It must fit before the
5949	 * sb_start or, if that is <data_offset, it must fit before the size
5950	 * of each device.  If num_sectors is zero, we find the largest size
5951	 * that fits.
5952	 */
5953	if (mddev->sync_thread)
5954		return -EBUSY;
5955	if (mddev->ro)
5956		return -EROFS;
5957
5958	rdev_for_each(rdev, mddev) {
5959		sector_t avail = rdev->sectors;
5960
5961		if (fit && (num_sectors == 0 || num_sectors > avail))
5962			num_sectors = avail;
5963		if (avail < num_sectors)
5964			return -ENOSPC;
5965	}
5966	rv = mddev->pers->resize(mddev, num_sectors);
5967	if (!rv)
5968		revalidate_disk(mddev->gendisk);
5969	return rv;
5970}
5971
5972static int update_raid_disks(struct mddev *mddev, int raid_disks)
5973{
5974	int rv;
5975	struct md_rdev *rdev;
5976	/* change the number of raid disks */
5977	if (mddev->pers->check_reshape == NULL)
5978		return -EINVAL;
5979	if (mddev->ro)
5980		return -EROFS;
5981	if (raid_disks <= 0 ||
5982	    (mddev->max_disks && raid_disks >= mddev->max_disks))
5983		return -EINVAL;
5984	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5985		return -EBUSY;
5986
5987	rdev_for_each(rdev, mddev) {
5988		if (mddev->raid_disks < raid_disks &&
5989		    rdev->data_offset < rdev->new_data_offset)
5990			return -EINVAL;
5991		if (mddev->raid_disks > raid_disks &&
5992		    rdev->data_offset > rdev->new_data_offset)
5993			return -EINVAL;
5994	}
5995
5996	mddev->delta_disks = raid_disks - mddev->raid_disks;
5997	if (mddev->delta_disks < 0)
5998		mddev->reshape_backwards = 1;
5999	else if (mddev->delta_disks > 0)
6000		mddev->reshape_backwards = 0;
6001
6002	rv = mddev->pers->check_reshape(mddev);
6003	if (rv < 0) {
6004		mddev->delta_disks = 0;
6005		mddev->reshape_backwards = 0;
6006	}
6007	return rv;
6008}
6009
6010/*
6011 * update_array_info is used to change the configuration of an
6012 * on-line array.
6013 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6014 * fields in the info are checked against the array.
6015 * Any differences that cannot be handled will cause an error.
6016 * Normally, only one change can be managed at a time.
6017 */
6018static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6019{
6020	int rv = 0;
6021	int cnt = 0;
6022	int state = 0;
6023
6024	/* calculate expected state,ignoring low bits */
6025	if (mddev->bitmap && mddev->bitmap_info.offset)
6026		state |= (1 << MD_SB_BITMAP_PRESENT);
6027
6028	if (mddev->major_version != info->major_version ||
6029	    mddev->minor_version != info->minor_version ||
6030/*	    mddev->patch_version != info->patch_version || */
6031	    mddev->ctime         != info->ctime         ||
6032	    mddev->level         != info->level         ||
6033/*	    mddev->layout        != info->layout        || */
6034	    !mddev->persistent	 != info->not_persistent||
6035	    mddev->chunk_sectors != info->chunk_size >> 9 ||
6036	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6037	    ((state^info->state) & 0xfffffe00)
6038		)
6039		return -EINVAL;
6040	/* Check there is only one change */
6041	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6042		cnt++;
6043	if (mddev->raid_disks != info->raid_disks)
6044		cnt++;
6045	if (mddev->layout != info->layout)
6046		cnt++;
6047	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6048		cnt++;
6049	if (cnt == 0)
6050		return 0;
6051	if (cnt > 1)
6052		return -EINVAL;
6053
6054	if (mddev->layout != info->layout) {
6055		/* Change layout
6056		 * we don't need to do anything at the md level, the
6057		 * personality will take care of it all.
6058		 */
6059		if (mddev->pers->check_reshape == NULL)
6060			return -EINVAL;
6061		else {
6062			mddev->new_layout = info->layout;
6063			rv = mddev->pers->check_reshape(mddev);
6064			if (rv)
6065				mddev->new_layout = mddev->layout;
6066			return rv;
6067		}
6068	}
6069	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6070		rv = update_size(mddev, (sector_t)info->size * 2);
6071
6072	if (mddev->raid_disks    != info->raid_disks)
6073		rv = update_raid_disks(mddev, info->raid_disks);
6074
6075	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6076		if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
6077			return -EINVAL;
6078		if (mddev->recovery || mddev->sync_thread)
6079			return -EBUSY;
6080		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6081			/* add the bitmap */
6082			if (mddev->bitmap)
6083				return -EEXIST;
6084			if (mddev->bitmap_info.default_offset == 0)
6085				return -EINVAL;
6086			mddev->bitmap_info.offset =
6087				mddev->bitmap_info.default_offset;
6088			mddev->bitmap_info.space =
6089				mddev->bitmap_info.default_space;
6090			mddev->pers->quiesce(mddev, 1);
6091			rv = bitmap_create(mddev);
6092			if (!rv)
6093				rv = bitmap_load(mddev);
6094			if (rv)
6095				bitmap_destroy(mddev);
6096			mddev->pers->quiesce(mddev, 0);
6097		} else {
6098			/* remove the bitmap */
6099			if (!mddev->bitmap)
6100				return -ENOENT;
6101			if (mddev->bitmap->storage.file)
6102				return -EINVAL;
6103			mddev->pers->quiesce(mddev, 1);
6104			bitmap_destroy(mddev);
6105			mddev->pers->quiesce(mddev, 0);
6106			mddev->bitmap_info.offset = 0;
6107		}
6108	}
6109	md_update_sb(mddev, 1);
6110	return rv;
6111}
6112
6113static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6114{
6115	struct md_rdev *rdev;
6116	int err = 0;
6117
6118	if (mddev->pers == NULL)
6119		return -ENODEV;
6120
6121	rcu_read_lock();
6122	rdev = find_rdev_rcu(mddev, dev);
6123	if (!rdev)
6124		err =  -ENODEV;
6125	else {
6126		md_error(mddev, rdev);
6127		if (!test_bit(Faulty, &rdev->flags))
6128			err = -EBUSY;
6129	}
6130	rcu_read_unlock();
6131	return err;
6132}
6133
6134/*
6135 * We have a problem here : there is no easy way to give a CHS
6136 * virtual geometry. We currently pretend that we have a 2 heads
6137 * 4 sectors (with a BIG number of cylinders...). This drives
6138 * dosfs just mad... ;-)
6139 */
6140static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6141{
6142	struct mddev *mddev = bdev->bd_disk->private_data;
6143
6144	geo->heads = 2;
6145	geo->sectors = 4;
6146	geo->cylinders = mddev->array_sectors / 8;
6147	return 0;
6148}
6149
6150static inline bool md_ioctl_valid(unsigned int cmd)
6151{
6152	switch (cmd) {
6153	case ADD_NEW_DISK:
6154	case BLKROSET:
6155	case GET_ARRAY_INFO:
6156	case GET_BITMAP_FILE:
6157	case GET_DISK_INFO:
6158	case HOT_ADD_DISK:
6159	case HOT_REMOVE_DISK:
6160	case RAID_AUTORUN:
6161	case RAID_VERSION:
6162	case RESTART_ARRAY_RW:
6163	case RUN_ARRAY:
6164	case SET_ARRAY_INFO:
6165	case SET_BITMAP_FILE:
6166	case SET_DISK_FAULTY:
6167	case STOP_ARRAY:
6168	case STOP_ARRAY_RO:
6169		return true;
6170	default:
6171		return false;
6172	}
6173}
6174
6175static int md_ioctl(struct block_device *bdev, fmode_t mode,
6176			unsigned int cmd, unsigned long arg)
6177{
6178	int err = 0;
6179	void __user *argp = (void __user *)arg;
6180	struct mddev *mddev = NULL;
6181	int ro;
6182
6183	if (!md_ioctl_valid(cmd))
6184		return -ENOTTY;
6185
6186	switch (cmd) {
6187	case RAID_VERSION:
6188	case GET_ARRAY_INFO:
6189	case GET_DISK_INFO:
6190		break;
6191	default:
6192		if (!capable(CAP_SYS_ADMIN))
6193			return -EACCES;
6194	}
6195
6196	/*
6197	 * Commands dealing with the RAID driver but not any
6198	 * particular array:
6199	 */
6200	switch (cmd) {
6201	case RAID_VERSION:
6202		err = get_version(argp);
6203		goto out;
6204
6205#ifndef MODULE
6206	case RAID_AUTORUN:
6207		err = 0;
6208		autostart_arrays(arg);
6209		goto out;
6210#endif
6211	default:;
6212	}
6213
6214	/*
6215	 * Commands creating/starting a new array:
6216	 */
6217
6218	mddev = bdev->bd_disk->private_data;
6219
6220	if (!mddev) {
6221		BUG();
6222		goto out;
6223	}
6224
6225	/* Some actions do not requires the mutex */
6226	switch (cmd) {
6227	case GET_ARRAY_INFO:
6228		if (!mddev->raid_disks && !mddev->external)
6229			err = -ENODEV;
6230		else
6231			err = get_array_info(mddev, argp);
6232		goto out;
6233
6234	case GET_DISK_INFO:
6235		if (!mddev->raid_disks && !mddev->external)
6236			err = -ENODEV;
6237		else
6238			err = get_disk_info(mddev, argp);
6239		goto out;
6240
6241	case SET_DISK_FAULTY:
6242		err = set_disk_faulty(mddev, new_decode_dev(arg));
6243		goto out;
6244	}
6245
6246	if (cmd == ADD_NEW_DISK)
6247		/* need to ensure md_delayed_delete() has completed */
6248		flush_workqueue(md_misc_wq);
6249
6250	if (cmd == HOT_REMOVE_DISK)
6251		/* need to ensure recovery thread has run */
6252		wait_event_interruptible_timeout(mddev->sb_wait,
6253						 !test_bit(MD_RECOVERY_NEEDED,
6254							   &mddev->flags),
6255						 msecs_to_jiffies(5000));
6256	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6257		/* Need to flush page cache, and ensure no-one else opens
6258		 * and writes
6259		 */
6260		mutex_lock(&mddev->open_mutex);
6261		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
6262			mutex_unlock(&mddev->open_mutex);
6263			err = -EBUSY;
6264			goto out;
6265		}
6266		set_bit(MD_STILL_CLOSED, &mddev->flags);
6267		mutex_unlock(&mddev->open_mutex);
6268		sync_blockdev(bdev);
6269	}
6270	err = mddev_lock(mddev);
6271	if (err) {
6272		printk(KERN_INFO
6273			"md: ioctl lock interrupted, reason %d, cmd %d\n",
6274			err, cmd);
6275		goto out;
6276	}
6277
6278	if (cmd == SET_ARRAY_INFO) {
6279		mdu_array_info_t info;
6280		if (!arg)
6281			memset(&info, 0, sizeof(info));
6282		else if (copy_from_user(&info, argp, sizeof(info))) {
6283			err = -EFAULT;
6284			goto unlock;
6285		}
6286		if (mddev->pers) {
6287			err = update_array_info(mddev, &info);
6288			if (err) {
6289				printk(KERN_WARNING "md: couldn't update"
6290				       " array info. %d\n", err);
6291				goto unlock;
6292			}
6293			goto unlock;
6294		}
6295		if (!list_empty(&mddev->disks)) {
6296			printk(KERN_WARNING
6297			       "md: array %s already has disks!\n",
6298			       mdname(mddev));
6299			err = -EBUSY;
6300			goto unlock;
6301		}
6302		if (mddev->raid_disks) {
6303			printk(KERN_WARNING
6304			       "md: array %s already initialised!\n",
6305			       mdname(mddev));
6306			err = -EBUSY;
6307			goto unlock;
6308		}
6309		err = set_array_info(mddev, &info);
6310		if (err) {
6311			printk(KERN_WARNING "md: couldn't set"
6312			       " array info. %d\n", err);
6313			goto unlock;
6314		}
6315		goto unlock;
6316	}
6317
6318	/*
6319	 * Commands querying/configuring an existing array:
6320	 */
6321	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6322	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6323	if ((!mddev->raid_disks && !mddev->external)
6324	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6325	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6326	    && cmd != GET_BITMAP_FILE) {
6327		err = -ENODEV;
6328		goto unlock;
6329	}
6330
6331	/*
6332	 * Commands even a read-only array can execute:
6333	 */
6334	switch (cmd) {
6335	case GET_BITMAP_FILE:
6336		err = get_bitmap_file(mddev, argp);
6337		goto unlock;
6338
6339	case RESTART_ARRAY_RW:
6340		err = restart_array(mddev);
6341		goto unlock;
6342
6343	case STOP_ARRAY:
6344		err = do_md_stop(mddev, 0, bdev);
6345		goto unlock;
6346
6347	case STOP_ARRAY_RO:
6348		err = md_set_readonly(mddev, bdev);
6349		goto unlock;
6350
6351	case HOT_REMOVE_DISK:
6352		err = hot_remove_disk(mddev, new_decode_dev(arg));
6353		goto unlock;
6354
6355	case ADD_NEW_DISK:
6356		/* We can support ADD_NEW_DISK on read-only arrays
6357		 * on if we are re-adding a preexisting device.
6358		 * So require mddev->pers and MD_DISK_SYNC.
6359		 */
6360		if (mddev->pers) {
6361			mdu_disk_info_t info;
6362			if (copy_from_user(&info, argp, sizeof(info)))
6363				err = -EFAULT;
6364			else if (!(info.state & (1<<MD_DISK_SYNC)))
6365				/* Need to clear read-only for this */
6366				break;
6367			else
6368				err = add_new_disk(mddev, &info);
6369			goto unlock;
6370		}
6371		break;
6372
6373	case BLKROSET:
6374		if (get_user(ro, (int __user *)(arg))) {
6375			err = -EFAULT;
6376			goto unlock;
6377		}
6378		err = -EINVAL;
6379
6380		/* if the bdev is going readonly the value of mddev->ro
6381		 * does not matter, no writes are coming
6382		 */
6383		if (ro)
6384			goto unlock;
6385
6386		/* are we are already prepared for writes? */
6387		if (mddev->ro != 1)
6388			goto unlock;
6389
6390		/* transitioning to readauto need only happen for
6391		 * arrays that call md_write_start
6392		 */
6393		if (mddev->pers) {
6394			err = restart_array(mddev);
6395			if (err == 0) {
6396				mddev->ro = 2;
6397				set_disk_ro(mddev->gendisk, 0);
6398			}
6399		}
6400		goto unlock;
6401	}
6402
6403	/*
6404	 * The remaining ioctls are changing the state of the
6405	 * superblock, so we do not allow them on read-only arrays.
6406	 */
6407	if (mddev->ro && mddev->pers) {
6408		if (mddev->ro == 2) {
6409			mddev->ro = 0;
6410			sysfs_notify_dirent_safe(mddev->sysfs_state);
6411			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6412			/* mddev_unlock will wake thread */
6413			/* If a device failed while we were read-only, we
6414			 * need to make sure the metadata is updated now.
6415			 */
6416			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6417				mddev_unlock(mddev);
6418				wait_event(mddev->sb_wait,
6419					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6420					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6421				mddev_lock_nointr(mddev);
6422			}
6423		} else {
6424			err = -EROFS;
6425			goto unlock;
6426		}
6427	}
6428
6429	switch (cmd) {
6430	case ADD_NEW_DISK:
6431	{
6432		mdu_disk_info_t info;
6433		if (copy_from_user(&info, argp, sizeof(info)))
6434			err = -EFAULT;
6435		else
6436			err = add_new_disk(mddev, &info);
6437		goto unlock;
6438	}
6439
6440	case HOT_ADD_DISK:
6441		err = hot_add_disk(mddev, new_decode_dev(arg));
6442		goto unlock;
6443
6444	case RUN_ARRAY:
6445		err = do_md_run(mddev);
6446		goto unlock;
6447
6448	case SET_BITMAP_FILE:
6449		err = set_bitmap_file(mddev, (int)arg);
6450		goto unlock;
6451
6452	default:
6453		err = -EINVAL;
6454		goto unlock;
6455	}
6456
6457unlock:
6458	if (mddev->hold_active == UNTIL_IOCTL &&
6459	    err != -EINVAL)
6460		mddev->hold_active = 0;
6461	mddev_unlock(mddev);
6462out:
6463	return err;
6464}
6465#ifdef CONFIG_COMPAT
6466static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6467		    unsigned int cmd, unsigned long arg)
6468{
6469	switch (cmd) {
6470	case HOT_REMOVE_DISK:
6471	case HOT_ADD_DISK:
6472	case SET_DISK_FAULTY:
6473	case SET_BITMAP_FILE:
6474		/* These take in integer arg, do not convert */
6475		break;
6476	default:
6477		arg = (unsigned long)compat_ptr(arg);
6478		break;
6479	}
6480
6481	return md_ioctl(bdev, mode, cmd, arg);
6482}
6483#endif /* CONFIG_COMPAT */
6484
6485static int md_open(struct block_device *bdev, fmode_t mode)
6486{
6487	/*
6488	 * Succeed if we can lock the mddev, which confirms that
6489	 * it isn't being stopped right now.
6490	 */
6491	struct mddev *mddev = mddev_find(bdev->bd_dev);
6492	int err;
6493
6494	if (!mddev)
6495		return -ENODEV;
6496
6497	if (mddev->gendisk != bdev->bd_disk) {
6498		/* we are racing with mddev_put which is discarding this
6499		 * bd_disk.
6500		 */
6501		mddev_put(mddev);
6502		/* Wait until bdev->bd_disk is definitely gone */
6503		flush_workqueue(md_misc_wq);
6504		/* Then retry the open from the top */
6505		return -ERESTARTSYS;
6506	}
6507	BUG_ON(mddev != bdev->bd_disk->private_data);
6508
6509	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6510		goto out;
6511
6512	err = 0;
6513	atomic_inc(&mddev->openers);
6514	clear_bit(MD_STILL_CLOSED, &mddev->flags);
6515	mutex_unlock(&mddev->open_mutex);
6516
6517	check_disk_change(bdev);
6518 out:
6519	return err;
6520}
6521
6522static void md_release(struct gendisk *disk, fmode_t mode)
6523{
6524	struct mddev *mddev = disk->private_data;
6525
6526	BUG_ON(!mddev);
6527	atomic_dec(&mddev->openers);
6528	mddev_put(mddev);
6529}
6530
6531static int md_media_changed(struct gendisk *disk)
6532{
6533	struct mddev *mddev = disk->private_data;
6534
6535	return mddev->changed;
6536}
6537
6538static int md_revalidate(struct gendisk *disk)
6539{
6540	struct mddev *mddev = disk->private_data;
6541
6542	mddev->changed = 0;
6543	return 0;
6544}
6545static const struct block_device_operations md_fops =
6546{
6547	.owner		= THIS_MODULE,
6548	.open		= md_open,
6549	.release	= md_release,
6550	.ioctl		= md_ioctl,
6551#ifdef CONFIG_COMPAT
6552	.compat_ioctl	= md_compat_ioctl,
6553#endif
6554	.getgeo		= md_getgeo,
6555	.media_changed  = md_media_changed,
6556	.revalidate_disk= md_revalidate,
6557};
6558
6559static int md_thread(void *arg)
6560{
6561	struct md_thread *thread = arg;
6562
6563	/*
6564	 * md_thread is a 'system-thread', it's priority should be very
6565	 * high. We avoid resource deadlocks individually in each
6566	 * raid personality. (RAID5 does preallocation) We also use RR and
6567	 * the very same RT priority as kswapd, thus we will never get
6568	 * into a priority inversion deadlock.
6569	 *
6570	 * we definitely have to have equal or higher priority than
6571	 * bdflush, otherwise bdflush will deadlock if there are too
6572	 * many dirty RAID5 blocks.
6573	 */
6574
6575	allow_signal(SIGKILL);
6576	while (!kthread_should_stop()) {
6577
6578		/* We need to wait INTERRUPTIBLE so that
6579		 * we don't add to the load-average.
6580		 * That means we need to be sure no signals are
6581		 * pending
6582		 */
6583		if (signal_pending(current))
6584			flush_signals(current);
6585
6586		wait_event_interruptible_timeout
6587			(thread->wqueue,
6588			 test_bit(THREAD_WAKEUP, &thread->flags)
6589			 || kthread_should_stop(),
6590			 thread->timeout);
6591
6592		clear_bit(THREAD_WAKEUP, &thread->flags);
6593		if (!kthread_should_stop())
6594			thread->run(thread);
6595	}
6596
6597	return 0;
6598}
6599
6600void md_wakeup_thread(struct md_thread *thread)
6601{
6602	if (thread) {
6603		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6604		set_bit(THREAD_WAKEUP, &thread->flags);
6605		wake_up(&thread->wqueue);
6606	}
6607}
6608EXPORT_SYMBOL(md_wakeup_thread);
6609
6610struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6611		struct mddev *mddev, const char *name)
6612{
6613	struct md_thread *thread;
6614
6615	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
6616	if (!thread)
6617		return NULL;
6618
6619	init_waitqueue_head(&thread->wqueue);
6620
6621	thread->run = run;
6622	thread->mddev = mddev;
6623	thread->timeout = MAX_SCHEDULE_TIMEOUT;
6624	thread->tsk = kthread_run(md_thread, thread,
6625				  "%s_%s",
6626				  mdname(thread->mddev),
6627				  name);
6628	if (IS_ERR(thread->tsk)) {
6629		kfree(thread);
6630		return NULL;
6631	}
6632	return thread;
6633}
6634EXPORT_SYMBOL(md_register_thread);
6635
6636void md_unregister_thread(struct md_thread **threadp)
6637{
6638	struct md_thread *thread = *threadp;
6639	if (!thread)
6640		return;
6641	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6642	/* Locking ensures that mddev_unlock does not wake_up a
6643	 * non-existent thread
6644	 */
6645	spin_lock(&pers_lock);
6646	*threadp = NULL;
6647	spin_unlock(&pers_lock);
6648
6649	kthread_stop(thread->tsk);
6650	kfree(thread);
6651}
6652EXPORT_SYMBOL(md_unregister_thread);
6653
6654void md_error(struct mddev *mddev, struct md_rdev *rdev)
6655{
6656	if (!rdev || test_bit(Faulty, &rdev->flags))
6657		return;
6658
6659	if (!mddev->pers || !mddev->pers->error_handler)
6660		return;
6661	mddev->pers->error_handler(mddev,rdev);
6662	if (mddev->degraded)
6663		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6664	sysfs_notify_dirent_safe(rdev->sysfs_state);
6665	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6666	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6667	md_wakeup_thread(mddev->thread);
6668	if (mddev->event_work.func)
6669		queue_work(md_misc_wq, &mddev->event_work);
6670	md_new_event_inintr(mddev);
6671}
6672EXPORT_SYMBOL(md_error);
6673
6674/* seq_file implementation /proc/mdstat */
6675
6676static void status_unused(struct seq_file *seq)
6677{
6678	int i = 0;
6679	struct md_rdev *rdev;
6680
6681	seq_printf(seq, "unused devices: ");
6682
6683	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6684		char b[BDEVNAME_SIZE];
6685		i++;
6686		seq_printf(seq, "%s ",
6687			      bdevname(rdev->bdev,b));
6688	}
6689	if (!i)
6690		seq_printf(seq, "<none>");
6691
6692	seq_printf(seq, "\n");
6693}
6694
6695static void status_resync(struct seq_file *seq, struct mddev *mddev)
6696{
6697	sector_t max_sectors, resync, res;
6698	unsigned long dt, db;
6699	sector_t rt;
6700	int scale;
6701	unsigned int per_milli;
6702
6703	if (mddev->curr_resync <= 3)
6704		resync = 0;
6705	else
6706		resync = mddev->curr_resync
6707			- atomic_read(&mddev->recovery_active);
6708
6709	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6710	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6711		max_sectors = mddev->resync_max_sectors;
6712	else
6713		max_sectors = mddev->dev_sectors;
6714
6715	WARN_ON(max_sectors == 0);
6716	/* Pick 'scale' such that (resync>>scale)*1000 will fit
6717	 * in a sector_t, and (max_sectors>>scale) will fit in a
6718	 * u32, as those are the requirements for sector_div.
6719	 * Thus 'scale' must be at least 10
6720	 */
6721	scale = 10;
6722	if (sizeof(sector_t) > sizeof(unsigned long)) {
6723		while ( max_sectors/2 > (1ULL<<(scale+32)))
6724			scale++;
6725	}
6726	res = (resync>>scale)*1000;
6727	sector_div(res, (u32)((max_sectors>>scale)+1));
6728
6729	per_milli = res;
6730	{
6731		int i, x = per_milli/50, y = 20-x;
6732		seq_printf(seq, "[");
6733		for (i = 0; i < x; i++)
6734			seq_printf(seq, "=");
6735		seq_printf(seq, ">");
6736		for (i = 0; i < y; i++)
6737			seq_printf(seq, ".");
6738		seq_printf(seq, "] ");
6739	}
6740	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6741		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6742		    "reshape" :
6743		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6744		     "check" :
6745		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6746		      "resync" : "recovery"))),
6747		   per_milli/10, per_milli % 10,
6748		   (unsigned long long) resync/2,
6749		   (unsigned long long) max_sectors/2);
6750
6751	/*
6752	 * dt: time from mark until now
6753	 * db: blocks written from mark until now
6754	 * rt: remaining time
6755	 *
6756	 * rt is a sector_t, so could be 32bit or 64bit.
6757	 * So we divide before multiply in case it is 32bit and close
6758	 * to the limit.
6759	 * We scale the divisor (db) by 32 to avoid losing precision
6760	 * near the end of resync when the number of remaining sectors
6761	 * is close to 'db'.
6762	 * We then divide rt by 32 after multiplying by db to compensate.
6763	 * The '+1' avoids division by zero if db is very small.
6764	 */
6765	dt = ((jiffies - mddev->resync_mark) / HZ);
6766	if (!dt) dt++;
6767	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6768		- mddev->resync_mark_cnt;
6769
6770	rt = max_sectors - resync;    /* number of remaining sectors */
6771	sector_div(rt, db/32+1);
6772	rt *= dt;
6773	rt >>= 5;
6774
6775	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6776		   ((unsigned long)rt % 60)/6);
6777
6778	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6779}
6780
6781static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6782{
6783	struct list_head *tmp;
6784	loff_t l = *pos;
6785	struct mddev *mddev;
6786
6787	if (l >= 0x10000)
6788		return NULL;
6789	if (!l--)
6790		/* header */
6791		return (void*)1;
6792
6793	spin_lock(&all_mddevs_lock);
6794	list_for_each(tmp,&all_mddevs)
6795		if (!l--) {
6796			mddev = list_entry(tmp, struct mddev, all_mddevs);
6797			mddev_get(mddev);
6798			spin_unlock(&all_mddevs_lock);
6799			return mddev;
6800		}
6801	spin_unlock(&all_mddevs_lock);
6802	if (!l--)
6803		return (void*)2;/* tail */
6804	return NULL;
6805}
6806
6807static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6808{
6809	struct list_head *tmp;
6810	struct mddev *next_mddev, *mddev = v;
6811
6812	++*pos;
6813	if (v == (void*)2)
6814		return NULL;
6815
6816	spin_lock(&all_mddevs_lock);
6817	if (v == (void*)1)
6818		tmp = all_mddevs.next;
6819	else
6820		tmp = mddev->all_mddevs.next;
6821	if (tmp != &all_mddevs)
6822		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
6823	else {
6824		next_mddev = (void*)2;
6825		*pos = 0x10000;
6826	}
6827	spin_unlock(&all_mddevs_lock);
6828
6829	if (v != (void*)1)
6830		mddev_put(mddev);
6831	return next_mddev;
6832
6833}
6834
6835static void md_seq_stop(struct seq_file *seq, void *v)
6836{
6837	struct mddev *mddev = v;
6838
6839	if (mddev && v != (void*)1 && v != (void*)2)
6840		mddev_put(mddev);
6841}
6842
6843static int md_seq_show(struct seq_file *seq, void *v)
6844{
6845	struct mddev *mddev = v;
6846	sector_t sectors;
6847	struct md_rdev *rdev;
6848
6849	if (v == (void*)1) {
6850		struct md_personality *pers;
6851		seq_printf(seq, "Personalities : ");
6852		spin_lock(&pers_lock);
6853		list_for_each_entry(pers, &pers_list, list)
6854			seq_printf(seq, "[%s] ", pers->name);
6855
6856		spin_unlock(&pers_lock);
6857		seq_printf(seq, "\n");
6858		seq->poll_event = atomic_read(&md_event_count);
6859		return 0;
6860	}
6861	if (v == (void*)2) {
6862		status_unused(seq);
6863		return 0;
6864	}
6865
6866	if (mddev_lock(mddev) < 0)
6867		return -EINTR;
6868
6869	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6870		seq_printf(seq, "%s : %sactive", mdname(mddev),
6871						mddev->pers ? "" : "in");
6872		if (mddev->pers) {
6873			if (mddev->ro==1)
6874				seq_printf(seq, " (read-only)");
6875			if (mddev->ro==2)
6876				seq_printf(seq, " (auto-read-only)");
6877			seq_printf(seq, " %s", mddev->pers->name);
6878		}
6879
6880		sectors = 0;
6881		rdev_for_each(rdev, mddev) {
6882			char b[BDEVNAME_SIZE];
6883			seq_printf(seq, " %s[%d]",
6884				bdevname(rdev->bdev,b), rdev->desc_nr);
6885			if (test_bit(WriteMostly, &rdev->flags))
6886				seq_printf(seq, "(W)");
6887			if (test_bit(Faulty, &rdev->flags)) {
6888				seq_printf(seq, "(F)");
6889				continue;
6890			}
6891			if (rdev->raid_disk < 0)
6892				seq_printf(seq, "(S)"); /* spare */
6893			if (test_bit(Replacement, &rdev->flags))
6894				seq_printf(seq, "(R)");
6895			sectors += rdev->sectors;
6896		}
6897
6898		if (!list_empty(&mddev->disks)) {
6899			if (mddev->pers)
6900				seq_printf(seq, "\n      %llu blocks",
6901					   (unsigned long long)
6902					   mddev->array_sectors / 2);
6903			else
6904				seq_printf(seq, "\n      %llu blocks",
6905					   (unsigned long long)sectors / 2);
6906		}
6907		if (mddev->persistent) {
6908			if (mddev->major_version != 0 ||
6909			    mddev->minor_version != 90) {
6910				seq_printf(seq," super %d.%d",
6911					   mddev->major_version,
6912					   mddev->minor_version);
6913			}
6914		} else if (mddev->external)
6915			seq_printf(seq, " super external:%s",
6916				   mddev->metadata_type);
6917		else
6918			seq_printf(seq, " super non-persistent");
6919
6920		if (mddev->pers) {
6921			mddev->pers->status(seq, mddev);
6922			seq_printf(seq, "\n      ");
6923			if (mddev->pers->sync_request) {
6924				if (mddev->curr_resync > 2) {
6925					status_resync(seq, mddev);
6926					seq_printf(seq, "\n      ");
6927				} else if (mddev->curr_resync >= 1)
6928					seq_printf(seq, "\tresync=DELAYED\n      ");
6929				else if (mddev->recovery_cp < MaxSector)
6930					seq_printf(seq, "\tresync=PENDING\n      ");
6931			}
6932		} else
6933			seq_printf(seq, "\n       ");
6934
6935		bitmap_status(seq, mddev->bitmap);
6936
6937		seq_printf(seq, "\n");
6938	}
6939	mddev_unlock(mddev);
6940
6941	return 0;
6942}
6943
6944static const struct seq_operations md_seq_ops = {
6945	.start  = md_seq_start,
6946	.next   = md_seq_next,
6947	.stop   = md_seq_stop,
6948	.show   = md_seq_show,
6949};
6950
6951static int md_seq_open(struct inode *inode, struct file *file)
6952{
6953	struct seq_file *seq;
6954	int error;
6955
6956	error = seq_open(file, &md_seq_ops);
6957	if (error)
6958		return error;
6959
6960	seq = file->private_data;
6961	seq->poll_event = atomic_read(&md_event_count);
6962	return error;
6963}
6964
6965static int md_unloading;
6966static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6967{
6968	struct seq_file *seq = filp->private_data;
6969	int mask;
6970
6971	if (md_unloading)
6972		return POLLIN|POLLRDNORM|POLLERR|POLLPRI;;
6973	poll_wait(filp, &md_event_waiters, wait);
6974
6975	/* always allow read */
6976	mask = POLLIN | POLLRDNORM;
6977
6978	if (seq->poll_event != atomic_read(&md_event_count))
6979		mask |= POLLERR | POLLPRI;
6980	return mask;
6981}
6982
6983static const struct file_operations md_seq_fops = {
6984	.owner		= THIS_MODULE,
6985	.open           = md_seq_open,
6986	.read           = seq_read,
6987	.llseek         = seq_lseek,
6988	.release	= seq_release_private,
6989	.poll		= mdstat_poll,
6990};
6991
6992int register_md_personality(struct md_personality *p)
6993{
6994	printk(KERN_INFO "md: %s personality registered for level %d\n",
6995						p->name, p->level);
6996	spin_lock(&pers_lock);
6997	list_add_tail(&p->list, &pers_list);
6998	spin_unlock(&pers_lock);
6999	return 0;
7000}
7001EXPORT_SYMBOL(register_md_personality);
7002
7003int unregister_md_personality(struct md_personality *p)
7004{
7005	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7006	spin_lock(&pers_lock);
7007	list_del_init(&p->list);
7008	spin_unlock(&pers_lock);
7009	return 0;
7010}
7011EXPORT_SYMBOL(unregister_md_personality);
7012
7013static int is_mddev_idle(struct mddev *mddev, int init)
7014{
7015	struct md_rdev *rdev;
7016	int idle;
7017	int curr_events;
7018
7019	idle = 1;
7020	rcu_read_lock();
7021	rdev_for_each_rcu(rdev, mddev) {
7022		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7023		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7024			      (int)part_stat_read(&disk->part0, sectors[1]) -
7025			      atomic_read(&disk->sync_io);
7026		/* sync IO will cause sync_io to increase before the disk_stats
7027		 * as sync_io is counted when a request starts, and
7028		 * disk_stats is counted when it completes.
7029		 * So resync activity will cause curr_events to be smaller than
7030		 * when there was no such activity.
7031		 * non-sync IO will cause disk_stat to increase without
7032		 * increasing sync_io so curr_events will (eventually)
7033		 * be larger than it was before.  Once it becomes
7034		 * substantially larger, the test below will cause
7035		 * the array to appear non-idle, and resync will slow
7036		 * down.
7037		 * If there is a lot of outstanding resync activity when
7038		 * we set last_event to curr_events, then all that activity
7039		 * completing might cause the array to appear non-idle
7040		 * and resync will be slowed down even though there might
7041		 * not have been non-resync activity.  This will only
7042		 * happen once though.  'last_events' will soon reflect
7043		 * the state where there is little or no outstanding
7044		 * resync requests, and further resync activity will
7045		 * always make curr_events less than last_events.
7046		 *
7047		 */
7048		if (init || curr_events - rdev->last_events > 64) {
7049			rdev->last_events = curr_events;
7050			idle = 0;
7051		}
7052	}
7053	rcu_read_unlock();
7054	return idle;
7055}
7056
7057void md_done_sync(struct mddev *mddev, int blocks, int ok)
7058{
7059	/* another "blocks" (512byte) blocks have been synced */
7060	atomic_sub(blocks, &mddev->recovery_active);
7061	wake_up(&mddev->recovery_wait);
7062	if (!ok) {
7063		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7064		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7065		md_wakeup_thread(mddev->thread);
7066		// stop recovery, signal do_sync ....
7067	}
7068}
7069EXPORT_SYMBOL(md_done_sync);
7070
7071/* md_write_start(mddev, bi)
7072 * If we need to update some array metadata (e.g. 'active' flag
7073 * in superblock) before writing, schedule a superblock update
7074 * and wait for it to complete.
7075 */
7076void md_write_start(struct mddev *mddev, struct bio *bi)
7077{
7078	int did_change = 0;
7079	if (bio_data_dir(bi) != WRITE)
7080		return;
7081
7082	BUG_ON(mddev->ro == 1);
7083	if (mddev->ro == 2) {
7084		/* need to switch to read/write */
7085		mddev->ro = 0;
7086		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7087		md_wakeup_thread(mddev->thread);
7088		md_wakeup_thread(mddev->sync_thread);
7089		did_change = 1;
7090	}
7091	atomic_inc(&mddev->writes_pending);
7092	if (mddev->safemode == 1)
7093		mddev->safemode = 0;
7094	if (mddev->in_sync) {
7095		spin_lock_irq(&mddev->write_lock);
7096		if (mddev->in_sync) {
7097			mddev->in_sync = 0;
7098			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7099			set_bit(MD_CHANGE_PENDING, &mddev->flags);
7100			md_wakeup_thread(mddev->thread);
7101			did_change = 1;
7102		}
7103		spin_unlock_irq(&mddev->write_lock);
7104	}
7105	if (did_change)
7106		sysfs_notify_dirent_safe(mddev->sysfs_state);
7107	wait_event(mddev->sb_wait,
7108		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7109}
7110EXPORT_SYMBOL(md_write_start);
7111
7112void md_write_end(struct mddev *mddev)
7113{
7114	if (atomic_dec_and_test(&mddev->writes_pending)) {
7115		if (mddev->safemode == 2)
7116			md_wakeup_thread(mddev->thread);
7117		else if (mddev->safemode_delay)
7118			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7119	}
7120}
7121EXPORT_SYMBOL(md_write_end);
7122
7123/* md_allow_write(mddev)
7124 * Calling this ensures that the array is marked 'active' so that writes
7125 * may proceed without blocking.  It is important to call this before
7126 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7127 * Must be called with mddev_lock held.
7128 *
7129 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7130 * is dropped, so return -EAGAIN after notifying userspace.
7131 */
7132int md_allow_write(struct mddev *mddev)
7133{
7134	if (!mddev->pers)
7135		return 0;
7136	if (mddev->ro)
7137		return 0;
7138	if (!mddev->pers->sync_request)
7139		return 0;
7140
7141	spin_lock_irq(&mddev->write_lock);
7142	if (mddev->in_sync) {
7143		mddev->in_sync = 0;
7144		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7145		set_bit(MD_CHANGE_PENDING, &mddev->flags);
7146		if (mddev->safemode_delay &&
7147		    mddev->safemode == 0)
7148			mddev->safemode = 1;
7149		spin_unlock_irq(&mddev->write_lock);
7150		md_update_sb(mddev, 0);
7151		sysfs_notify_dirent_safe(mddev->sysfs_state);
7152	} else
7153		spin_unlock_irq(&mddev->write_lock);
7154
7155	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7156		return -EAGAIN;
7157	else
7158		return 0;
7159}
7160EXPORT_SYMBOL_GPL(md_allow_write);
7161
7162#define SYNC_MARKS	10
7163#define	SYNC_MARK_STEP	(3*HZ)
7164#define UPDATE_FREQUENCY (5*60*HZ)
7165void md_do_sync(struct md_thread *thread)
7166{
7167	struct mddev *mddev = thread->mddev;
7168	struct mddev *mddev2;
7169	unsigned int currspeed = 0,
7170		 window;
7171	sector_t max_sectors,j, io_sectors, recovery_done;
7172	unsigned long mark[SYNC_MARKS];
7173	unsigned long update_time;
7174	sector_t mark_cnt[SYNC_MARKS];
7175	int last_mark,m;
7176	struct list_head *tmp;
7177	sector_t last_check;
7178	int skipped = 0;
7179	struct md_rdev *rdev;
7180	char *desc, *action = NULL;
7181	struct blk_plug plug;
7182
7183	/* just incase thread restarts... */
7184	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7185		return;
7186	if (mddev->ro) {/* never try to sync a read-only array */
7187		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7188		return;
7189	}
7190
7191	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7192		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7193			desc = "data-check";
7194			action = "check";
7195		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7196			desc = "requested-resync";
7197			action = "repair";
7198		} else
7199			desc = "resync";
7200	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7201		desc = "reshape";
7202	else
7203		desc = "recovery";
7204
7205	mddev->last_sync_action = action ?: desc;
7206
7207	/* we overload curr_resync somewhat here.
7208	 * 0 == not engaged in resync at all
7209	 * 2 == checking that there is no conflict with another sync
7210	 * 1 == like 2, but have yielded to allow conflicting resync to
7211	 *		commense
7212	 * other == active in resync - this many blocks
7213	 *
7214	 * Before starting a resync we must have set curr_resync to
7215	 * 2, and then checked that every "conflicting" array has curr_resync
7216	 * less than ours.  When we find one that is the same or higher
7217	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
7218	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7219	 * This will mean we have to start checking from the beginning again.
7220	 *
7221	 */
7222
7223	do {
7224		mddev->curr_resync = 2;
7225
7226	try_again:
7227		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7228			goto skip;
7229		for_each_mddev(mddev2, tmp) {
7230			if (mddev2 == mddev)
7231				continue;
7232			if (!mddev->parallel_resync
7233			&&  mddev2->curr_resync
7234			&&  match_mddev_units(mddev, mddev2)) {
7235				DEFINE_WAIT(wq);
7236				if (mddev < mddev2 && mddev->curr_resync == 2) {
7237					/* arbitrarily yield */
7238					mddev->curr_resync = 1;
7239					wake_up(&resync_wait);
7240				}
7241				if (mddev > mddev2 && mddev->curr_resync == 1)
7242					/* no need to wait here, we can wait the next
7243					 * time 'round when curr_resync == 2
7244					 */
7245					continue;
7246				/* We need to wait 'interruptible' so as not to
7247				 * contribute to the load average, and not to
7248				 * be caught by 'softlockup'
7249				 */
7250				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7251				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7252				    mddev2->curr_resync >= mddev->curr_resync) {
7253					printk(KERN_INFO "md: delaying %s of %s"
7254					       " until %s has finished (they"
7255					       " share one or more physical units)\n",
7256					       desc, mdname(mddev), mdname(mddev2));
7257					mddev_put(mddev2);
7258					if (signal_pending(current))
7259						flush_signals(current);
7260					schedule();
7261					finish_wait(&resync_wait, &wq);
7262					goto try_again;
7263				}
7264				finish_wait(&resync_wait, &wq);
7265			}
7266		}
7267	} while (mddev->curr_resync < 2);
7268
7269	j = 0;
7270	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7271		/* resync follows the size requested by the personality,
7272		 * which defaults to physical size, but can be virtual size
7273		 */
7274		max_sectors = mddev->resync_max_sectors;
7275		atomic64_set(&mddev->resync_mismatches, 0);
7276		/* we don't use the checkpoint if there's a bitmap */
7277		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7278			j = mddev->resync_min;
7279		else if (!mddev->bitmap)
7280			j = mddev->recovery_cp;
7281
7282	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7283		max_sectors = mddev->resync_max_sectors;
7284	else {
7285		/* recovery follows the physical size of devices */
7286		max_sectors = mddev->dev_sectors;
7287		j = MaxSector;
7288		rcu_read_lock();
7289		rdev_for_each_rcu(rdev, mddev)
7290			if (rdev->raid_disk >= 0 &&
7291			    !test_bit(Faulty, &rdev->flags) &&
7292			    !test_bit(In_sync, &rdev->flags) &&
7293			    rdev->recovery_offset < j)
7294				j = rdev->recovery_offset;
7295		rcu_read_unlock();
7296
7297		/* If there is a bitmap, we need to make sure all
7298		 * writes that started before we added a spare
7299		 * complete before we start doing a recovery.
7300		 * Otherwise the write might complete and (via
7301		 * bitmap_endwrite) set a bit in the bitmap after the
7302		 * recovery has checked that bit and skipped that
7303		 * region.
7304		 */
7305		if (mddev->bitmap) {
7306			mddev->pers->quiesce(mddev, 1);
7307			mddev->pers->quiesce(mddev, 0);
7308		}
7309	}
7310
7311	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7312	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
7313		" %d KB/sec/disk.\n", speed_min(mddev));
7314	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7315	       "(but not more than %d KB/sec) for %s.\n",
7316	       speed_max(mddev), desc);
7317
7318	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7319
7320	io_sectors = 0;
7321	for (m = 0; m < SYNC_MARKS; m++) {
7322		mark[m] = jiffies;
7323		mark_cnt[m] = io_sectors;
7324	}
7325	last_mark = 0;
7326	mddev->resync_mark = mark[last_mark];
7327	mddev->resync_mark_cnt = mark_cnt[last_mark];
7328
7329	/*
7330	 * Tune reconstruction:
7331	 */
7332	window = 32*(PAGE_SIZE/512);
7333	printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7334		window/2, (unsigned long long)max_sectors/2);
7335
7336	atomic_set(&mddev->recovery_active, 0);
7337	last_check = 0;
7338
7339	if (j>2) {
7340		printk(KERN_INFO
7341		       "md: resuming %s of %s from checkpoint.\n",
7342		       desc, mdname(mddev));
7343		mddev->curr_resync = j;
7344	} else
7345		mddev->curr_resync = 3; /* no longer delayed */
7346	mddev->curr_resync_completed = j;
7347	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7348	md_new_event(mddev);
7349	update_time = jiffies;
7350
7351	blk_start_plug(&plug);
7352	while (j < max_sectors) {
7353		sector_t sectors;
7354
7355		skipped = 0;
7356
7357		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7358		    ((mddev->curr_resync > mddev->curr_resync_completed &&
7359		      (mddev->curr_resync - mddev->curr_resync_completed)
7360		      > (max_sectors >> 4)) ||
7361		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7362		     (j - mddev->curr_resync_completed)*2
7363		     >= mddev->resync_max - mddev->curr_resync_completed
7364			    )) {
7365			/* time to update curr_resync_completed */
7366			wait_event(mddev->recovery_wait,
7367				   atomic_read(&mddev->recovery_active) == 0);
7368			mddev->curr_resync_completed = j;
7369			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7370			    j > mddev->recovery_cp)
7371				mddev->recovery_cp = j;
7372			update_time = jiffies;
7373			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7374			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7375		}
7376
7377		while (j >= mddev->resync_max &&
7378		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7379			/* As this condition is controlled by user-space,
7380			 * we can block indefinitely, so use '_interruptible'
7381			 * to avoid triggering warnings.
7382			 */
7383			flush_signals(current); /* just in case */
7384			wait_event_interruptible(mddev->recovery_wait,
7385						 mddev->resync_max > j
7386						 || test_bit(MD_RECOVERY_INTR,
7387							     &mddev->recovery));
7388		}
7389
7390		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7391			break;
7392
7393		sectors = mddev->pers->sync_request(mddev, j, &skipped,
7394						  currspeed < speed_min(mddev));
7395		if (sectors == 0) {
7396			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7397			break;
7398		}
7399
7400		if (!skipped) { /* actual IO requested */
7401			io_sectors += sectors;
7402			atomic_add(sectors, &mddev->recovery_active);
7403		}
7404
7405		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7406			break;
7407
7408		j += sectors;
7409		if (j > 2)
7410			mddev->curr_resync = j;
7411		mddev->curr_mark_cnt = io_sectors;
7412		if (last_check == 0)
7413			/* this is the earliest that rebuild will be
7414			 * visible in /proc/mdstat
7415			 */
7416			md_new_event(mddev);
7417
7418		if (last_check + window > io_sectors || j == max_sectors)
7419			continue;
7420
7421		last_check = io_sectors;
7422	repeat:
7423		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7424			/* step marks */
7425			int next = (last_mark+1) % SYNC_MARKS;
7426
7427			mddev->resync_mark = mark[next];
7428			mddev->resync_mark_cnt = mark_cnt[next];
7429			mark[next] = jiffies;
7430			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7431			last_mark = next;
7432		}
7433
7434		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7435			break;
7436
7437		/*
7438		 * this loop exits only if either when we are slower than
7439		 * the 'hard' speed limit, or the system was IO-idle for
7440		 * a jiffy.
7441		 * the system might be non-idle CPU-wise, but we only care
7442		 * about not overloading the IO subsystem. (things like an
7443		 * e2fsck being done on the RAID array should execute fast)
7444		 */
7445		cond_resched();
7446
7447		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
7448		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
7449			/((jiffies-mddev->resync_mark)/HZ +1) +1;
7450
7451		if (currspeed > speed_min(mddev)) {
7452			if ((currspeed > speed_max(mddev)) ||
7453					!is_mddev_idle(mddev, 0)) {
7454				msleep(500);
7455				goto repeat;
7456			}
7457		}
7458	}
7459	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7460	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7461	       ? "interrupted" : "done");
7462	/*
7463	 * this also signals 'finished resyncing' to md_stop
7464	 */
7465	blk_finish_plug(&plug);
7466	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7467
7468	/* tell personality that we are finished */
7469	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7470
7471	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7472	    mddev->curr_resync > 2) {
7473		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7474			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7475				if (mddev->curr_resync >= mddev->recovery_cp) {
7476					printk(KERN_INFO
7477					       "md: checkpointing %s of %s.\n",
7478					       desc, mdname(mddev));
7479					if (test_bit(MD_RECOVERY_ERROR,
7480						&mddev->recovery))
7481						mddev->recovery_cp =
7482							mddev->curr_resync_completed;
7483					else
7484						mddev->recovery_cp =
7485							mddev->curr_resync;
7486				}
7487			} else
7488				mddev->recovery_cp = MaxSector;
7489		} else {
7490			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7491				mddev->curr_resync = MaxSector;
7492			rcu_read_lock();
7493			rdev_for_each_rcu(rdev, mddev)
7494				if (rdev->raid_disk >= 0 &&
7495				    mddev->delta_disks >= 0 &&
7496				    !test_bit(Faulty, &rdev->flags) &&
7497				    !test_bit(In_sync, &rdev->flags) &&
7498				    rdev->recovery_offset < mddev->curr_resync)
7499					rdev->recovery_offset = mddev->curr_resync;
7500			rcu_read_unlock();
7501		}
7502	}
7503 skip:
7504	set_bit(MD_CHANGE_DEVS, &mddev->flags);
7505
7506	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7507		/* We completed so min/max setting can be forgotten if used. */
7508		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7509			mddev->resync_min = 0;
7510		mddev->resync_max = MaxSector;
7511	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7512		mddev->resync_min = mddev->curr_resync_completed;
7513	mddev->curr_resync = 0;
7514	wake_up(&resync_wait);
7515	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7516	md_wakeup_thread(mddev->thread);
7517	return;
7518}
7519EXPORT_SYMBOL_GPL(md_do_sync);
7520
7521static int remove_and_add_spares(struct mddev *mddev,
7522				 struct md_rdev *this)
7523{
7524	struct md_rdev *rdev;
7525	int spares = 0;
7526	int removed = 0;
7527
7528	rdev_for_each(rdev, mddev)
7529		if ((this == NULL || rdev == this) &&
7530		    rdev->raid_disk >= 0 &&
7531		    !test_bit(Blocked, &rdev->flags) &&
7532		    (test_bit(Faulty, &rdev->flags) ||
7533		     ! test_bit(In_sync, &rdev->flags)) &&
7534		    atomic_read(&rdev->nr_pending)==0) {
7535			if (mddev->pers->hot_remove_disk(
7536				    mddev, rdev) == 0) {
7537				sysfs_unlink_rdev(mddev, rdev);
7538				rdev->raid_disk = -1;
7539				removed++;
7540			}
7541		}
7542	if (removed && mddev->kobj.sd)
7543		sysfs_notify(&mddev->kobj, NULL, "degraded");
7544
7545	if (this)
7546		goto no_add;
7547
7548	rdev_for_each(rdev, mddev) {
7549		if (rdev->raid_disk >= 0 &&
7550		    !test_bit(In_sync, &rdev->flags) &&
7551		    !test_bit(Faulty, &rdev->flags))
7552			spares++;
7553		if (rdev->raid_disk >= 0)
7554			continue;
7555		if (test_bit(Faulty, &rdev->flags))
7556			continue;
7557		if (mddev->ro &&
7558		    ! (rdev->saved_raid_disk >= 0 &&
7559		       !test_bit(Bitmap_sync, &rdev->flags)))
7560			continue;
7561
7562		if (rdev->saved_raid_disk < 0)
7563			rdev->recovery_offset = 0;
7564		if (mddev->pers->
7565		    hot_add_disk(mddev, rdev) == 0) {
7566			if (sysfs_link_rdev(mddev, rdev))
7567				/* failure here is OK */;
7568			spares++;
7569			md_new_event(mddev);
7570			set_bit(MD_CHANGE_DEVS, &mddev->flags);
7571		}
7572	}
7573no_add:
7574	if (removed)
7575		set_bit(MD_CHANGE_DEVS, &mddev->flags);
7576	return spares;
7577}
7578
7579static void md_start_sync(struct work_struct *ws)
7580{
7581	struct mddev *mddev = container_of(ws, struct mddev, del_work);
7582
7583	mddev->sync_thread = md_register_thread(md_do_sync,
7584						mddev,
7585						"resync");
7586	if (!mddev->sync_thread) {
7587		printk(KERN_ERR "%s: could not start resync"
7588		       " thread...\n",
7589		       mdname(mddev));
7590		/* leave the spares where they are, it shouldn't hurt */
7591		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7592		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7593		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7594		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7595		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7596		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7597				       &mddev->recovery))
7598			if (mddev->sysfs_action)
7599				sysfs_notify_dirent_safe(mddev->sysfs_action);
7600	} else
7601		md_wakeup_thread(mddev->sync_thread);
7602	sysfs_notify_dirent_safe(mddev->sysfs_action);
7603	md_new_event(mddev);
7604}
7605
7606/*
7607 * This routine is regularly called by all per-raid-array threads to
7608 * deal with generic issues like resync and super-block update.
7609 * Raid personalities that don't have a thread (linear/raid0) do not
7610 * need this as they never do any recovery or update the superblock.
7611 *
7612 * It does not do any resync itself, but rather "forks" off other threads
7613 * to do that as needed.
7614 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7615 * "->recovery" and create a thread at ->sync_thread.
7616 * When the thread finishes it sets MD_RECOVERY_DONE
7617 * and wakeups up this thread which will reap the thread and finish up.
7618 * This thread also removes any faulty devices (with nr_pending == 0).
7619 *
7620 * The overall approach is:
7621 *  1/ if the superblock needs updating, update it.
7622 *  2/ If a recovery thread is running, don't do anything else.
7623 *  3/ If recovery has finished, clean up, possibly marking spares active.
7624 *  4/ If there are any faulty devices, remove them.
7625 *  5/ If array is degraded, try to add spares devices
7626 *  6/ If array has spares or is not in-sync, start a resync thread.
7627 */
7628void md_check_recovery(struct mddev *mddev)
7629{
7630	if (mddev->suspended)
7631		return;
7632
7633	if (mddev->bitmap)
7634		bitmap_daemon_work(mddev);
7635
7636	if (signal_pending(current)) {
7637		if (mddev->pers->sync_request && !mddev->external) {
7638			printk(KERN_INFO "md: %s in immediate safe mode\n",
7639			       mdname(mddev));
7640			mddev->safemode = 2;
7641		}
7642		flush_signals(current);
7643	}
7644
7645	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7646		return;
7647	if ( ! (
7648		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
7649		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7650		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7651		(mddev->external == 0 && mddev->safemode == 1) ||
7652		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7653		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7654		))
7655		return;
7656
7657	if (mddev_trylock(mddev)) {
7658		int spares = 0;
7659
7660		if (mddev->ro) {
7661			/* On a read-only array we can:
7662			 * - remove failed devices
7663			 * - add already-in_sync devices if the array itself
7664			 *   is in-sync.
7665			 * As we only add devices that are already in-sync,
7666			 * we can activate the spares immediately.
7667			 */
7668			remove_and_add_spares(mddev, NULL);
7669			/* There is no thread, but we need to call
7670			 * ->spare_active and clear saved_raid_disk
7671			 */
7672			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7673			md_reap_sync_thread(mddev);
7674			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7675			goto unlock;
7676		}
7677
7678		if (!mddev->external) {
7679			int did_change = 0;
7680			spin_lock_irq(&mddev->write_lock);
7681			if (mddev->safemode &&
7682			    !atomic_read(&mddev->writes_pending) &&
7683			    !mddev->in_sync &&
7684			    mddev->recovery_cp == MaxSector) {
7685				mddev->in_sync = 1;
7686				did_change = 1;
7687				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7688			}
7689			if (mddev->safemode == 1)
7690				mddev->safemode = 0;
7691			spin_unlock_irq(&mddev->write_lock);
7692			if (did_change)
7693				sysfs_notify_dirent_safe(mddev->sysfs_state);
7694		}
7695
7696		if (mddev->flags & MD_UPDATE_SB_FLAGS)
7697			md_update_sb(mddev, 0);
7698
7699		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7700		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7701			/* resync/recovery still happening */
7702			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7703			goto unlock;
7704		}
7705		if (mddev->sync_thread) {
7706			md_reap_sync_thread(mddev);
7707			goto unlock;
7708		}
7709		/* Set RUNNING before clearing NEEDED to avoid
7710		 * any transients in the value of "sync_action".
7711		 */
7712		mddev->curr_resync_completed = 0;
7713		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7714		/* Clear some bits that don't mean anything, but
7715		 * might be left set
7716		 */
7717		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7718		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7719
7720		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7721		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7722			goto not_running;
7723		/* no recovery is running.
7724		 * remove any failed drives, then
7725		 * add spares if possible.
7726		 * Spares are also removed and re-added, to allow
7727		 * the personality to fail the re-add.
7728		 */
7729
7730		if (mddev->reshape_position != MaxSector) {
7731			if (mddev->pers->check_reshape == NULL ||
7732			    mddev->pers->check_reshape(mddev) != 0)
7733				/* Cannot proceed */
7734				goto not_running;
7735			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7736			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7737		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
7738			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7739			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7740			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7741			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7742		} else if (mddev->recovery_cp < MaxSector) {
7743			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7744			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7745		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7746			/* nothing to be done ... */
7747			goto not_running;
7748
7749		if (mddev->pers->sync_request) {
7750			if (spares) {
7751				/* We are adding a device or devices to an array
7752				 * which has the bitmap stored on all devices.
7753				 * So make sure all bitmap pages get written
7754				 */
7755				bitmap_write_all(mddev->bitmap);
7756			}
7757			INIT_WORK(&mddev->del_work, md_start_sync);
7758			queue_work(md_misc_wq, &mddev->del_work);
7759			goto unlock;
7760		}
7761	not_running:
7762		if (!mddev->sync_thread) {
7763			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7764			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7765					       &mddev->recovery))
7766				if (mddev->sysfs_action)
7767					sysfs_notify_dirent_safe(mddev->sysfs_action);
7768		}
7769	unlock:
7770		wake_up(&mddev->sb_wait);
7771		mddev_unlock(mddev);
7772	}
7773}
7774EXPORT_SYMBOL(md_check_recovery);
7775
7776void md_reap_sync_thread(struct mddev *mddev)
7777{
7778	struct md_rdev *rdev;
7779
7780	/* resync has finished, collect result */
7781	md_unregister_thread(&mddev->sync_thread);
7782	wake_up(&resync_wait);
7783	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7784	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7785		/* success...*/
7786		/* activate any spares */
7787		if (mddev->pers->spare_active(mddev)) {
7788			sysfs_notify(&mddev->kobj, NULL,
7789				     "degraded");
7790			set_bit(MD_CHANGE_DEVS, &mddev->flags);
7791		}
7792	}
7793	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7794	    mddev->pers->finish_reshape)
7795		mddev->pers->finish_reshape(mddev);
7796
7797	/* If array is no-longer degraded, then any saved_raid_disk
7798	 * information must be scrapped.
7799	 */
7800	if (!mddev->degraded)
7801		rdev_for_each(rdev, mddev)
7802			rdev->saved_raid_disk = -1;
7803
7804	md_update_sb(mddev, 1);
7805	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7806	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7807	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7808	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7809	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7810	/* flag recovery needed just to double check */
7811	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7812	sysfs_notify_dirent_safe(mddev->sysfs_action);
7813	md_new_event(mddev);
7814	if (mddev->event_work.func)
7815		queue_work(md_misc_wq, &mddev->event_work);
7816}
7817EXPORT_SYMBOL(md_reap_sync_thread);
7818
7819void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7820{
7821	sysfs_notify_dirent_safe(rdev->sysfs_state);
7822	wait_event_timeout(rdev->blocked_wait,
7823			   !test_bit(Blocked, &rdev->flags) &&
7824			   !test_bit(BlockedBadBlocks, &rdev->flags),
7825			   msecs_to_jiffies(5000));
7826	rdev_dec_pending(rdev, mddev);
7827}
7828EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7829
7830void md_finish_reshape(struct mddev *mddev)
7831{
7832	/* called be personality module when reshape completes. */
7833	struct md_rdev *rdev;
7834
7835	rdev_for_each(rdev, mddev) {
7836		if (rdev->data_offset > rdev->new_data_offset)
7837			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7838		else
7839			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7840		rdev->data_offset = rdev->new_data_offset;
7841	}
7842}
7843EXPORT_SYMBOL(md_finish_reshape);
7844
7845/* Bad block management.
7846 * We can record which blocks on each device are 'bad' and so just
7847 * fail those blocks, or that stripe, rather than the whole device.
7848 * Entries in the bad-block table are 64bits wide.  This comprises:
7849 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7850 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7851 *  A 'shift' can be set so that larger blocks are tracked and
7852 *  consequently larger devices can be covered.
7853 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7854 *
7855 * Locking of the bad-block table uses a seqlock so md_is_badblock
7856 * might need to retry if it is very unlucky.
7857 * We will sometimes want to check for bad blocks in a bi_end_io function,
7858 * so we use the write_seqlock_irq variant.
7859 *
7860 * When looking for a bad block we specify a range and want to
7861 * know if any block in the range is bad.  So we binary-search
7862 * to the last range that starts at-or-before the given endpoint,
7863 * (or "before the sector after the target range")
7864 * then see if it ends after the given start.
7865 * We return
7866 *  0 if there are no known bad blocks in the range
7867 *  1 if there are known bad block which are all acknowledged
7868 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7869 * plus the start/length of the first bad section we overlap.
7870 */
7871int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7872		   sector_t *first_bad, int *bad_sectors)
7873{
7874	int hi;
7875	int lo;
7876	u64 *p = bb->page;
7877	int rv;
7878	sector_t target = s + sectors;
7879	unsigned seq;
7880
7881	if (bb->shift > 0) {
7882		/* round the start down, and the end up */
7883		s >>= bb->shift;
7884		target += (1<<bb->shift) - 1;
7885		target >>= bb->shift;
7886		sectors = target - s;
7887	}
7888	/* 'target' is now the first block after the bad range */
7889
7890retry:
7891	seq = read_seqbegin(&bb->lock);
7892	lo = 0;
7893	rv = 0;
7894	hi = bb->count;
7895
7896	/* Binary search between lo and hi for 'target'
7897	 * i.e. for the last range that starts before 'target'
7898	 */
7899	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7900	 * are known not to be the last range before target.
7901	 * VARIANT: hi-lo is the number of possible
7902	 * ranges, and decreases until it reaches 1
7903	 */
7904	while (hi - lo > 1) {
7905		int mid = (lo + hi) / 2;
7906		sector_t a = BB_OFFSET(p[mid]);
7907		if (a < target)
7908			/* This could still be the one, earlier ranges
7909			 * could not. */
7910			lo = mid;
7911		else
7912			/* This and later ranges are definitely out. */
7913			hi = mid;
7914	}
7915	/* 'lo' might be the last that started before target, but 'hi' isn't */
7916	if (hi > lo) {
7917		/* need to check all range that end after 's' to see if
7918		 * any are unacknowledged.
7919		 */
7920		while (lo >= 0 &&
7921		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7922			if (BB_OFFSET(p[lo]) < target) {
7923				/* starts before the end, and finishes after
7924				 * the start, so they must overlap
7925				 */
7926				if (rv != -1 && BB_ACK(p[lo]))
7927					rv = 1;
7928				else
7929					rv = -1;
7930				*first_bad = BB_OFFSET(p[lo]);
7931				*bad_sectors = BB_LEN(p[lo]);
7932			}
7933			lo--;
7934		}
7935	}
7936
7937	if (read_seqretry(&bb->lock, seq))
7938		goto retry;
7939
7940	return rv;
7941}
7942EXPORT_SYMBOL_GPL(md_is_badblock);
7943
7944/*
7945 * Add a range of bad blocks to the table.
7946 * This might extend the table, or might contract it
7947 * if two adjacent ranges can be merged.
7948 * We binary-search to find the 'insertion' point, then
7949 * decide how best to handle it.
7950 */
7951static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7952			    int acknowledged)
7953{
7954	u64 *p;
7955	int lo, hi;
7956	int rv = 1;
7957	unsigned long flags;
7958
7959	if (bb->shift < 0)
7960		/* badblocks are disabled */
7961		return 0;
7962
7963	if (bb->shift) {
7964		/* round the start down, and the end up */
7965		sector_t next = s + sectors;
7966		s >>= bb->shift;
7967		next += (1<<bb->shift) - 1;
7968		next >>= bb->shift;
7969		sectors = next - s;
7970	}
7971
7972	write_seqlock_irqsave(&bb->lock, flags);
7973
7974	p = bb->page;
7975	lo = 0;
7976	hi = bb->count;
7977	/* Find the last range that starts at-or-before 's' */
7978	while (hi - lo > 1) {
7979		int mid = (lo + hi) / 2;
7980		sector_t a = BB_OFFSET(p[mid]);
7981		if (a <= s)
7982			lo = mid;
7983		else
7984			hi = mid;
7985	}
7986	if (hi > lo && BB_OFFSET(p[lo]) > s)
7987		hi = lo;
7988
7989	if (hi > lo) {
7990		/* we found a range that might merge with the start
7991		 * of our new range
7992		 */
7993		sector_t a = BB_OFFSET(p[lo]);
7994		sector_t e = a + BB_LEN(p[lo]);
7995		int ack = BB_ACK(p[lo]);
7996		if (e >= s) {
7997			/* Yes, we can merge with a previous range */
7998			if (s == a && s + sectors >= e)
7999				/* new range covers old */
8000				ack = acknowledged;
8001			else
8002				ack = ack && acknowledged;
8003
8004			if (e < s + sectors)
8005				e = s + sectors;
8006			if (e - a <= BB_MAX_LEN) {
8007				p[lo] = BB_MAKE(a, e-a, ack);
8008				s = e;
8009			} else {
8010				/* does not all fit in one range,
8011				 * make p[lo] maximal
8012				 */
8013				if (BB_LEN(p[lo]) != BB_MAX_LEN)
8014					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8015				s = a + BB_MAX_LEN;
8016			}
8017			sectors = e - s;
8018		}
8019	}
8020	if (sectors && hi < bb->count) {
8021		/* 'hi' points to the first range that starts after 's'.
8022		 * Maybe we can merge with the start of that range */
8023		sector_t a = BB_OFFSET(p[hi]);
8024		sector_t e = a + BB_LEN(p[hi]);
8025		int ack = BB_ACK(p[hi]);
8026		if (a <= s + sectors) {
8027			/* merging is possible */
8028			if (e <= s + sectors) {
8029				/* full overlap */
8030				e = s + sectors;
8031				ack = acknowledged;
8032			} else
8033				ack = ack && acknowledged;
8034
8035			a = s;
8036			if (e - a <= BB_MAX_LEN) {
8037				p[hi] = BB_MAKE(a, e-a, ack);
8038				s = e;
8039			} else {
8040				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8041				s = a + BB_MAX_LEN;
8042			}
8043			sectors = e - s;
8044			lo = hi;
8045			hi++;
8046		}
8047	}
8048	if (sectors == 0 && hi < bb->count) {
8049		/* we might be able to combine lo and hi */
8050		/* Note: 's' is at the end of 'lo' */
8051		sector_t a = BB_OFFSET(p[hi]);
8052		int lolen = BB_LEN(p[lo]);
8053		int hilen = BB_LEN(p[hi]);
8054		int newlen = lolen + hilen - (s - a);
8055		if (s >= a && newlen < BB_MAX_LEN) {
8056			/* yes, we can combine them */
8057			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8058			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8059			memmove(p + hi, p + hi + 1,
8060				(bb->count - hi - 1) * 8);
8061			bb->count--;
8062		}
8063	}
8064	while (sectors) {
8065		/* didn't merge (it all).
8066		 * Need to add a range just before 'hi' */
8067		if (bb->count >= MD_MAX_BADBLOCKS) {
8068			/* No room for more */
8069			rv = 0;
8070			break;
8071		} else {
8072			int this_sectors = sectors;
8073			memmove(p + hi + 1, p + hi,
8074				(bb->count - hi) * 8);
8075			bb->count++;
8076
8077			if (this_sectors > BB_MAX_LEN)
8078				this_sectors = BB_MAX_LEN;
8079			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8080			sectors -= this_sectors;
8081			s += this_sectors;
8082		}
8083	}
8084
8085	bb->changed = 1;
8086	if (!acknowledged)
8087		bb->unacked_exist = 1;
8088	write_sequnlock_irqrestore(&bb->lock, flags);
8089
8090	return rv;
8091}
8092
8093int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8094		       int is_new)
8095{
8096	int rv;
8097	if (is_new)
8098		s += rdev->new_data_offset;
8099	else
8100		s += rdev->data_offset;
8101	rv = md_set_badblocks(&rdev->badblocks,
8102			      s, sectors, 0);
8103	if (rv) {
8104		/* Make sure they get written out promptly */
8105		sysfs_notify_dirent_safe(rdev->sysfs_state);
8106		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8107		md_wakeup_thread(rdev->mddev->thread);
8108	}
8109	return rv;
8110}
8111EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8112
8113/*
8114 * Remove a range of bad blocks from the table.
8115 * This may involve extending the table if we spilt a region,
8116 * but it must not fail.  So if the table becomes full, we just
8117 * drop the remove request.
8118 */
8119static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8120{
8121	u64 *p;
8122	int lo, hi;
8123	sector_t target = s + sectors;
8124	int rv = 0;
8125
8126	if (bb->shift > 0) {
8127		/* When clearing we round the start up and the end down.
8128		 * This should not matter as the shift should align with
8129		 * the block size and no rounding should ever be needed.
8130		 * However it is better the think a block is bad when it
8131		 * isn't than to think a block is not bad when it is.
8132		 */
8133		s += (1<<bb->shift) - 1;
8134		s >>= bb->shift;
8135		target >>= bb->shift;
8136		sectors = target - s;
8137	}
8138
8139	write_seqlock_irq(&bb->lock);
8140
8141	p = bb->page;
8142	lo = 0;
8143	hi = bb->count;
8144	/* Find the last range that starts before 'target' */
8145	while (hi - lo > 1) {
8146		int mid = (lo + hi) / 2;
8147		sector_t a = BB_OFFSET(p[mid]);
8148		if (a < target)
8149			lo = mid;
8150		else
8151			hi = mid;
8152	}
8153	if (hi > lo) {
8154		/* p[lo] is the last range that could overlap the
8155		 * current range.  Earlier ranges could also overlap,
8156		 * but only this one can overlap the end of the range.
8157		 */
8158		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8159			/* Partial overlap, leave the tail of this range */
8160			int ack = BB_ACK(p[lo]);
8161			sector_t a = BB_OFFSET(p[lo]);
8162			sector_t end = a + BB_LEN(p[lo]);
8163
8164			if (a < s) {
8165				/* we need to split this range */
8166				if (bb->count >= MD_MAX_BADBLOCKS) {
8167					rv = -ENOSPC;
8168					goto out;
8169				}
8170				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8171				bb->count++;
8172				p[lo] = BB_MAKE(a, s-a, ack);
8173				lo++;
8174			}
8175			p[lo] = BB_MAKE(target, end - target, ack);
8176			/* there is no longer an overlap */
8177			hi = lo;
8178			lo--;
8179		}
8180		while (lo >= 0 &&
8181		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8182			/* This range does overlap */
8183			if (BB_OFFSET(p[lo]) < s) {
8184				/* Keep the early parts of this range. */
8185				int ack = BB_ACK(p[lo]);
8186				sector_t start = BB_OFFSET(p[lo]);
8187				p[lo] = BB_MAKE(start, s - start, ack);
8188				/* now low doesn't overlap, so.. */
8189				break;
8190			}
8191			lo--;
8192		}
8193		/* 'lo' is strictly before, 'hi' is strictly after,
8194		 * anything between needs to be discarded
8195		 */
8196		if (hi - lo > 1) {
8197			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8198			bb->count -= (hi - lo - 1);
8199		}
8200	}
8201
8202	bb->changed = 1;
8203out:
8204	write_sequnlock_irq(&bb->lock);
8205	return rv;
8206}
8207
8208int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8209			 int is_new)
8210{
8211	if (is_new)
8212		s += rdev->new_data_offset;
8213	else
8214		s += rdev->data_offset;
8215	return md_clear_badblocks(&rdev->badblocks,
8216				  s, sectors);
8217}
8218EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8219
8220/*
8221 * Acknowledge all bad blocks in a list.
8222 * This only succeeds if ->changed is clear.  It is used by
8223 * in-kernel metadata updates
8224 */
8225void md_ack_all_badblocks(struct badblocks *bb)
8226{
8227	if (bb->page == NULL || bb->changed)
8228		/* no point even trying */
8229		return;
8230	write_seqlock_irq(&bb->lock);
8231
8232	if (bb->changed == 0 && bb->unacked_exist) {
8233		u64 *p = bb->page;
8234		int i;
8235		for (i = 0; i < bb->count ; i++) {
8236			if (!BB_ACK(p[i])) {
8237				sector_t start = BB_OFFSET(p[i]);
8238				int len = BB_LEN(p[i]);
8239				p[i] = BB_MAKE(start, len, 1);
8240			}
8241		}
8242		bb->unacked_exist = 0;
8243	}
8244	write_sequnlock_irq(&bb->lock);
8245}
8246EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8247
8248/* sysfs access to bad-blocks list.
8249 * We present two files.
8250 * 'bad-blocks' lists sector numbers and lengths of ranges that
8251 *    are recorded as bad.  The list is truncated to fit within
8252 *    the one-page limit of sysfs.
8253 *    Writing "sector length" to this file adds an acknowledged
8254 *    bad block list.
8255 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8256 *    been acknowledged.  Writing to this file adds bad blocks
8257 *    without acknowledging them.  This is largely for testing.
8258 */
8259
8260static ssize_t
8261badblocks_show(struct badblocks *bb, char *page, int unack)
8262{
8263	size_t len;
8264	int i;
8265	u64 *p = bb->page;
8266	unsigned seq;
8267
8268	if (bb->shift < 0)
8269		return 0;
8270
8271retry:
8272	seq = read_seqbegin(&bb->lock);
8273
8274	len = 0;
8275	i = 0;
8276
8277	while (len < PAGE_SIZE && i < bb->count) {
8278		sector_t s = BB_OFFSET(p[i]);
8279		unsigned int length = BB_LEN(p[i]);
8280		int ack = BB_ACK(p[i]);
8281		i++;
8282
8283		if (unack && ack)
8284			continue;
8285
8286		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8287				(unsigned long long)s << bb->shift,
8288				length << bb->shift);
8289	}
8290	if (unack && len == 0)
8291		bb->unacked_exist = 0;
8292
8293	if (read_seqretry(&bb->lock, seq))
8294		goto retry;
8295
8296	return len;
8297}
8298
8299#define DO_DEBUG 1
8300
8301static ssize_t
8302badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8303{
8304	unsigned long long sector;
8305	int length;
8306	char newline;
8307#ifdef DO_DEBUG
8308	/* Allow clearing via sysfs *only* for testing/debugging.
8309	 * Normally only a successful write may clear a badblock
8310	 */
8311	int clear = 0;
8312	if (page[0] == '-') {
8313		clear = 1;
8314		page++;
8315	}
8316#endif /* DO_DEBUG */
8317
8318	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8319	case 3:
8320		if (newline != '\n')
8321			return -EINVAL;
8322	case 2:
8323		if (length <= 0)
8324			return -EINVAL;
8325		break;
8326	default:
8327		return -EINVAL;
8328	}
8329
8330#ifdef DO_DEBUG
8331	if (clear) {
8332		md_clear_badblocks(bb, sector, length);
8333		return len;
8334	}
8335#endif /* DO_DEBUG */
8336	if (md_set_badblocks(bb, sector, length, !unack))
8337		return len;
8338	else
8339		return -ENOSPC;
8340}
8341
8342static int md_notify_reboot(struct notifier_block *this,
8343			    unsigned long code, void *x)
8344{
8345	struct list_head *tmp;
8346	struct mddev *mddev;
8347	int need_delay = 0;
8348
8349	for_each_mddev(mddev, tmp) {
8350		if (mddev_trylock(mddev)) {
8351			if (mddev->pers)
8352				__md_stop_writes(mddev);
8353			if (mddev->persistent)
8354				mddev->safemode = 2;
8355			mddev_unlock(mddev);
8356		}
8357		need_delay = 1;
8358	}
8359	/*
8360	 * certain more exotic SCSI devices are known to be
8361	 * volatile wrt too early system reboots. While the
8362	 * right place to handle this issue is the given
8363	 * driver, we do want to have a safe RAID driver ...
8364	 */
8365	if (need_delay)
8366		mdelay(1000*1);
8367
8368	return NOTIFY_DONE;
8369}
8370
8371static struct notifier_block md_notifier = {
8372	.notifier_call	= md_notify_reboot,
8373	.next		= NULL,
8374	.priority	= INT_MAX, /* before any real devices */
8375};
8376
8377static void md_geninit(void)
8378{
8379	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8380
8381	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8382}
8383
8384static int __init md_init(void)
8385{
8386	int ret = -ENOMEM;
8387
8388	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8389	if (!md_wq)
8390		goto err_wq;
8391
8392	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8393	if (!md_misc_wq)
8394		goto err_misc_wq;
8395
8396	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8397		goto err_md;
8398
8399	if ((ret = register_blkdev(0, "mdp")) < 0)
8400		goto err_mdp;
8401	mdp_major = ret;
8402
8403	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
8404			    md_probe, NULL, NULL);
8405	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8406			    md_probe, NULL, NULL);
8407
8408	register_reboot_notifier(&md_notifier);
8409	raid_table_header = register_sysctl_table(raid_root_table);
8410
8411	md_geninit();
8412	return 0;
8413
8414err_mdp:
8415	unregister_blkdev(MD_MAJOR, "md");
8416err_md:
8417	destroy_workqueue(md_misc_wq);
8418err_misc_wq:
8419	destroy_workqueue(md_wq);
8420err_wq:
8421	return ret;
8422}
8423
8424#ifndef MODULE
8425
8426/*
8427 * Searches all registered partitions for autorun RAID arrays
8428 * at boot time.
8429 */
8430
8431static LIST_HEAD(all_detected_devices);
8432struct detected_devices_node {
8433	struct list_head list;
8434	dev_t dev;
8435};
8436
8437void md_autodetect_dev(dev_t dev)
8438{
8439	struct detected_devices_node *node_detected_dev;
8440
8441	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8442	if (node_detected_dev) {
8443		node_detected_dev->dev = dev;
8444		list_add_tail(&node_detected_dev->list, &all_detected_devices);
8445	} else {
8446		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8447			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8448	}
8449}
8450
8451static void autostart_arrays(int part)
8452{
8453	struct md_rdev *rdev;
8454	struct detected_devices_node *node_detected_dev;
8455	dev_t dev;
8456	int i_scanned, i_passed;
8457
8458	i_scanned = 0;
8459	i_passed = 0;
8460
8461	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8462
8463	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8464		i_scanned++;
8465		node_detected_dev = list_entry(all_detected_devices.next,
8466					struct detected_devices_node, list);
8467		list_del(&node_detected_dev->list);
8468		dev = node_detected_dev->dev;
8469		kfree(node_detected_dev);
8470		rdev = md_import_device(dev,0, 90);
8471		if (IS_ERR(rdev))
8472			continue;
8473
8474		if (test_bit(Faulty, &rdev->flags))
8475			continue;
8476
8477		set_bit(AutoDetected, &rdev->flags);
8478		list_add(&rdev->same_set, &pending_raid_disks);
8479		i_passed++;
8480	}
8481
8482	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8483						i_scanned, i_passed);
8484
8485	autorun_devices(part);
8486}
8487
8488#endif /* !MODULE */
8489
8490static __exit void md_exit(void)
8491{
8492	struct mddev *mddev;
8493	struct list_head *tmp;
8494	int delay = 1;
8495
8496	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
8497	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8498
8499	unregister_blkdev(MD_MAJOR,"md");
8500	unregister_blkdev(mdp_major, "mdp");
8501	unregister_reboot_notifier(&md_notifier);
8502	unregister_sysctl_table(raid_table_header);
8503
8504	/* We cannot unload the modules while some process is
8505	 * waiting for us in select() or poll() - wake them up
8506	 */
8507	md_unloading = 1;
8508	while (waitqueue_active(&md_event_waiters)) {
8509		/* not safe to leave yet */
8510		wake_up(&md_event_waiters);
8511		msleep(delay);
8512		delay += delay;
8513	}
8514	remove_proc_entry("mdstat", NULL);
8515
8516	for_each_mddev(mddev, tmp) {
8517		export_array(mddev);
8518		mddev->hold_active = 0;
8519	}
8520	destroy_workqueue(md_misc_wq);
8521	destroy_workqueue(md_wq);
8522}
8523
8524subsys_initcall(md_init);
8525module_exit(md_exit)
8526
8527static int get_ro(char *buffer, struct kernel_param *kp)
8528{
8529	return sprintf(buffer, "%d", start_readonly);
8530}
8531static int set_ro(const char *val, struct kernel_param *kp)
8532{
8533	char *e;
8534	int num = simple_strtoul(val, &e, 10);
8535	if (*val && (*e == '\0' || *e == '\n')) {
8536		start_readonly = num;
8537		return 0;
8538	}
8539	return -EINVAL;
8540}
8541
8542module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
8543module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
8544module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
8545
8546MODULE_LICENSE("GPL");
8547MODULE_DESCRIPTION("MD RAID framework");
8548MODULE_ALIAS("md");
8549MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
8550