drbd_nl.c revision f2024e7ce29f4287395ce879364cd68c7ac226f2
1/*
2   drbd_nl.c
3
4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10   drbd is free software; you can redistribute it and/or modify
11   it under the terms of the GNU General Public License as published by
12   the Free Software Foundation; either version 2, or (at your option)
13   any later version.
14
15   drbd is distributed in the hope that it will be useful,
16   but WITHOUT ANY WARRANTY; without even the implied warranty of
17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18   GNU General Public License for more details.
19
20   You should have received a copy of the GNU General Public License
21   along with drbd; see the file COPYING.  If not, write to
22   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/in.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/connector.h>
33#include <linux/blkpg.h>
34#include <linux/cpumask.h>
35#include "drbd_int.h"
36#include "drbd_req.h"
37#include "drbd_wrappers.h"
38#include <asm/unaligned.h>
39#include <linux/drbd_tag_magic.h>
40#include <linux/drbd_limits.h>
41#include <linux/compiler.h>
42#include <linux/kthread.h>
43
/* Forward declarations of the tl_add_*() tag-list helpers; their definitions
 * are not part of this section of the file.  Each one takes a position in an
 * unsigned-short tag list plus a tag and the data to add, and returns an
 * unsigned short pointer. */
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);

/* see get_sb_bdev and bd_claim */
/* Passed as the "holder" when the meta data device is opened exclusively and
 * meta_dev_idx >= 0 (see the blkdev_get_by_path() call in drbd_nl_disk_conf()). */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
50
/* Generate the tag_list to struct functions */
/*
 * For every NL_PACKET(name, number, fields) found in linux/drbd_nl.h this
 * expands to a function  name_from_tags(mdev, tags, arg)  that walks a tag
 * list and fills in struct name *arg.
 *
 * Layout consumed by the loop: an unsigned short tag, an unsigned short
 * payload length (dlen), then dlen bytes of payload; the list ends at TT_END.
 * All reads go through get_unaligned().
 *
 * Unknown tags are skipped unless they carry T_MANDATORY, in which case the
 * function logs an error and returns 0.  On success it returns 1.
 *
 * NOTE(review): dlen is taken from the tag list as-is; nothing here checks it
 * against the size of the buffer holding the tag list -- confirm callers
 * validate the message length.
 */
#define NL_PACKET(name, number, fields) \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) \
{ \
	int tag; \
	int dlen; \
	\
	while ((tag = get_unaligned(tags++)) != TT_END) {	\
		dlen = get_unaligned(tags++);			\
		switch (tag_number(tag)) { \
		fields \
		default: \
			if (tag & T_MANDATORY) { \
				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
				return 0; \
			} \
		} \
		tags = (unsigned short *)((char *)tags + dlen); \
	} \
	return 1; \
}
/* Field decoders: each expands to one "case" of the switch above and copies
 * the payload at the current tags position into arg->member. */
#define NL_INTEGER(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
		arg->member = get_unaligned((int *)(tags));	\
		break;
#define NL_INT64(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
		arg->member = get_unaligned((u64 *)(tags));	\
		break;
/* A bit is stored as one byte; any non-zero byte is normalised to 1. */
#define NL_BIT(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
		arg->member = *(char *)(tags) ? 1 : 0; \
		break;
/* Strings longer than len are rejected (error logged, function returns 0);
 * otherwise the payload is copied and its length recorded in member_len. */
#define NL_STRING(pn, pr, member, len) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
		if (dlen > len) { \
			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
				#member, dlen, (unsigned int)len); \
			return 0; \
		} \
		 arg->member ## _len = dlen; \
		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
		 break;
#include "linux/drbd_nl.h"
98
/* Generate the struct to tag_list functions */
/*
 * For every NL_PACKET(name, number, fields) found in linux/drbd_nl.h this
 * expands to a function  name_to_tags(mdev, arg, tags)  that serialises
 * struct name *arg into the tag list starting at tags and returns the
 * position just behind the last field written.
 *
 * Each field is emitted as: unsigned short (pn | pr | type), unsigned short
 * payload length, payload.  No TT_END terminator is written by these
 * generated functions.
 *
 * NOTE(review): nothing here checks the remaining space in the destination
 * buffer -- presumably callers size it for the whole packet; confirm.
 */
#define NL_PACKET(name, number, fields) \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) \
{ \
	fields \
	return tags; \
}

/* Field encoders: write tag header, length and payload, then advance tags
 * by the payload size in bytes. */
#define NL_INTEGER(pn, pr, member) \
	put_unaligned(pn | pr | TT_INTEGER, tags++);	\
	put_unaligned(sizeof(int), tags++);		\
	put_unaligned(arg->member, (int *)tags);	\
	tags = (unsigned short *)((char *)tags+sizeof(int));
#define NL_INT64(pn, pr, member) \
	put_unaligned(pn | pr | TT_INT64, tags++);	\
	put_unaligned(sizeof(u64), tags++);		\
	put_unaligned(arg->member, (u64 *)tags);	\
	tags = (unsigned short *)((char *)tags+sizeof(u64));
/* A bit occupies a single payload byte. */
#define NL_BIT(pn, pr, member) \
	put_unaligned(pn | pr | TT_BIT, tags++);	\
	put_unaligned(sizeof(char), tags++);		\
	*(char *)tags = arg->member; \
	tags = (unsigned short *)((char *)tags+sizeof(char));
/* Strings are written with the length stored in member_len. */
#define NL_STRING(pn, pr, member, len) \
	put_unaligned(pn | pr | TT_STRING, tags++);	\
	put_unaligned(arg->member ## _len, tags++);	\
	memcpy(tags, arg->member, arg->member ## _len); \
	tags = (unsigned short *)((char *)tags + arg->member ## _len);
#include "linux/drbd_nl.h"
133
/* Prototypes for functions whose definitions are not part of this section.
 * drbd_bcast_ev_helper() is called by drbd_khelper() right before the
 * user mode helper is started. */
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
void drbd_nl_send_reply(struct cn_msg *, int);
136
137int drbd_khelper(struct drbd_conf *mdev, char *cmd)
138{
139	char *envp[] = { "HOME=/",
140			"TERM=linux",
141			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
142			NULL, /* Will be set to address family */
143			NULL, /* Will be set to address */
144			NULL };
145
146	char mb[12], af[20], ad[60], *afs;
147	char *argv[] = {usermode_helper, cmd, mb, NULL };
148	int ret;
149
150	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
151
152	if (get_net_conf(mdev)) {
153		switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
154		case AF_INET6:
155			afs = "ipv6";
156			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
157				 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
158			break;
159		case AF_INET:
160			afs = "ipv4";
161			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
162				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
163			break;
164		default:
165			afs = "ssocks";
166			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
167				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
168		}
169		snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
170		envp[3]=af;
171		envp[4]=ad;
172		put_net_conf(mdev);
173	}
174
175	/* The helper may take some time.
176	 * write out any unsynced meta data changes now */
177	drbd_md_sync(mdev);
178
179	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
180
181	drbd_bcast_ev_helper(mdev, cmd);
182	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
183	if (ret)
184		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
185				usermode_helper, cmd, mb,
186				(ret >> 8) & 0xff, ret);
187	else
188		dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
189				usermode_helper, cmd, mb,
190				(ret >> 8) & 0xff, ret);
191
192	if (ret < 0) /* Ignore any ERRNOs we got. */
193		ret = 0;
194
195	return ret;
196}
197
198enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
199{
200	char *ex_to_string;
201	int r;
202	enum drbd_disk_state nps;
203	enum drbd_fencing_p fp;
204
205	D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
206
207	if (get_ldev_if_state(mdev, D_CONSISTENT)) {
208		fp = mdev->ldev->dc.fencing;
209		put_ldev(mdev);
210	} else {
211		dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
212		nps = mdev->state.pdsk;
213		goto out;
214	}
215
216	r = drbd_khelper(mdev, "fence-peer");
217
218	switch ((r>>8) & 0xff) {
219	case 3: /* peer is inconsistent */
220		ex_to_string = "peer is inconsistent or worse";
221		nps = D_INCONSISTENT;
222		break;
223	case 4: /* peer got outdated, or was already outdated */
224		ex_to_string = "peer was fenced";
225		nps = D_OUTDATED;
226		break;
227	case 5: /* peer was down */
228		if (mdev->state.disk == D_UP_TO_DATE) {
229			/* we will(have) create(d) a new UUID anyways... */
230			ex_to_string = "peer is unreachable, assumed to be dead";
231			nps = D_OUTDATED;
232		} else {
233			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
234			nps = mdev->state.pdsk;
235		}
236		break;
237	case 6: /* Peer is primary, voluntarily outdate myself.
238		 * This is useful when an unconnected R_SECONDARY is asked to
239		 * become R_PRIMARY, but finds the other peer being active. */
240		ex_to_string = "peer is active";
241		dev_warn(DEV, "Peer is primary, outdating myself.\n");
242		nps = D_UNKNOWN;
243		_drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
244		break;
245	case 7:
246		if (fp != FP_STONITH)
247			dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
248		ex_to_string = "peer was stonithed";
249		nps = D_OUTDATED;
250		break;
251	default:
252		/* The script is broken ... */
253		nps = D_UNKNOWN;
254		dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
255		return nps;
256	}
257
258	dev_info(DEV, "fence-peer helper returned %d (%s)\n",
259			(r>>8) & 0xff, ex_to_string);
260
261out:
262	if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
263		/* The handler was not successful... unfreeze here, the
264		   state engine can not unfreeze... */
265		_drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
266	}
267
268	return nps;
269}
270
271static int _try_outdate_peer_async(void *data)
272{
273	struct drbd_conf *mdev = (struct drbd_conf *)data;
274	enum drbd_disk_state nps;
275
276	nps = drbd_try_outdate_peer(mdev);
277	drbd_request_state(mdev, NS(pdsk, nps));
278
279	return 0;
280}
281
282void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
283{
284	struct task_struct *opa;
285
286	opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
287	if (IS_ERR(opa))
288		dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
289}
290
291enum drbd_state_rv
292drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
293{
294	const int max_tries = 4;
295	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
296	int try = 0;
297	int forced = 0;
298	union drbd_state mask, val;
299	enum drbd_disk_state nps;
300
301	if (new_role == R_PRIMARY)
302		request_ping(mdev); /* Detect a dead peer ASAP */
303
304	mutex_lock(&mdev->state_mutex);
305
306	mask.i = 0; mask.role = R_MASK;
307	val.i  = 0; val.role  = new_role;
308
309	while (try++ < max_tries) {
310		rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
311
312		/* in case we first succeeded to outdate,
313		 * but now suddenly could establish a connection */
314		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
315			val.pdsk = 0;
316			mask.pdsk = 0;
317			continue;
318		}
319
320		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
321		    (mdev->state.disk < D_UP_TO_DATE &&
322		     mdev->state.disk >= D_INCONSISTENT)) {
323			mask.disk = D_MASK;
324			val.disk  = D_UP_TO_DATE;
325			forced = 1;
326			continue;
327		}
328
329		if (rv == SS_NO_UP_TO_DATE_DISK &&
330		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
331			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
332			nps = drbd_try_outdate_peer(mdev);
333
334			if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
335				val.disk = D_UP_TO_DATE;
336				mask.disk = D_MASK;
337			}
338
339			val.pdsk = nps;
340			mask.pdsk = D_MASK;
341
342			continue;
343		}
344
345		if (rv == SS_NOTHING_TO_DO)
346			goto fail;
347		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
348			nps = drbd_try_outdate_peer(mdev);
349
350			if (force && nps > D_OUTDATED) {
351				dev_warn(DEV, "Forced into split brain situation!\n");
352				nps = D_OUTDATED;
353			}
354
355			mask.pdsk = D_MASK;
356			val.pdsk  = nps;
357
358			continue;
359		}
360		if (rv == SS_TWO_PRIMARIES) {
361			/* Maybe the peer is detected as dead very soon...
362			   retry at most once more in this case. */
363			__set_current_state(TASK_INTERRUPTIBLE);
364			schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
365			if (try < max_tries)
366				try = max_tries - 1;
367			continue;
368		}
369		if (rv < SS_SUCCESS) {
370			rv = _drbd_request_state(mdev, mask, val,
371						CS_VERBOSE + CS_WAIT_COMPLETE);
372			if (rv < SS_SUCCESS)
373				goto fail;
374		}
375		break;
376	}
377
378	if (rv < SS_SUCCESS)
379		goto fail;
380
381	if (forced)
382		dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
383
384	/* Wait until nothing is on the fly :) */
385	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
386
387	if (new_role == R_SECONDARY) {
388		set_disk_ro(mdev->vdisk, true);
389		if (get_ldev(mdev)) {
390			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
391			put_ldev(mdev);
392		}
393	} else {
394		if (get_net_conf(mdev)) {
395			mdev->net_conf->want_lose = 0;
396			put_net_conf(mdev);
397		}
398		set_disk_ro(mdev->vdisk, false);
399		if (get_ldev(mdev)) {
400			if (((mdev->state.conn < C_CONNECTED ||
401			       mdev->state.pdsk <= D_FAILED)
402			      && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
403				drbd_uuid_new_current(mdev);
404
405			mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
406			put_ldev(mdev);
407		}
408	}
409
410	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
411		drbd_al_to_on_disk_bm(mdev);
412		put_ldev(mdev);
413	}
414
415	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
416		/* if this was forced, we should consider sync */
417		if (forced)
418			drbd_send_uuids(mdev);
419		drbd_send_state(mdev);
420	}
421
422	drbd_md_sync(mdev);
423
424	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
425 fail:
426	mutex_unlock(&mdev->state_mutex);
427	return rv;
428}
429
430static struct drbd_conf *ensure_mdev(int minor, int create)
431{
432	struct drbd_conf *mdev;
433
434	if (minor >= minor_count)
435		return NULL;
436
437	mdev = minor_to_mdev(minor);
438
439	if (!mdev && create) {
440		struct gendisk *disk = NULL;
441		mdev = drbd_new_device(minor);
442
443		spin_lock_irq(&drbd_pp_lock);
444		if (minor_table[minor] == NULL) {
445			minor_table[minor] = mdev;
446			disk = mdev->vdisk;
447			mdev = NULL;
448		} /* else: we lost the race */
449		spin_unlock_irq(&drbd_pp_lock);
450
451		if (disk) /* we won the race above */
452			/* in case we ever add a drbd_delete_device(),
453			 * don't forget the del_gendisk! */
454			add_disk(disk);
455		else /* we lost the race above */
456			drbd_free_mdev(mdev);
457
458		mdev = minor_to_mdev(minor);
459	}
460
461	return mdev;
462}
463
464static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
465			   struct drbd_nl_cfg_reply *reply)
466{
467	struct primary primary_args;
468
469	memset(&primary_args, 0, sizeof(struct primary));
470	if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
471		reply->ret_code = ERR_MANDATORY_TAG;
472		return 0;
473	}
474
475	reply->ret_code =
476		drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
477
478	return 0;
479}
480
481static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
482			     struct drbd_nl_cfg_reply *reply)
483{
484	reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
485
486	return 0;
487}
488
489/* initializes the md.*_offset members, so we are able to find
490 * the on disk meta data */
491static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
492				       struct drbd_backing_dev *bdev)
493{
494	sector_t md_size_sect = 0;
495	switch (bdev->dc.meta_dev_idx) {
496	default:
497		/* v07 style fixed size indexed meta data */
498		bdev->md.md_size_sect = MD_RESERVED_SECT;
499		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
500		bdev->md.al_offset = MD_AL_OFFSET;
501		bdev->md.bm_offset = MD_BM_OFFSET;
502		break;
503	case DRBD_MD_INDEX_FLEX_EXT:
504		/* just occupy the full device; unit: sectors */
505		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
506		bdev->md.md_offset = 0;
507		bdev->md.al_offset = MD_AL_OFFSET;
508		bdev->md.bm_offset = MD_BM_OFFSET;
509		break;
510	case DRBD_MD_INDEX_INTERNAL:
511	case DRBD_MD_INDEX_FLEX_INT:
512		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
513		/* al size is still fixed */
514		bdev->md.al_offset = -MD_AL_MAX_SIZE;
515		/* we need (slightly less than) ~ this much bitmap sectors: */
516		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
517		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
518		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
519		md_size_sect = ALIGN(md_size_sect, 8);
520
521		/* plus the "drbd meta data super block",
522		 * and the activity log; */
523		md_size_sect += MD_BM_OFFSET;
524
525		bdev->md.md_size_sect = md_size_sect;
526		/* bitmap offset is adjusted by 'super' block size */
527		bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
528		break;
529	}
530}
531
/**
 * ppsize() - Pretty-print a size given in KB
 * @buf:	Destination buffer; needs 9 bytes at max including the
 *		trailing NUL (-1ULL ==> "16384 EB").
 * @size:	Size in units of KB.
 *
 * Scales @size down by factors of 1024 (with rounding) until it is below
 * 10000 or the largest known unit is reached, then prints "<n> <unit>B".
 *
 * Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	static const char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	unsigned int base = 0;

	/* Stop at the last unit: a 64 bit value can still be >= 10000
	 * after five shifts, and one more step would index past units[]. */
	while (size >= 10000 && base < sizeof(units) - 1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%u %cB", (unsigned int)size, units[base]);

	return buf;
}
546
547/* there is still a theoretical deadlock when called from receiver
548 * on an D_INCONSISTENT R_PRIMARY:
549 *  remote READ does inc_ap_bio, receiver would need to receive answer
550 *  packet from remote to dec_ap_bio again.
551 *  receiver receive_sizes(), comes here,
552 *  waits for ap_bio_cnt == 0. -> deadlock.
553 * but this cannot happen, actually, because:
554 *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
555 *  (not connected, or bad/no disk on peer):
556 *  see drbd_fail_request_early, ap_bio_cnt is zero.
557 *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
558 *  peer may not initiate a resize.
559 */
560void drbd_suspend_io(struct drbd_conf *mdev)
561{
562	set_bit(SUSPEND_IO, &mdev->flags);
563	if (is_susp(mdev->state))
564		return;
565	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
566}
567
568void drbd_resume_io(struct drbd_conf *mdev)
569{
570	clear_bit(SUSPEND_IO, &mdev->flags);
571	wake_up(&mdev->misc_wait);
572}
573
574/**
575 * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
576 * @mdev:	DRBD device.
577 *
578 * Returns 0 on success, negative return values indicate errors.
579 * You should call drbd_md_sync() after calling this function.
580 */
581enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
582{
583	sector_t prev_first_sect, prev_size; /* previous meta location */
584	sector_t la_size;
585	sector_t size;
586	char ppb[10];
587
588	int md_moved, la_size_changed;
589	enum determine_dev_size rv = unchanged;
590
591	/* race:
592	 * application request passes inc_ap_bio,
593	 * but then cannot get an AL-reference.
594	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
595	 *
596	 * to avoid that:
597	 * Suspend IO right here.
598	 * still lock the act_log to not trigger ASSERTs there.
599	 */
600	drbd_suspend_io(mdev);
601
602	/* no wait necessary anymore, actually we could assert that */
603	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
604
605	prev_first_sect = drbd_md_first_sector(mdev->ldev);
606	prev_size = mdev->ldev->md.md_size_sect;
607	la_size = mdev->ldev->md.la_size_sect;
608
609	/* TODO: should only be some assert here, not (re)init... */
610	drbd_md_set_sector_offsets(mdev, mdev->ldev);
611
612	size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
613
614	if (drbd_get_capacity(mdev->this_bdev) != size ||
615	    drbd_bm_capacity(mdev) != size) {
616		int err;
617		err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
618		if (unlikely(err)) {
619			/* currently there is only one error: ENOMEM! */
620			size = drbd_bm_capacity(mdev)>>1;
621			if (size == 0) {
622				dev_err(DEV, "OUT OF MEMORY! "
623				    "Could not allocate bitmap!\n");
624			} else {
625				dev_err(DEV, "BM resizing failed. "
626				    "Leaving size unchanged at size = %lu KB\n",
627				    (unsigned long)size);
628			}
629			rv = dev_size_error;
630		}
631		/* racy, see comments above. */
632		drbd_set_my_capacity(mdev, size);
633		mdev->ldev->md.la_size_sect = size;
634		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
635		     (unsigned long long)size>>1);
636	}
637	if (rv == dev_size_error)
638		goto out;
639
640	la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
641
642	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
643		|| prev_size	   != mdev->ldev->md.md_size_sect;
644
645	if (la_size_changed || md_moved) {
646		drbd_al_shrink(mdev); /* All extents inactive. */
647		dev_info(DEV, "Writing the whole bitmap, %s\n",
648			 la_size_changed && md_moved ? "size changed and md moved" :
649			 la_size_changed ? "size changed" : "md moved");
650		rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
651		drbd_md_mark_dirty(mdev);
652	}
653
654	if (size > la_size)
655		rv = grew;
656	if (size < la_size)
657		rv = shrunk;
658out:
659	lc_unlock(mdev->act_log);
660	wake_up(&mdev->al_wait);
661	drbd_resume_io(mdev);
662
663	return rv;
664}
665
666sector_t
667drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
668{
669	sector_t p_size = mdev->p_size;   /* partner's disk size. */
670	sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
671	sector_t m_size; /* my size */
672	sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
673	sector_t size = 0;
674
675	m_size = drbd_get_max_capacity(bdev);
676
677	if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
678		dev_warn(DEV, "Resize while not connected was forced by the user!\n");
679		p_size = m_size;
680	}
681
682	if (p_size && m_size) {
683		size = min_t(sector_t, p_size, m_size);
684	} else {
685		if (la_size) {
686			size = la_size;
687			if (m_size && m_size < size)
688				size = m_size;
689			if (p_size && p_size < size)
690				size = p_size;
691		} else {
692			if (m_size)
693				size = m_size;
694			if (p_size)
695				size = p_size;
696		}
697	}
698
699	if (size == 0)
700		dev_err(DEV, "Both nodes diskless!\n");
701
702	if (u_size) {
703		if (u_size > size)
704			dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
705			    (unsigned long)u_size>>1, (unsigned long)size>>1);
706		else
707			size = u_size;
708	}
709
710	return size;
711}
712
713/**
714 * drbd_check_al_size() - Ensures that the AL is of the right size
715 * @mdev:	DRBD device.
716 *
717 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
718 * failed, and 0 on success. You should call drbd_md_sync() after you called
719 * this function.
720 */
721static int drbd_check_al_size(struct drbd_conf *mdev)
722{
723	struct lru_cache *n, *t;
724	struct lc_element *e;
725	unsigned int in_use;
726	int i;
727
728	ERR_IF(mdev->sync_conf.al_extents < 7)
729		mdev->sync_conf.al_extents = 127;
730
731	if (mdev->act_log &&
732	    mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
733		return 0;
734
735	in_use = 0;
736	t = mdev->act_log;
737	n = lc_create("act_log", drbd_al_ext_cache,
738		mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
739
740	if (n == NULL) {
741		dev_err(DEV, "Cannot allocate act_log lru!\n");
742		return -ENOMEM;
743	}
744	spin_lock_irq(&mdev->al_lock);
745	if (t) {
746		for (i = 0; i < t->nr_elements; i++) {
747			e = lc_element_by_index(t, i);
748			if (e->refcnt)
749				dev_err(DEV, "refcnt(%d)==%d\n",
750				    e->lc_number, e->refcnt);
751			in_use += e->refcnt;
752		}
753	}
754	if (!in_use)
755		mdev->act_log = n;
756	spin_unlock_irq(&mdev->al_lock);
757	if (in_use) {
758		dev_err(DEV, "Activity log still in use!\n");
759		lc_destroy(n);
760		return -EBUSY;
761	} else {
762		if (t)
763			lc_destroy(t);
764	}
765	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
766	return 0;
767}
768
769void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local)
770{
771	struct request_queue * const q = mdev->rq_queue;
772	struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
773	int max_segments = mdev->ldev->dc.max_bio_bvecs;
774	int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
775
776	blk_queue_logical_block_size(q, 512);
777	blk_queue_max_hw_sectors(q, max_hw_sectors);
778	/* This is the workaround for "bio would need to, but cannot, be split" */
779	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
780	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
781	blk_queue_stack_limits(q, b);
782
783	dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9);
784
785	if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
786		dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
787		     q->backing_dev_info.ra_pages,
788		     b->backing_dev_info.ra_pages);
789		q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
790	}
791}
792
793/* serialize deconfig (worker exiting, doing cleanup)
794 * and reconfig (drbdsetup disk, drbdsetup net)
795 *
796 * Wait for a potentially exiting worker, then restart it,
797 * or start a new one.  Flush any pending work, there may still be an
798 * after_state_change queued.
799 */
800static void drbd_reconfig_start(struct drbd_conf *mdev)
801{
802	wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
803	wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
804	drbd_thread_start(&mdev->worker);
805	drbd_flush_workqueue(mdev);
806}
807
808/* if still unconfigured, stops worker again.
809 * if configured now, clears CONFIG_PENDING.
810 * wakes potential waiters */
811static void drbd_reconfig_done(struct drbd_conf *mdev)
812{
813	spin_lock_irq(&mdev->req_lock);
814	if (mdev->state.disk == D_DISKLESS &&
815	    mdev->state.conn == C_STANDALONE &&
816	    mdev->state.role == R_SECONDARY) {
817		set_bit(DEVICE_DYING, &mdev->flags);
818		drbd_thread_stop_nowait(&mdev->worker);
819	} else
820		clear_bit(CONFIG_PENDING, &mdev->flags);
821	spin_unlock_irq(&mdev->req_lock);
822	wake_up(&mdev->state_wait);
823}
824
825/* Make sure IO is suspended before calling this function(). */
826static void drbd_suspend_al(struct drbd_conf *mdev)
827{
828	int s = 0;
829
830	if (lc_try_lock(mdev->act_log)) {
831		drbd_al_shrink(mdev);
832		lc_unlock(mdev->act_log);
833	} else {
834		dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
835		return;
836	}
837
838	spin_lock_irq(&mdev->req_lock);
839	if (mdev->state.conn < C_CONNECTED)
840		s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
841
842	spin_unlock_irq(&mdev->req_lock);
843
844	if (s)
845		dev_info(DEV, "Suspended AL updates\n");
846}
847
848/* does always return 0;
849 * interesting return code is in reply->ret_code */
850static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
851			     struct drbd_nl_cfg_reply *reply)
852{
853	enum drbd_ret_code retcode;
854	enum determine_dev_size dd;
855	sector_t max_possible_sectors;
856	sector_t min_md_device_sectors;
857	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
858	struct block_device *bdev;
859	struct lru_cache *resync_lru = NULL;
860	union drbd_state ns, os;
861	unsigned int max_bio_size;
862	enum drbd_state_rv rv;
863	int cp_discovered = 0;
864	int logical_block_size;
865
866	drbd_reconfig_start(mdev);
867
868	/* if you want to reconfigure, please tear down first */
869	if (mdev->state.disk > D_DISKLESS) {
870		retcode = ERR_DISK_CONFIGURED;
871		goto fail;
872	}
873	/* It may just now have detached because of IO error.  Make sure
874	 * drbd_ldev_destroy is done already, we may end up here very fast,
875	 * e.g. if someone calls attach from the on-io-error handler,
876	 * to realize a "hot spare" feature (not that I'd recommend that) */
877	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
878
879	/* allocation not in the IO path, cqueue thread context */
880	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
881	if (!nbc) {
882		retcode = ERR_NOMEM;
883		goto fail;
884	}
885
886	nbc->dc.disk_size     = DRBD_DISK_SIZE_SECT_DEF;
887	nbc->dc.on_io_error   = DRBD_ON_IO_ERROR_DEF;
888	nbc->dc.fencing       = DRBD_FENCING_DEF;
889	nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
890
891	if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
892		retcode = ERR_MANDATORY_TAG;
893		goto fail;
894	}
895
896	if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
897		retcode = ERR_MD_IDX_INVALID;
898		goto fail;
899	}
900
901	if (get_net_conf(mdev)) {
902		int prot = mdev->net_conf->wire_protocol;
903		put_net_conf(mdev);
904		if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
905			retcode = ERR_STONITH_AND_PROT_A;
906			goto fail;
907		}
908	}
909
910	bdev = blkdev_get_by_path(nbc->dc.backing_dev,
911				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
912	if (IS_ERR(bdev)) {
913		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
914			PTR_ERR(bdev));
915		retcode = ERR_OPEN_DISK;
916		goto fail;
917	}
918	nbc->backing_bdev = bdev;
919
920	/*
921	 * meta_dev_idx >= 0: external fixed size, possibly multiple
922	 * drbd sharing one meta device.  TODO in that case, paranoia
923	 * check that [md_bdev, meta_dev_idx] is not yet used by some
924	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
925	 * should check it for you already; but if you don't, or
926	 * someone fooled it, we need to double check here)
927	 */
928	bdev = blkdev_get_by_path(nbc->dc.meta_dev,
929				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
930				  (nbc->dc.meta_dev_idx < 0) ?
931				  (void *)mdev : (void *)drbd_m_holder);
932	if (IS_ERR(bdev)) {
933		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
934			PTR_ERR(bdev));
935		retcode = ERR_OPEN_MD_DISK;
936		goto fail;
937	}
938	nbc->md_bdev = bdev;
939
940	if ((nbc->backing_bdev == nbc->md_bdev) !=
941	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
942	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
943		retcode = ERR_MD_IDX_INVALID;
944		goto fail;
945	}
946
947	resync_lru = lc_create("resync", drbd_bm_ext_cache,
948			61, sizeof(struct bm_extent),
949			offsetof(struct bm_extent, lce));
950	if (!resync_lru) {
951		retcode = ERR_NOMEM;
952		goto fail;
953	}
954
955	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
956	drbd_md_set_sector_offsets(mdev, nbc);
957
958	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
959		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
960			(unsigned long long) drbd_get_max_capacity(nbc),
961			(unsigned long long) nbc->dc.disk_size);
962		retcode = ERR_DISK_TO_SMALL;
963		goto fail;
964	}
965
966	if (nbc->dc.meta_dev_idx < 0) {
967		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
968		/* at least one MB, otherwise it does not make sense */
969		min_md_device_sectors = (2<<10);
970	} else {
971		max_possible_sectors = DRBD_MAX_SECTORS;
972		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
973	}
974
975	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
976		retcode = ERR_MD_DISK_TO_SMALL;
977		dev_warn(DEV, "refusing attach: md-device too small, "
978		     "at least %llu sectors needed for this meta-disk type\n",
979		     (unsigned long long) min_md_device_sectors);
980		goto fail;
981	}
982
983	/* Make sure the new disk is big enough
984	 * (we may currently be R_PRIMARY with no local disk...) */
985	if (drbd_get_max_capacity(nbc) <
986	    drbd_get_capacity(mdev->this_bdev)) {
987		retcode = ERR_DISK_TO_SMALL;
988		goto fail;
989	}
990
991	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
992
993	if (nbc->known_size > max_possible_sectors) {
994		dev_warn(DEV, "==> truncating very big lower level device "
995			"to currently maximum possible %llu sectors <==\n",
996			(unsigned long long) max_possible_sectors);
997		if (nbc->dc.meta_dev_idx >= 0)
998			dev_warn(DEV, "==>> using internal or flexible "
999				      "meta data may help <<==\n");
1000	}
1001
1002	drbd_suspend_io(mdev);
1003	/* also wait for the last barrier ack. */
1004	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
1005	/* and for any other previously queued work */
1006	drbd_flush_workqueue(mdev);
1007
1008	rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
1009	retcode = rv;  /* FIXME: Type mismatch. */
1010	drbd_resume_io(mdev);
1011	if (rv < SS_SUCCESS)
1012		goto fail;
1013
1014	if (!get_ldev_if_state(mdev, D_ATTACHING))
1015		goto force_diskless;
1016
1017	drbd_md_set_sector_offsets(mdev, nbc);
1018
1019	/* allocate a second IO page if logical_block_size != 512 */
1020	logical_block_size = bdev_logical_block_size(nbc->md_bdev);
1021	if (logical_block_size == 0)
1022		logical_block_size = MD_SECTOR_SIZE;
1023
1024	if (logical_block_size != MD_SECTOR_SIZE) {
1025		if (!mdev->md_io_tmpp) {
1026			struct page *page = alloc_page(GFP_NOIO);
1027			if (!page)
1028				goto force_diskless_dec;
1029
1030			dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
1031			     logical_block_size, MD_SECTOR_SIZE);
1032			dev_warn(DEV, "Workaround engaged (has performance impact).\n");
1033
1034			mdev->md_io_tmpp = page;
1035		}
1036	}
1037
1038	if (!mdev->bitmap) {
1039		if (drbd_bm_init(mdev)) {
1040			retcode = ERR_NOMEM;
1041			goto force_diskless_dec;
1042		}
1043	}
1044
1045	retcode = drbd_md_read(mdev, nbc);
1046	if (retcode != NO_ERROR)
1047		goto force_diskless_dec;
1048
1049	if (mdev->state.conn < C_CONNECTED &&
1050	    mdev->state.role == R_PRIMARY &&
1051	    (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1052		dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
1053		    (unsigned long long)mdev->ed_uuid);
1054		retcode = ERR_DATA_NOT_CURRENT;
1055		goto force_diskless_dec;
1056	}
1057
1058	/* Since we are diskless, fix the activity log first... */
1059	if (drbd_check_al_size(mdev)) {
1060		retcode = ERR_NOMEM;
1061		goto force_diskless_dec;
1062	}
1063
1064	/* Prevent shrinking of consistent devices ! */
1065	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1066	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
1067		dev_warn(DEV, "refusing to truncate a consistent device\n");
1068		retcode = ERR_DISK_TO_SMALL;
1069		goto force_diskless_dec;
1070	}
1071
1072	if (!drbd_al_read_log(mdev, nbc)) {
1073		retcode = ERR_IO_MD_DISK;
1074		goto force_diskless_dec;
1075	}
1076
1077	/* Reset the "barriers don't work" bits here, then force meta data to
1078	 * be written, to ensure we determine if barriers are supported. */
1079	if (nbc->dc.no_md_flush)
1080		set_bit(MD_NO_FUA, &mdev->flags);
1081	else
1082		clear_bit(MD_NO_FUA, &mdev->flags);
1083
1084	/* Point of no return reached.
1085	 * Devices and memory are no longer released by error cleanup below.
1086	 * now mdev takes over responsibility, and the state engine should
1087	 * clean it up somewhere.  */
1088	D_ASSERT(mdev->ldev == NULL);
1089	mdev->ldev = nbc;
1090	mdev->resync = resync_lru;
1091	nbc = NULL;
1092	resync_lru = NULL;
1093
1094	mdev->write_ordering = WO_bdev_flush;
1095	drbd_bump_write_ordering(mdev, WO_bdev_flush);
1096
1097	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1098		set_bit(CRASHED_PRIMARY, &mdev->flags);
1099	else
1100		clear_bit(CRASHED_PRIMARY, &mdev->flags);
1101
1102	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1103	    !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
1104		set_bit(CRASHED_PRIMARY, &mdev->flags);
1105		cp_discovered = 1;
1106	}
1107
1108	mdev->send_cnt = 0;
1109	mdev->recv_cnt = 0;
1110	mdev->read_cnt = 0;
1111	mdev->writ_cnt = 0;
1112
1113	max_bio_size = DRBD_MAX_BIO_SIZE;
1114	if (mdev->state.conn == C_CONNECTED) {
1115		/* We are Primary, Connected, and now attach a new local
1116		 * backing store. We must not increase the user visible maximum
1117		 * bio size on this device to something the peer may not be
1118		 * able to handle. */
1119		if (mdev->agreed_pro_version < 94)
1120			max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
1121		else if (mdev->agreed_pro_version == 94)
1122			max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
1123		/* else: drbd 8.3.9 and later, stay with default */
1124	}
1125
1126	drbd_setup_queue_param(mdev, max_bio_size);
1127
1128	/* If I am currently not R_PRIMARY,
1129	 * but meta data primary indicator is set,
1130	 * I just now recover from a hard crash,
1131	 * and have been R_PRIMARY before that crash.
1132	 *
1133	 * Now, if I had no connection before that crash
1134	 * (have been degraded R_PRIMARY), chances are that
1135	 * I won't find my peer now either.
1136	 *
1137	 * In that case, and _only_ in that case,
1138	 * we use the degr-wfc-timeout instead of the default,
1139	 * so we can automatically recover from a crash of a
1140	 * degraded but active "cluster" after a certain timeout.
1141	 */
1142	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1143	if (mdev->state.role != R_PRIMARY &&
1144	     drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1145	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1146		set_bit(USE_DEGR_WFC_T, &mdev->flags);
1147
1148	dd = drbd_determin_dev_size(mdev, 0);
1149	if (dd == dev_size_error) {
1150		retcode = ERR_NOMEM_BITMAP;
1151		goto force_diskless_dec;
1152	} else if (dd == grew)
1153		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1154
1155	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1156		dev_info(DEV, "Assuming that all blocks are out of sync "
1157		     "(aka FullSync)\n");
1158		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
1159			retcode = ERR_IO_MD_DISK;
1160			goto force_diskless_dec;
1161		}
1162	} else {
1163		if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
1164			retcode = ERR_IO_MD_DISK;
1165			goto force_diskless_dec;
1166		}
1167	}
1168
1169	if (cp_discovered) {
1170		drbd_al_apply_to_bm(mdev);
1171		drbd_al_to_on_disk_bm(mdev);
1172	}
1173
1174	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
1175		drbd_suspend_al(mdev); /* IO is still suspended here... */
1176
1177	spin_lock_irq(&mdev->req_lock);
1178	os = mdev->state;
1179	ns.i = os.i;
1180	/* If MDF_CONSISTENT is not set go into inconsistent state,
1181	   otherwise investigate MDF_WasUpToDate...
1182	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1183	   otherwise into D_CONSISTENT state.
1184	*/
1185	if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1186		if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1187			ns.disk = D_CONSISTENT;
1188		else
1189			ns.disk = D_OUTDATED;
1190	} else {
1191		ns.disk = D_INCONSISTENT;
1192	}
1193
1194	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1195		ns.pdsk = D_OUTDATED;
1196
1197	if ( ns.disk == D_CONSISTENT &&
1198	    (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
1199		ns.disk = D_UP_TO_DATE;
1200
1201	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1202	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1203	   this point, because drbd_request_state() modifies these
1204	   flags. */
1205
1206	/* In case we are C_CONNECTED postpone any decision on the new disk
1207	   state after the negotiation phase. */
1208	if (mdev->state.conn == C_CONNECTED) {
1209		mdev->new_state_tmp.i = ns.i;
1210		ns.i = os.i;
1211		ns.disk = D_NEGOTIATING;
1212
1213		/* We expect to receive up-to-date UUIDs soon.
1214		   To avoid a race in receive_state, free p_uuid while
1215		   holding req_lock. I.e. atomic with the state change */
1216		kfree(mdev->p_uuid);
1217		mdev->p_uuid = NULL;
1218	}
1219
1220	rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1221	ns = mdev->state;
1222	spin_unlock_irq(&mdev->req_lock);
1223
1224	if (rv < SS_SUCCESS)
1225		goto force_diskless_dec;
1226
1227	if (mdev->state.role == R_PRIMARY)
1228		mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
1229	else
1230		mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1231
1232	drbd_md_mark_dirty(mdev);
1233	drbd_md_sync(mdev);
1234
1235	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1236	put_ldev(mdev);
1237	reply->ret_code = retcode;
1238	drbd_reconfig_done(mdev);
1239	return 0;
1240
1241 force_diskless_dec:
1242	put_ldev(mdev);
1243 force_diskless:
1244	drbd_force_state(mdev, NS(disk, D_FAILED));
1245	drbd_md_sync(mdev);
1246 fail:
1247	if (nbc) {
1248		if (nbc->backing_bdev)
1249			blkdev_put(nbc->backing_bdev,
1250				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1251		if (nbc->md_bdev)
1252			blkdev_put(nbc->md_bdev,
1253				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1254		kfree(nbc);
1255	}
1256	lc_destroy(resync_lru);
1257
1258	reply->ret_code = retcode;
1259	drbd_reconfig_done(mdev);
1260	return 0;
1261}
1262
1263/* Detaching the disk is a process in multiple stages.  First we need to lock
1264 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1265 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1266 * internal references as well.
1267 * Only then we have finally detached. */
1268static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1269			  struct drbd_nl_cfg_reply *reply)
1270{
1271	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1272	reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1273	if (mdev->state.disk == D_DISKLESS)
1274		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1275	drbd_resume_io(mdev);
1276	return 0;
1277}
1278
/* Netlink handler: set up the network configuration of a device.
 *
 * Only acts while the connection state is at most C_STANDALONE.
 * A new struct net_conf is allocated, filled with defaults and then with the
 * values from the request's tag list.  After a number of consistency checks
 * all further resources (crypto transforms, hash tables, digest buffers) are
 * allocated up front, and only then everything is installed in mdev under
 * req_lock and the state change to C_UNCONNECTED is made.
 *
 * The result is reported in reply->ret_code; the function itself returns 0.
 * On any failure everything allocated here is released again at "fail". */
static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
			    struct drbd_nl_cfg_reply *reply)
{
	int i, ns;
	enum drbd_ret_code retcode;
	struct net_conf *new_conf = NULL;
	struct crypto_hash *tfm = NULL;
	struct crypto_hash *integrity_w_tfm = NULL;
	struct crypto_hash *integrity_r_tfm = NULL;
	struct hlist_head *new_tl_hash = NULL;
	struct hlist_head *new_ee_hash = NULL;
	struct drbd_conf *odev;
	char hmac_name[CRYPTO_MAX_ALG_NAME];
	void *int_dig_out = NULL;
	void *int_dig_in = NULL;
	void *int_dig_vv = NULL;
	struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;

	drbd_reconfig_start(mdev);

	/* refuse if there already is some network configuration in use */
	if (mdev->state.conn > C_STANDALONE) {
		retcode = ERR_NET_CONFIGURED;
		goto fail;
	}

	/* allocation not in the IO path, cqueue thread context */
	new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
	if (!new_conf) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	/* defaults; values present in the tag list override them below */
	new_conf->timeout	   = DRBD_TIMEOUT_DEF;
	new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
	new_conf->ping_int	   = DRBD_PING_INT_DEF;
	new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
	new_conf->max_buffers	   = DRBD_MAX_BUFFERS_DEF;
	new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
	new_conf->sndbuf_size	   = DRBD_SNDBUF_SIZE_DEF;
	new_conf->rcvbuf_size	   = DRBD_RCVBUF_SIZE_DEF;
	new_conf->ko_count	   = DRBD_KO_COUNT_DEF;
	new_conf->after_sb_0p	   = DRBD_AFTER_SB_0P_DEF;
	new_conf->after_sb_1p	   = DRBD_AFTER_SB_1P_DEF;
	new_conf->after_sb_2p	   = DRBD_AFTER_SB_2P_DEF;
	new_conf->want_lose	   = 0;
	new_conf->two_primaries    = 0;
	new_conf->wire_protocol    = DRBD_PROT_C;
	new_conf->ping_timeo	   = DRBD_PING_TIMEO_DEF;
	new_conf->rr_conflict	   = DRBD_RR_CONFLICT_DEF;
	new_conf->on_congestion    = DRBD_ON_CONGESTION_DEF;
	new_conf->cong_extents     = DRBD_CONG_EXTENTS_DEF;

	if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
		retcode = ERR_MANDATORY_TAG;
		goto fail;
	}

	/* two primaries are only accepted together with protocol C */
	if (new_conf->two_primaries
	    && (new_conf->wire_protocol != DRBD_PROT_C)) {
		retcode = ERR_NOT_PROTO_C;
		goto fail;
	}

	/* if we have a local disk: its fencing policy FP_STONITH
	 * is rejected in combination with protocol A */
	if (get_ldev(mdev)) {
		enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
		if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
			retcode = ERR_STONITH_AND_PROT_A;
			goto fail;
		}
	}

	/* any on_congestion policy other than OC_BLOCK requires protocol A */
	if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
		retcode = ERR_CONG_NOT_PROTO_A;
		goto fail;
	}

	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
		retcode = ERR_DISCARD;
		goto fail;
	}

	retcode = NO_ERROR;

	/* make sure neither our local nor the peer address is already
	 * used by the network configuration of any other minor */
	new_my_addr = (struct sockaddr *)&new_conf->my_addr;
	new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev || odev == mdev)
			continue;
		if (get_net_conf(odev)) {
			taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
			if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
			    !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
				retcode = ERR_LOCAL_ADDR;

			taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
			if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
			    !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
				retcode = ERR_PEER_ADDR;

			/* drop the reference before bailing out */
			put_net_conf(odev);
			if (retcode != NO_ERROR)
				goto fail;
		}
	}

	/* optional: transform for the configured cram_hmac_alg, as "hmac(<alg>)" */
	if (new_conf->cram_hmac_alg[0] != 0) {
		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
			new_conf->cram_hmac_alg);
		tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR(tfm)) {
			/* reset to NULL, so the cleanup at "fail" does not
			 * get passed an error pointer */
			tfm = NULL;
			retcode = ERR_AUTH_ALG;
			goto fail;
		}

		if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
			retcode = ERR_AUTH_ALG_ND;
			goto fail;
		}
	}

	/* optional: two transforms for the configured integrity_alg,
	 * stored below as integrity_w_tfm and integrity_r_tfm */
	if (new_conf->integrity_alg[0]) {
		integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR(integrity_w_tfm)) {
			integrity_w_tfm = NULL;
			retcode=ERR_INTEGRITY_ALG;
			goto fail;
		}

		if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
			retcode=ERR_INTEGRITY_ALG_ND;
			goto fail;
		}

		integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR(integrity_r_tfm)) {
			integrity_r_tfm = NULL;
			retcode=ERR_INTEGRITY_ALG;
			goto fail;
		}
	}

	/* tl_hash has max_epoch_size/8 slots;
	 * only allocate a new one if that size changes */
	ns = new_conf->max_epoch_size/8;
	if (mdev->tl_hash_s != ns) {
		new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
		if (!new_tl_hash) {
			retcode = ERR_NOMEM;
			goto fail;
		}
	}

	/* ee_hash has max_buffers/8 slots; a new one is only allocated
	 * with two_primaries set, and only if that size changes */
	ns = new_conf->max_buffers/8;
	if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
		new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
		if (!new_ee_hash) {
			retcode = ERR_NOMEM;
			goto fail;
		}
	}

	/* force NUL termination of the shared secret */
	((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;

	/* three buffers, each of the digest size of the integrity transform */
	if (integrity_w_tfm) {
		i = crypto_hash_digestsize(integrity_w_tfm);
		int_dig_out = kmalloc(i, GFP_KERNEL);
		if (!int_dig_out) {
			retcode = ERR_NOMEM;
			goto fail;
		}
		int_dig_in = kmalloc(i, GFP_KERNEL);
		if (!int_dig_in) {
			retcode = ERR_NOMEM;
			goto fail;
		}
		int_dig_vv = kmalloc(i, GFP_KERNEL);
		if (!int_dig_vv) {
			retcode = ERR_NOMEM;
			goto fail;
		}
	}

	if (!mdev->bitmap) {
		if(drbd_bm_init(mdev)) {
			retcode = ERR_NOMEM;
			goto fail;
		}
	}

	drbd_flush_workqueue(mdev);
	spin_lock_irq(&mdev->req_lock);
	/* re-check under the lock: someone else may have installed
	 * a net_conf in the meantime */
	if (mdev->net_conf != NULL) {
		retcode = ERR_NET_CONFIGURED;
		spin_unlock_irq(&mdev->req_lock);
		goto fail;
	}
	/* From here on mdev owns everything that was allocated above;
	 * the previously installed objects are freed as they get replaced. */
	mdev->net_conf = new_conf;

	mdev->send_cnt = 0;
	mdev->recv_cnt = 0;

	if (new_tl_hash) {
		kfree(mdev->tl_hash);
		mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
		mdev->tl_hash = new_tl_hash;
	}

	if (new_ee_hash) {
		kfree(mdev->ee_hash);
		mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
		mdev->ee_hash = new_ee_hash;
	}

	crypto_free_hash(mdev->cram_hmac_tfm);
	mdev->cram_hmac_tfm = tfm;

	crypto_free_hash(mdev->integrity_w_tfm);
	mdev->integrity_w_tfm = integrity_w_tfm;

	crypto_free_hash(mdev->integrity_r_tfm);
	mdev->integrity_r_tfm = integrity_r_tfm;

	kfree(mdev->int_dig_out);
	kfree(mdev->int_dig_in);
	kfree(mdev->int_dig_vv);
	mdev->int_dig_out=int_dig_out;
	mdev->int_dig_in=int_dig_in;
	mdev->int_dig_vv=int_dig_vv;
	/* NOTE(review): the state change result is stored in an
	 * enum drbd_ret_code variable and reported as is */
	retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
	spin_unlock_irq(&mdev->req_lock);

	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
	reply->ret_code = retcode;
	drbd_reconfig_done(mdev);
	return 0;

fail:
	/* nothing was handed over to mdev: release all we allocated */
	kfree(int_dig_out);
	kfree(int_dig_in);
	kfree(int_dig_vv);
	crypto_free_hash(tfm);
	crypto_free_hash(integrity_w_tfm);
	crypto_free_hash(integrity_r_tfm);
	kfree(new_tl_hash);
	kfree(new_ee_hash);
	kfree(new_conf);

	reply->ret_code = retcode;
	drbd_reconfig_done(mdev);
	return 0;
}
1531
1532static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1533			      struct drbd_nl_cfg_reply *reply)
1534{
1535	int retcode;
1536	struct disconnect dc;
1537
1538	memset(&dc, 0, sizeof(struct disconnect));
1539	if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
1540		retcode = ERR_MANDATORY_TAG;
1541		goto fail;
1542	}
1543
1544	if (dc.force) {
1545		spin_lock_irq(&mdev->req_lock);
1546		if (mdev->state.conn >= C_WF_CONNECTION)
1547			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
1548		spin_unlock_irq(&mdev->req_lock);
1549		goto done;
1550	}
1551
1552	retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1553
1554	if (retcode == SS_NOTHING_TO_DO)
1555		goto done;
1556	else if (retcode == SS_ALREADY_STANDALONE)
1557		goto done;
1558	else if (retcode == SS_PRIMARY_NOP) {
1559		/* Our statche checking code wants to see the peer outdated. */
1560		retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1561						      pdsk, D_OUTDATED));
1562	} else if (retcode == SS_CW_FAILED_BY_PEER) {
1563		/* The peer probably wants to see us outdated. */
1564		retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1565							disk, D_OUTDATED),
1566					      CS_ORDERED);
1567		if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
1568			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1569			retcode = SS_SUCCESS;
1570		}
1571	}
1572
1573	if (retcode < SS_SUCCESS)
1574		goto fail;
1575
1576	if (wait_event_interruptible(mdev->state_wait,
1577				     mdev->state.conn != C_DISCONNECTING)) {
1578		/* Do not test for mdev->state.conn == C_STANDALONE, since
1579		   someone else might connect us in the mean time! */
1580		retcode = ERR_INTR;
1581		goto fail;
1582	}
1583
1584 done:
1585	retcode = NO_ERROR;
1586 fail:
1587	drbd_md_sync(mdev);
1588	reply->ret_code = retcode;
1589	return 0;
1590}
1591
1592void resync_after_online_grow(struct drbd_conf *mdev)
1593{
1594	int iass; /* I am sync source */
1595
1596	dev_info(DEV, "Resync of new storage after online grow\n");
1597	if (mdev->state.role != mdev->state.peer)
1598		iass = (mdev->state.role == R_PRIMARY);
1599	else
1600		iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1601
1602	if (iass)
1603		drbd_start_resync(mdev, C_SYNC_SOURCE);
1604	else
1605		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
1606}
1607
1608static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1609			  struct drbd_nl_cfg_reply *reply)
1610{
1611	struct resize rs;
1612	int retcode = NO_ERROR;
1613	enum determine_dev_size dd;
1614	enum dds_flags ddsf;
1615
1616	memset(&rs, 0, sizeof(struct resize));
1617	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
1618		retcode = ERR_MANDATORY_TAG;
1619		goto fail;
1620	}
1621
1622	if (mdev->state.conn > C_CONNECTED) {
1623		retcode = ERR_RESIZE_RESYNC;
1624		goto fail;
1625	}
1626
1627	if (mdev->state.role == R_SECONDARY &&
1628	    mdev->state.peer == R_SECONDARY) {
1629		retcode = ERR_NO_PRIMARY;
1630		goto fail;
1631	}
1632
1633	if (!get_ldev(mdev)) {
1634		retcode = ERR_NO_DISK;
1635		goto fail;
1636	}
1637
1638	if (rs.no_resync && mdev->agreed_pro_version < 93) {
1639		retcode = ERR_NEED_APV_93;
1640		goto fail;
1641	}
1642
1643	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
1644		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1645
1646	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1647	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
1648	dd = drbd_determin_dev_size(mdev, ddsf);
1649	drbd_md_sync(mdev);
1650	put_ldev(mdev);
1651	if (dd == dev_size_error) {
1652		retcode = ERR_NOMEM_BITMAP;
1653		goto fail;
1654	}
1655
1656	if (mdev->state.conn == C_CONNECTED) {
1657		if (dd == grew)
1658			set_bit(RESIZE_PENDING, &mdev->flags);
1659
1660		drbd_send_uuids(mdev);
1661		drbd_send_sizes(mdev, 1, ddsf);
1662	}
1663
1664 fail:
1665	reply->ret_code = retcode;
1666	return 0;
1667}
1668
1669static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1670			       struct drbd_nl_cfg_reply *reply)
1671{
1672	int retcode = NO_ERROR;
1673	int err;
1674	int ovr; /* online verify running */
1675	int rsr; /* re-sync running */
1676	struct crypto_hash *verify_tfm = NULL;
1677	struct crypto_hash *csums_tfm = NULL;
1678	struct syncer_conf sc;
1679	cpumask_var_t new_cpu_mask;
1680	int *rs_plan_s = NULL;
1681	int fifo_size;
1682
1683	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1684		retcode = ERR_NOMEM;
1685		goto fail;
1686	}
1687
1688	if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
1689		memset(&sc, 0, sizeof(struct syncer_conf));
1690		sc.rate       = DRBD_RATE_DEF;
1691		sc.after      = DRBD_AFTER_DEF;
1692		sc.al_extents = DRBD_AL_EXTENTS_DEF;
1693		sc.on_no_data  = DRBD_ON_NO_DATA_DEF;
1694		sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
1695		sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
1696		sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
1697		sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
1698		sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
1699	} else
1700		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1701
1702	if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
1703		retcode = ERR_MANDATORY_TAG;
1704		goto fail;
1705	}
1706
1707	/* re-sync running */
1708	rsr = (	mdev->state.conn == C_SYNC_SOURCE ||
1709		mdev->state.conn == C_SYNC_TARGET ||
1710		mdev->state.conn == C_PAUSED_SYNC_S ||
1711		mdev->state.conn == C_PAUSED_SYNC_T );
1712
1713	if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
1714		retcode = ERR_CSUMS_RESYNC_RUNNING;
1715		goto fail;
1716	}
1717
1718	if (!rsr && sc.csums_alg[0]) {
1719		csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
1720		if (IS_ERR(csums_tfm)) {
1721			csums_tfm = NULL;
1722			retcode = ERR_CSUMS_ALG;
1723			goto fail;
1724		}
1725
1726		if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
1727			retcode = ERR_CSUMS_ALG_ND;
1728			goto fail;
1729		}
1730	}
1731
1732	/* online verify running */
1733	ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
1734
1735	if (ovr) {
1736		if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
1737			retcode = ERR_VERIFY_RUNNING;
1738			goto fail;
1739		}
1740	}
1741
1742	if (!ovr && sc.verify_alg[0]) {
1743		verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
1744		if (IS_ERR(verify_tfm)) {
1745			verify_tfm = NULL;
1746			retcode = ERR_VERIFY_ALG;
1747			goto fail;
1748		}
1749
1750		if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
1751			retcode = ERR_VERIFY_ALG_ND;
1752			goto fail;
1753		}
1754	}
1755
1756	/* silently ignore cpu mask on UP kernel */
1757	if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
1758		err = __bitmap_parse(sc.cpu_mask, 32, 0,
1759				cpumask_bits(new_cpu_mask), nr_cpu_ids);
1760		if (err) {
1761			dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
1762			retcode = ERR_CPU_MASK_PARSE;
1763			goto fail;
1764		}
1765	}
1766
1767	ERR_IF (sc.rate < 1) sc.rate = 1;
1768	ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
1769#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
1770	if (sc.al_extents > AL_MAX) {
1771		dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
1772		sc.al_extents = AL_MAX;
1773	}
1774#undef AL_MAX
1775
1776	/* to avoid spurious errors when configuring minors before configuring
1777	 * the minors they depend on: if necessary, first create the minor we
1778	 * depend on */
1779	if (sc.after >= 0)
1780		ensure_mdev(sc.after, 1);
1781
1782	/* most sanity checks done, try to assign the new sync-after
1783	 * dependency.  need to hold the global lock in there,
1784	 * to avoid a race in the dependency loop check. */
1785	retcode = drbd_alter_sa(mdev, sc.after);
1786	if (retcode != NO_ERROR)
1787		goto fail;
1788
1789	fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1790	if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
1791		rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
1792		if (!rs_plan_s) {
1793			dev_err(DEV, "kmalloc of fifo_buffer failed");
1794			retcode = ERR_NOMEM;
1795			goto fail;
1796		}
1797	}
1798
1799	/* ok, assign the rest of it as well.
1800	 * lock against receive_SyncParam() */
1801	spin_lock(&mdev->peer_seq_lock);
1802	mdev->sync_conf = sc;
1803
1804	if (!rsr) {
1805		crypto_free_hash(mdev->csums_tfm);
1806		mdev->csums_tfm = csums_tfm;
1807		csums_tfm = NULL;
1808	}
1809
1810	if (!ovr) {
1811		crypto_free_hash(mdev->verify_tfm);
1812		mdev->verify_tfm = verify_tfm;
1813		verify_tfm = NULL;
1814	}
1815
1816	if (fifo_size != mdev->rs_plan_s.size) {
1817		kfree(mdev->rs_plan_s.values);
1818		mdev->rs_plan_s.values = rs_plan_s;
1819		mdev->rs_plan_s.size   = fifo_size;
1820		mdev->rs_planed = 0;
1821		rs_plan_s = NULL;
1822	}
1823
1824	spin_unlock(&mdev->peer_seq_lock);
1825
1826	if (get_ldev(mdev)) {
1827		wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1828		drbd_al_shrink(mdev);
1829		err = drbd_check_al_size(mdev);
1830		lc_unlock(mdev->act_log);
1831		wake_up(&mdev->al_wait);
1832
1833		put_ldev(mdev);
1834		drbd_md_sync(mdev);
1835
1836		if (err) {
1837			retcode = ERR_NOMEM;
1838			goto fail;
1839		}
1840	}
1841
1842	if (mdev->state.conn >= C_CONNECTED)
1843		drbd_send_sync_param(mdev, &sc);
1844
1845	if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
1846		cpumask_copy(mdev->cpu_mask, new_cpu_mask);
1847		drbd_calc_cpu_mask(mdev);
1848		mdev->receiver.reset_cpu_mask = 1;
1849		mdev->asender.reset_cpu_mask = 1;
1850		mdev->worker.reset_cpu_mask = 1;
1851	}
1852
1853	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1854fail:
1855	kfree(rs_plan_s);
1856	free_cpumask_var(new_cpu_mask);
1857	crypto_free_hash(csums_tfm);
1858	crypto_free_hash(verify_tfm);
1859	reply->ret_code = retcode;
1860	return 0;
1861}
1862
1863static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1864			      struct drbd_nl_cfg_reply *reply)
1865{
1866	int retcode;
1867
1868	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1869
1870	if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
1871		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1872
1873	while (retcode == SS_NEED_CONNECTION) {
1874		spin_lock_irq(&mdev->req_lock);
1875		if (mdev->state.conn < C_CONNECTED)
1876			retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
1877		spin_unlock_irq(&mdev->req_lock);
1878
1879		if (retcode != SS_NEED_CONNECTION)
1880			break;
1881
1882		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1883	}
1884
1885	reply->ret_code = retcode;
1886	return 0;
1887}
1888
/* Bitmap IO callback: run drbd_bmio_set_n_write(), then suspend the
 * activity log.  Returns the result of drbd_bmio_set_n_write(). */
static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
{
	const int result = drbd_bmio_set_n_write(mdev);

	/* the activity log is suspended regardless of the result */
	drbd_suspend_al(mdev);

	return result;
}
1897
1898static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1899				   struct drbd_nl_cfg_reply *reply)
1900{
1901	int retcode;
1902
1903	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
1904
1905	if (retcode < SS_SUCCESS) {
1906		if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
1907			/* The peer will get a resync upon connect anyways. Just make that
1908			   into a full resync. */
1909			retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
1910			if (retcode >= SS_SUCCESS) {
1911				/* open coded drbd_bitmap_io() */
1912				if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
1913						   "set_n_write from invalidate_peer"))
1914					retcode = ERR_IO_MD_DISK;
1915			}
1916		} else
1917			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1918	}
1919
1920	reply->ret_code = retcode;
1921	return 0;
1922}
1923
1924static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1925			      struct drbd_nl_cfg_reply *reply)
1926{
1927	int retcode = NO_ERROR;
1928
1929	if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
1930		retcode = ERR_PAUSE_IS_SET;
1931
1932	reply->ret_code = retcode;
1933	return 0;
1934}
1935
1936static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1937			       struct drbd_nl_cfg_reply *reply)
1938{
1939	int retcode = NO_ERROR;
1940
1941	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
1942		retcode = ERR_PAUSE_IS_CLEAR;
1943
1944	reply->ret_code = retcode;
1945	return 0;
1946}
1947
1948static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1949			      struct drbd_nl_cfg_reply *reply)
1950{
1951	reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
1952
1953	return 0;
1954}
1955
1956static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1957			     struct drbd_nl_cfg_reply *reply)
1958{
1959	if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1960		drbd_uuid_new_current(mdev);
1961		clear_bit(NEW_CUR_UUID, &mdev->flags);
1962	}
1963	drbd_suspend_io(mdev);
1964	reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
1965	if (reply->ret_code == SS_SUCCESS) {
1966		if (mdev->state.conn < C_CONNECTED)
1967			tl_clear(mdev);
1968		if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
1969			tl_restart(mdev, fail_frozen_disk_io);
1970	}
1971	drbd_resume_io(mdev);
1972
1973	return 0;
1974}
1975
1976static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1977			   struct drbd_nl_cfg_reply *reply)
1978{
1979	reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
1980	return 0;
1981}
1982
1983static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1984			   struct drbd_nl_cfg_reply *reply)
1985{
1986	unsigned short *tl;
1987
1988	tl = reply->tag_list;
1989
1990	if (get_ldev(mdev)) {
1991		tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
1992		put_ldev(mdev);
1993	}
1994
1995	if (get_net_conf(mdev)) {
1996		tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
1997		put_net_conf(mdev);
1998	}
1999	tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
2000
2001	put_unaligned(TT_END, tl++); /* Close the tag list */
2002
2003	return (int)((char *)tl - (char *)reply->tag_list);
2004}
2005
2006static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2007			     struct drbd_nl_cfg_reply *reply)
2008{
2009	unsigned short *tl = reply->tag_list;
2010	union drbd_state s = mdev->state;
2011	unsigned long rs_left;
2012	unsigned int res;
2013
2014	tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
2015
2016	/* no local ref, no bitmap, no syncer progress. */
2017	if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
2018		if (get_ldev(mdev)) {
2019			drbd_get_syncer_progress(mdev, &rs_left, &res);
2020			tl = tl_add_int(tl, T_sync_progress, &res);
2021			put_ldev(mdev);
2022		}
2023	}
2024	put_unaligned(TT_END, tl++); /* Close the tag list */
2025
2026	return (int)((char *)tl - (char *)reply->tag_list);
2027}
2028
2029static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2030			     struct drbd_nl_cfg_reply *reply)
2031{
2032	unsigned short *tl;
2033
2034	tl = reply->tag_list;
2035
2036	if (get_ldev(mdev)) {
2037		tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
2038		tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
2039		put_ldev(mdev);
2040	}
2041	put_unaligned(TT_END, tl++); /* Close the tag list */
2042
2043	return (int)((char *)tl - (char *)reply->tag_list);
2044}
2045
2046/**
2047 * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
2048 * @mdev:	DRBD device.
2049 * @nlp:	Netlink/connector packet from drbdsetup
2050 * @reply:	Reply packet for drbdsetup
2051 */
2052static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2053				    struct drbd_nl_cfg_reply *reply)
2054{
2055	unsigned short *tl;
2056	char rv;
2057
2058	tl = reply->tag_list;
2059
2060	rv = mdev->state.pdsk == D_OUTDATED        ? UT_PEER_OUTDATED :
2061	  test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
2062
2063	tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
2064	put_unaligned(TT_END, tl++); /* Close the tag list */
2065
2066	return (int)((char *)tl - (char *)reply->tag_list);
2067}
2068
2069static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2070				    struct drbd_nl_cfg_reply *reply)
2071{
2072	/* default to resume from last known position, if possible */
2073	struct start_ov args =
2074		{ .start_sector = mdev->ov_start_sector };
2075
2076	if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
2077		reply->ret_code = ERR_MANDATORY_TAG;
2078		return 0;
2079	}
2080	/* w_make_ov_request expects position to be aligned */
2081	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
2082	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
2083	return 0;
2084}
2085
2086
/* Netlink handler: generate a new current UUID, optionally clearing the
 * bitmap (args.clear_bm).
 *
 * Allowed while C_STANDALONE.  While C_CONNECTED it is only allowed as
 * "skip initial sync": agreed protocol version >= 90, the current UUID is
 * still UUID_JUST_CREATED, and clear_bm was requested.
 *
 * The result is reported in reply->ret_code (ERR_MANDATORY_TAG,
 * ERR_NO_DISK, ERR_CONNECTED, ERR_IO_MD_DISK or NO_ERROR).
 * Returns 0, i.e. no bytes are appended to reply->tag_list. */
static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
			      struct drbd_nl_cfg_reply *reply)
{
	int retcode = NO_ERROR;
	int skip_initial_sync = 0;
	int err;

	struct new_c_uuid args;

	/* start from all-zero arguments; tags present in the request
	 * overwrite them */
	memset(&args, 0, sizeof(struct new_c_uuid));
	if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
		reply->ret_code = ERR_MANDATORY_TAG;
		return 0;
	}

	mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */

	/* everything below touches mdev->ldev, so hold a reference on it */
	if (!get_ldev(mdev)) {
		retcode = ERR_NO_DISK;
		goto out;
	}

	/* this is "skip initial sync", assume to be clean */
	if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
	    mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
		dev_info(DEV, "Preparing to skip initial sync\n");
		skip_initial_sync = 1;
	} else if (mdev->state.conn != C_STANDALONE) {
		retcode = ERR_CONNECTED;
		goto out_dec;
	}

	drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
	drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */

	if (args.clear_bm) {
		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
		if (err) {
			dev_err(DEV, "Writing bitmap failed with %d\n",err);
			retcode = ERR_IO_MD_DISK;
		}
		/* NOTE(review): a failed bitmap write above only sets retcode;
		 * the skip-initial-sync sequence below still runs -- confirm
		 * that this is intended. */
		if (skip_initial_sync) {
			drbd_send_uuids_skip_initial_sync(mdev);
			_drbd_uuid_set(mdev, UI_BITMAP, 0);
			/* the state change is done under req_lock */
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
					CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
	}

	drbd_md_sync(mdev);
out_dec:
	/* drop the reference taken by get_ldev() */
	put_ldev(mdev);
out:
	mutex_unlock(&mdev->state_mutex);

	reply->ret_code = retcode;
	return 0;
}
2147
/* One entry of the netlink packet dispatch table (cnd_table). */
struct cn_handler_struct {
	/* Handler for the packet type; receives the device, the request and
	 * the pre-allocated reply.  drbd_connector_callback() adds its return
	 * value to the length of the reply it sends. */
	int (*function)(struct drbd_conf *,
			 struct drbd_nl_cfg_req *,
			 struct drbd_nl_cfg_reply *);
	/* Extra bytes drbd_connector_callback() allocates for the reply, on
	 * top of the fixed headers; 0 makes the reply a P_return_code_only
	 * packet. */
	int reply_body_size;
};
2154
/* Dispatch table for incoming netlink packets, indexed by packet type.
 * Indices not listed here are zero-initialized, so their .function is NULL;
 * drbd_connector_callback() rejects those with ERR_PACKET_NR.
 * The second member is the reply body size reserved for handlers that
 * return a tag list. */
static struct cn_handler_struct cnd_table[] = {
	[ P_primary ]		= { &drbd_nl_primary,		0 },
	[ P_secondary ]		= { &drbd_nl_secondary,		0 },
	[ P_disk_conf ]		= { &drbd_nl_disk_conf,		0 },
	[ P_detach ]		= { &drbd_nl_detach,		0 },
	[ P_net_conf ]		= { &drbd_nl_net_conf,		0 },
	[ P_disconnect ]	= { &drbd_nl_disconnect,	0 },
	[ P_resize ]		= { &drbd_nl_resize,		0 },
	[ P_syncer_conf ]	= { &drbd_nl_syncer_conf,	0 },
	[ P_invalidate ]	= { &drbd_nl_invalidate,	0 },
	[ P_invalidate_peer ]	= { &drbd_nl_invalidate_peer,	0 },
	[ P_pause_sync ]	= { &drbd_nl_pause_sync,	0 },
	[ P_resume_sync ]	= { &drbd_nl_resume_sync,	0 },
	[ P_suspend_io ]	= { &drbd_nl_suspend_io,	0 },
	[ P_resume_io ]		= { &drbd_nl_resume_io,		0 },
	[ P_outdate ]		= { &drbd_nl_outdate,		0 },
	/* the handlers below answer with a tag list */
	[ P_get_config ]	= { &drbd_nl_get_config,
				    sizeof(struct syncer_conf_tag_len_struct) +
				    sizeof(struct disk_conf_tag_len_struct) +
				    sizeof(struct net_conf_tag_len_struct) },
	[ P_get_state ]		= { &drbd_nl_get_state,
				    sizeof(struct get_state_tag_len_struct) +
				    sizeof(struct sync_progress_tag_len_struct)	},
	[ P_get_uuids ]		= { &drbd_nl_get_uuids,
				    sizeof(struct get_uuids_tag_len_struct) },
	[ P_get_timeout_flag ]	= { &drbd_nl_get_timeout_flag,
				    sizeof(struct get_timeout_flag_tag_len_struct)},
	[ P_start_ov ]		= { &drbd_nl_start_ov,		0 },
	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
};
2185
/* Connector callback: entry point for every configuration request.
 *
 * Checks the sender's CAP_SYS_ADMIN capability, resolves (or, with
 * DRBD_NL_CREATE_DEVICE, creates) the device for the requested minor,
 * validates the packet type, dispatches to the handler from cnd_table and
 * sends the handler's reply.  Every failure before the dispatch is answered
 * with a bare return code via drbd_nl_send_reply().
 *
 * A module reference is held for the duration of the call.
 *
 * NOTE(review): req->len is not checked before fields of the request are
 * read -- confirm that the transport guarantees a complete
 * struct drbd_nl_cfg_req. */
static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
{
	struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
	struct cn_handler_struct *cm;
	struct cn_msg *cn_reply;
	struct drbd_nl_cfg_reply *reply;
	struct drbd_conf *mdev;
	int retcode, rr;
	/* fixed part of every reply: both headers plus room for one short,
	 * i.e. the closing TT_END of the tag list */
	int reply_size = sizeof(struct cn_msg)
		+ sizeof(struct drbd_nl_cfg_reply)
		+ sizeof(short int);

	/* no reply at all if the module reference cannot be taken */
	if (!try_module_get(THIS_MODULE)) {
		printk(KERN_ERR "drbd: try_module_get() failed!\n");
		return;
	}

	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
		retcode = ERR_PERM;
		goto fail;
	}

	mdev = ensure_mdev(nlp->drbd_minor,
			(nlp->flags & DRBD_NL_CREATE_DEVICE));
	if (!mdev) {
		retcode = ERR_MINOR_INVALID;
		goto fail;
	}

	/* range check before the packet type is used as table index */
	if (nlp->packet_type >= P_nl_after_last_packet ||
	    nlp->packet_type == P_return_code_only) {
		retcode = ERR_PACKET_NR;
		goto fail;
	}

	cm = cnd_table + nlp->packet_type;

	/* This may happen if packet number is 0: */
	if (cm->function == NULL) {
		retcode = ERR_PACKET_NR;
		goto fail;
	}

	reply_size += cm->reply_body_size;

	/* allocation not in the IO path, cqueue thread context */
	cn_reply = kzalloc(reply_size, GFP_KERNEL);
	if (!cn_reply) {
		retcode = ERR_NOMEM;
		goto fail;
	}
	reply = (struct drbd_nl_cfg_reply *) cn_reply->data;

	/* handlers without a reply body answer with a plain return code */
	reply->packet_type =
		cm->reply_body_size ? nlp->packet_type : P_return_code_only;
	reply->minor = nlp->drbd_minor;
	reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */
	/* reply->tag_list; might be modified by cm->function. */

	/* rr: number of tag list bytes the handler wrote */
	rr = cm->function(mdev, nlp, reply);

	cn_reply->id = req->id;
	cn_reply->seq = req->seq;
	cn_reply->ack = req->ack  + 1;
	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
	cn_reply->flags = 0;

	/* rr is reused for the send result; -ESRCH is not logged */
	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
	if (rr && rr != -ESRCH)
		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);

	kfree(cn_reply);
	module_put(THIS_MODULE);
	return;
 fail:
	/* error before dispatch: answer with the return code only */
	drbd_nl_send_reply(req, retcode);
	module_put(THIS_MODULE);
}
2264
/* Source of the cn_msg sequence numbers for the drbd_bcast_*() messages in
 * this file; each of them stamps its message with
 * atomic_add_return(1, &drbd_nl_seq). */
static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
2266
2267static unsigned short *
2268__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
2269	unsigned short len, int nul_terminated)
2270{
2271	unsigned short l = tag_descriptions[tag_number(tag)].max_len;
2272	len = (len < l) ? len :  l;
2273	put_unaligned(tag, tl++);
2274	put_unaligned(len, tl++);
2275	memcpy(tl, data, len);
2276	tl = (unsigned short*)((char*)tl + len);
2277	if (nul_terminated)
2278		*((char*)tl - 1) = 0;
2279	return tl;
2280}
2281
/* Append @len bytes of binary @data as tag @tag to the tag list at @tl.
 * Thin wrapper around __tl_add_blob() with nul_terminated == 0; returns the
 * position right behind the payload. */
static unsigned short *
tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
{
	return __tl_add_blob(tl, tag, data, len, 0);
}
2287
2288static unsigned short *
2289tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
2290{
2291	return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
2292}
2293
2294static unsigned short *
2295tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
2296{
2297	put_unaligned(tag, tl++);
2298	switch(tag_type(tag)) {
2299	case TT_INTEGER:
2300		put_unaligned(sizeof(int), tl++);
2301		put_unaligned(*(int *)val, (int *)tl);
2302		tl = (unsigned short*)((char*)tl+sizeof(int));
2303		break;
2304	case TT_INT64:
2305		put_unaligned(sizeof(u64), tl++);
2306		put_unaligned(*(u64 *)val, (u64 *)tl);
2307		tl = (unsigned short*)((char*)tl+sizeof(u64));
2308		break;
2309	default:
2310		/* someone did something stupid. */
2311		;
2312	}
2313	return tl;
2314}
2315
2316void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
2317{
2318	char buffer[sizeof(struct cn_msg)+
2319		    sizeof(struct drbd_nl_cfg_reply)+
2320		    sizeof(struct get_state_tag_len_struct)+
2321		    sizeof(short int)];
2322	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2323	struct drbd_nl_cfg_reply *reply =
2324		(struct drbd_nl_cfg_reply *)cn_reply->data;
2325	unsigned short *tl = reply->tag_list;
2326
2327	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2328
2329	tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
2330
2331	put_unaligned(TT_END, tl++); /* Close the tag list */
2332
2333	cn_reply->id.idx = CN_IDX_DRBD;
2334	cn_reply->id.val = CN_VAL_DRBD;
2335
2336	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2337	cn_reply->ack = 0; /* not used here. */
2338	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2339		(int)((char *)tl - (char *)reply->tag_list);
2340	cn_reply->flags = 0;
2341
2342	reply->packet_type = P_get_state;
2343	reply->minor = mdev_to_minor(mdev);
2344	reply->ret_code = NO_ERROR;
2345
2346	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2347}
2348
2349void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
2350{
2351	char buffer[sizeof(struct cn_msg)+
2352		    sizeof(struct drbd_nl_cfg_reply)+
2353		    sizeof(struct call_helper_tag_len_struct)+
2354		    sizeof(short int)];
2355	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2356	struct drbd_nl_cfg_reply *reply =
2357		(struct drbd_nl_cfg_reply *)cn_reply->data;
2358	unsigned short *tl = reply->tag_list;
2359
2360	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2361
2362	tl = tl_add_str(tl, T_helper, helper_name);
2363	put_unaligned(TT_END, tl++); /* Close the tag list */
2364
2365	cn_reply->id.idx = CN_IDX_DRBD;
2366	cn_reply->id.val = CN_VAL_DRBD;
2367
2368	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2369	cn_reply->ack = 0; /* not used here. */
2370	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2371		(int)((char *)tl - (char *)reply->tag_list);
2372	cn_reply->flags = 0;
2373
2374	reply->packet_type = P_call_helper;
2375	reply->minor = mdev_to_minor(mdev);
2376	reply->ret_code = NO_ERROR;
2377
2378	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2379}
2380
/* Broadcast a P_dump_ee message describing epoch entry @e.
 *
 * The message carries @reason, the two digests @seen_hash and @calc_hash
 * (@dgs bytes each), the entry's sector and block id, and up to the first
 * 32k of the entry's data pages.
 *
 * Nothing is sent if @e is NULL, if @reason is NULL or empty, or if the
 * message buffer cannot be allocated. */
void drbd_bcast_ee(struct drbd_conf *mdev,
		const char *reason, const int dgs,
		const char* seen_hash, const char* calc_hash,
		const struct drbd_epoch_entry* e)
{
	struct cn_msg *cn_reply;
	struct drbd_nl_cfg_reply *reply;
	unsigned short *tl;
	struct page *page;
	unsigned len;

	if (!e)
		return;
	if (!reason || !reason[0])
		return;

	/* apparently we have to memcpy twice, first to prepare the data for the
	 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
	 * netlink skb. */
	/* receiver thread context, which is not in the writeout path (of this node),
	 * but may be in the writeout path of the _other_ node.
	 * GFP_NOIO to avoid potential "distributed deadlock". */
	cn_reply = kzalloc(
		sizeof(struct cn_msg)+
		sizeof(struct drbd_nl_cfg_reply)+
		sizeof(struct dump_ee_tag_len_struct)+
		sizeof(short int),
		GFP_NOIO);

	if (!cn_reply) {
		dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
				(unsigned long long)e->sector, e->size);
		return;
	}

	reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
	tl = reply->tag_list;

	tl = tl_add_str(tl, T_dump_ee_reason, reason);
	tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
	tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
	tl = tl_add_int(tl, T_ee_sector, &e->sector);
	tl = tl_add_int(tl, T_ee_block_id, &e->block_id);

	/* dump the first 32k */
	len = min_t(unsigned, e->size, 32 << 10);
	/* the data tag is written by hand: tag id, length, then the payload
	 * copied page by page below */
	put_unaligned(T_ee_data, tl++);
	put_unaligned(len, tl++);

	/* NOTE(review): the length field above announces len bytes even if
	 * the page chain ends before len reaches 0 -- presumably the chain
	 * always covers e->size; verify against the allocator. */
	page = e->pages;
	page_chain_for_each(page) {
		void *d = kmap_atomic(page, KM_USER0);
		/* copy at most one page per iteration */
		unsigned l = min_t(unsigned, len, PAGE_SIZE);
		memcpy(tl, d, l);
		kunmap_atomic(d, KM_USER0);
		tl = (unsigned short*)((char*)tl + l);
		len -= l;
		if (len == 0)
			break;
	}
	put_unaligned(TT_END, tl++); /* Close the tag list */

	cn_reply->id.idx = CN_IDX_DRBD;
	cn_reply->id.val = CN_VAL_DRBD;

	cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
	cn_reply->ack = 0; /* not used here. */
	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
		(int)((char*)tl - (char*)reply->tag_list);
	cn_reply->flags = 0;

	reply->packet_type = P_dump_ee;
	reply->minor = mdev_to_minor(mdev);
	reply->ret_code = NO_ERROR;

	/* the send result is not evaluated */
	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
	kfree(cn_reply);
}
2459
2460void drbd_bcast_sync_progress(struct drbd_conf *mdev)
2461{
2462	char buffer[sizeof(struct cn_msg)+
2463		    sizeof(struct drbd_nl_cfg_reply)+
2464		    sizeof(struct sync_progress_tag_len_struct)+
2465		    sizeof(short int)];
2466	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2467	struct drbd_nl_cfg_reply *reply =
2468		(struct drbd_nl_cfg_reply *)cn_reply->data;
2469	unsigned short *tl = reply->tag_list;
2470	unsigned long rs_left;
2471	unsigned int res;
2472
2473	/* no local ref, no bitmap, no syncer progress, no broadcast. */
2474	if (!get_ldev(mdev))
2475		return;
2476	drbd_get_syncer_progress(mdev, &rs_left, &res);
2477	put_ldev(mdev);
2478
2479	tl = tl_add_int(tl, T_sync_progress, &res);
2480	put_unaligned(TT_END, tl++); /* Close the tag list */
2481
2482	cn_reply->id.idx = CN_IDX_DRBD;
2483	cn_reply->id.val = CN_VAL_DRBD;
2484
2485	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2486	cn_reply->ack = 0; /* not used here. */
2487	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2488		(int)((char *)tl - (char *)reply->tag_list);
2489	cn_reply->flags = 0;
2490
2491	reply->packet_type = P_sync_progress;
2492	reply->minor = mdev_to_minor(mdev);
2493	reply->ret_code = NO_ERROR;
2494
2495	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2496}
2497
2498int __init drbd_nl_init(void)
2499{
2500	static struct cb_id cn_id_drbd;
2501	int err, try=10;
2502
2503	cn_id_drbd.val = CN_VAL_DRBD;
2504	do {
2505		cn_id_drbd.idx = cn_idx;
2506		err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
2507		if (!err)
2508			break;
2509		cn_idx = (cn_idx + CN_IDX_STEP);
2510	} while (try--);
2511
2512	if (err) {
2513		printk(KERN_ERR "drbd: cn_drbd failed to register\n");
2514		return err;
2515	}
2516
2517	return 0;
2518}
2519
/* Counterpart of drbd_nl_init(): hand the id {cn_idx, CN_VAL_DRBD} to
 * cn_del_callback().  The id is rebuilt from the current value of cn_idx. */
void drbd_nl_cleanup(void)
{
	static struct cb_id cn_id_drbd;

	cn_id_drbd.idx = cn_idx;
	cn_id_drbd.val = CN_VAL_DRBD;

	cn_del_callback(&cn_id_drbd);
}
2529
2530void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2531{
2532	char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
2533	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2534	struct drbd_nl_cfg_reply *reply =
2535		(struct drbd_nl_cfg_reply *)cn_reply->data;
2536	int rr;
2537
2538	memset(buffer, 0, sizeof(buffer));
2539	cn_reply->id = req->id;
2540
2541	cn_reply->seq = req->seq;
2542	cn_reply->ack = req->ack  + 1;
2543	cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2544	cn_reply->flags = 0;
2545
2546	reply->packet_type = P_return_code_only;
2547	reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2548	reply->ret_code = ret_code;
2549
2550	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2551	if (rr && rr != -ESRCH)
2552		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2553}
2554
2555