ip_set_core.c revision 3ace95c0ac125a042cfb682d0a9bbdbf1e5a2c65
1/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
2 *                         Patrick Schaaf <bof@bof.de>
3 * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10/* Kernel module for IP set management */
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/ip.h>
16#include <linux/skbuff.h>
17#include <linux/spinlock.h>
18#include <linux/netlink.h>
19#include <linux/rculist.h>
20#include <net/netlink.h>
21
22#include <linux/netfilter.h>
23#include <linux/netfilter/x_tables.h>
24#include <linux/netfilter/nfnetlink.h>
25#include <linux/netfilter/ipset/ip_set.h>
26
27static LIST_HEAD(ip_set_type_list);		/* all registered set types */
28static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
29static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
30
31static struct ip_set **ip_set_list;		/* all individual sets */
32static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
33
34#define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
35
36static unsigned int max_sets;
37
38module_param(max_sets, int, 0600);
39MODULE_PARM_DESC(max_sets, "maximal number of sets");
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
42MODULE_DESCRIPTION("core IP set support");
43MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
44
45/*
46 * The set types are implemented in modules and registered set types
47 * can be found in ip_set_type_list. Adding/deleting types is
48 * serialized by ip_set_type_mutex.
49 */
50
51static inline void
52ip_set_type_lock(void)
53{
54	mutex_lock(&ip_set_type_mutex);
55}
56
57static inline void
58ip_set_type_unlock(void)
59{
60	mutex_unlock(&ip_set_type_mutex);
61}
62
63/* Register and deregister settype */
64
65static struct ip_set_type *
66find_set_type(const char *name, u8 family, u8 revision)
67{
68	struct ip_set_type *type;
69
70	list_for_each_entry_rcu(type, &ip_set_type_list, list)
71		if (STREQ(type->name, name) &&
72		    (type->family == family ||
73		     type->family == NFPROTO_UNSPEC) &&
74		    revision >= type->revision_min &&
75		    revision <= type->revision_max)
76			return type;
77	return NULL;
78}
79
80/* Unlock, try to load a set type module and lock again */
81static bool
82load_settype(const char *name)
83{
84	nfnl_unlock();
85	pr_debug("try to load ip_set_%s\n", name);
86	if (request_module("ip_set_%s", name) < 0) {
87		pr_warning("Can't find ip_set type %s\n", name);
88		nfnl_lock();
89		return false;
90	}
91	nfnl_lock();
92	return true;
93}
94
95/* Find a set type and reference it */
96#define find_set_type_get(name, family, revision, found)	\
97	__find_set_type_get(name, family, revision, found, false)
98
99static int
100__find_set_type_get(const char *name, u8 family, u8 revision,
101		    struct ip_set_type **found, bool retry)
102{
103	struct ip_set_type *type;
104	int err;
105
106	if (retry && !load_settype(name))
107		return -IPSET_ERR_FIND_TYPE;
108
109	rcu_read_lock();
110	*found = find_set_type(name, family, revision);
111	if (*found) {
112		err = !try_module_get((*found)->me) ? -EFAULT : 0;
113		goto unlock;
114	}
115	/* Make sure the type is already loaded
116	 * but we don't support the revision */
117	list_for_each_entry_rcu(type, &ip_set_type_list, list)
118		if (STREQ(type->name, name)) {
119			err = -IPSET_ERR_FIND_TYPE;
120			goto unlock;
121		}
122	rcu_read_unlock();
123
124	return retry ? -IPSET_ERR_FIND_TYPE :
125		__find_set_type_get(name, family, revision, found, true);
126
127unlock:
128	rcu_read_unlock();
129	return err;
130}
131
132/* Find a given set type by name and family.
133 * If we succeeded, the supported minimal and maximum revisions are
134 * filled out.
135 */
136#define find_set_type_minmax(name, family, min, max) \
137	__find_set_type_minmax(name, family, min, max, false)
138
139static int
140__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
141		       bool retry)
142{
143	struct ip_set_type *type;
144	bool found = false;
145
146	if (retry && !load_settype(name))
147		return -IPSET_ERR_FIND_TYPE;
148
149	*min = 255; *max = 0;
150	rcu_read_lock();
151	list_for_each_entry_rcu(type, &ip_set_type_list, list)
152		if (STREQ(type->name, name) &&
153		    (type->family == family ||
154		     type->family == NFPROTO_UNSPEC)) {
155			found = true;
156			if (type->revision_min < *min)
157				*min = type->revision_min;
158			if (type->revision_max > *max)
159				*max = type->revision_max;
160		}
161	rcu_read_unlock();
162	if (found)
163		return 0;
164
165	return retry ? -IPSET_ERR_FIND_TYPE :
166		__find_set_type_minmax(name, family, min, max, true);
167}
168
/* Printable name of a protocol family: "inet", "inet6" or "any" */
#define family_name(f)					\
	((f) == NFPROTO_IPV4 ? "inet" :			\
	 ((f) == NFPROTO_IPV6 ? "inet6" : "any"))
171
172/* Register a set type structure. The type is identified by
173 * the unique triple of name, family and revision.
174 */
175int
176ip_set_type_register(struct ip_set_type *type)
177{
178	int ret = 0;
179
180	if (type->protocol != IPSET_PROTOCOL) {
181		pr_warning("ip_set type %s, family %s, revision %u:%u uses "
182			   "wrong protocol version %u (want %u)\n",
183			   type->name, family_name(type->family),
184			   type->revision_min, type->revision_max,
185			   type->protocol, IPSET_PROTOCOL);
186		return -EINVAL;
187	}
188
189	ip_set_type_lock();
190	if (find_set_type(type->name, type->family, type->revision_min)) {
191		/* Duplicate! */
192		pr_warning("ip_set type %s, family %s with revision min %u "
193			   "already registered!\n", type->name,
194			   family_name(type->family), type->revision_min);
195		ret = -EINVAL;
196		goto unlock;
197	}
198	list_add_rcu(&type->list, &ip_set_type_list);
199	pr_debug("type %s, family %s, revision %u:%u registered.\n",
200		 type->name, family_name(type->family),
201		 type->revision_min, type->revision_max);
202unlock:
203	ip_set_type_unlock();
204	return ret;
205}
206EXPORT_SYMBOL_GPL(ip_set_type_register);
207
208/* Unregister a set type. There's a small race with ip_set_create */
209void
210ip_set_type_unregister(struct ip_set_type *type)
211{
212	ip_set_type_lock();
213	if (!find_set_type(type->name, type->family, type->revision_min)) {
214		pr_warning("ip_set type %s, family %s with revision min %u "
215			   "not registered\n", type->name,
216			   family_name(type->family), type->revision_min);
217		goto unlock;
218	}
219	list_del_rcu(&type->list);
220	pr_debug("type %s, family %s with revision min %u unregistered.\n",
221		 type->name, family_name(type->family), type->revision_min);
222unlock:
223	ip_set_type_unlock();
224
225	synchronize_rcu();
226}
227EXPORT_SYMBOL_GPL(ip_set_type_unregister);
228
229/* Utility functions */
230void *
231ip_set_alloc(size_t size)
232{
233	void *members = NULL;
234
235	if (size < KMALLOC_MAX_SIZE)
236		members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
237
238	if (members) {
239		pr_debug("%p: allocated with kmalloc\n", members);
240		return members;
241	}
242
243	members = vzalloc(size);
244	if (!members)
245		return NULL;
246	pr_debug("%p: allocated with vmalloc\n", members);
247
248	return members;
249}
250EXPORT_SYMBOL_GPL(ip_set_alloc);
251
252void
253ip_set_free(void *members)
254{
255	pr_debug("%p: free with %s\n", members,
256		 is_vmalloc_addr(members) ? "vfree" : "kfree");
257	if (is_vmalloc_addr(members))
258		vfree(members);
259	else
260		kfree(members);
261}
262EXPORT_SYMBOL_GPL(ip_set_free);
263
264static inline bool
265flag_nested(const struct nlattr *nla)
266{
267	return nla->nla_type & NLA_F_NESTED;
268}
269
270static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
271	[IPSET_ATTR_IPADDR_IPV4]	= { .type = NLA_U32 },
272	[IPSET_ATTR_IPADDR_IPV6]	= { .type = NLA_BINARY,
273					    .len = sizeof(struct in6_addr) },
274};
275
276int
277ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr)
278{
279	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
280
281	if (unlikely(!flag_nested(nla)))
282		return -IPSET_ERR_PROTOCOL;
283	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
284		return -IPSET_ERR_PROTOCOL;
285	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
286		return -IPSET_ERR_PROTOCOL;
287
288	*ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
289	return 0;
290}
291EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
292
293int
294ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
295{
296	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
297
298	if (unlikely(!flag_nested(nla)))
299		return -IPSET_ERR_PROTOCOL;
300
301	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
302		return -IPSET_ERR_PROTOCOL;
303	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
304		return -IPSET_ERR_PROTOCOL;
305
306	memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
307		sizeof(struct in6_addr));
308	return 0;
309}
310EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
311
312/*
313 * Creating/destroying/renaming/swapping affect the existence and
314 * the properties of a set. All of these can be executed from userspace
315 * only and serialized by the nfnl mutex indirectly from nfnetlink.
316 *
317 * Sets are identified by their index in ip_set_list and the index
318 * is used by the external references (set/SET netfilter modules).
319 *
320 * The set behind an index may change by swapping only, from userspace.
321 */
322
323static inline void
324__ip_set_get(ip_set_id_t index)
325{
326	write_lock_bh(&ip_set_ref_lock);
327	ip_set_list[index]->ref++;
328	write_unlock_bh(&ip_set_ref_lock);
329}
330
331static inline void
332__ip_set_put(ip_set_id_t index)
333{
334	write_lock_bh(&ip_set_ref_lock);
335	BUG_ON(ip_set_list[index]->ref == 0);
336	ip_set_list[index]->ref--;
337	write_unlock_bh(&ip_set_ref_lock);
338}
339
/*
 * Add, del and test set entries from kernel.
 *
 * The set behind the index must exist and must be referenced
 * so it can't be destroyed (or changed) under our feet.
 */
346
347int
348ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
349	    const struct xt_action_param *par,
350	    const struct ip_set_adt_opt *opt)
351{
352	struct ip_set *set = ip_set_list[index];
353	int ret = 0;
354
355	BUG_ON(set == NULL);
356	pr_debug("set %s, index %u\n", set->name, index);
357
358	if (opt->dim < set->type->dimension ||
359	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
360		return 0;
361
362	read_lock_bh(&set->lock);
363	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
364	read_unlock_bh(&set->lock);
365
366	if (ret == -EAGAIN) {
367		/* Type requests element to be completed */
368		pr_debug("element must be competed, ADD is triggered\n");
369		write_lock_bh(&set->lock);
370		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
371		write_unlock_bh(&set->lock);
372		ret = 1;
373	}
374
375	/* Convert error codes to nomatch */
376	return (ret < 0 ? 0 : ret);
377}
378EXPORT_SYMBOL_GPL(ip_set_test);
379
380int
381ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
382	   const struct xt_action_param *par,
383	   const struct ip_set_adt_opt *opt)
384{
385	struct ip_set *set = ip_set_list[index];
386	int ret;
387
388	BUG_ON(set == NULL);
389	pr_debug("set %s, index %u\n", set->name, index);
390
391	if (opt->dim < set->type->dimension ||
392	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
393		return 0;
394
395	write_lock_bh(&set->lock);
396	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
397	write_unlock_bh(&set->lock);
398
399	return ret;
400}
401EXPORT_SYMBOL_GPL(ip_set_add);
402
403int
404ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
405	   const struct xt_action_param *par,
406	   const struct ip_set_adt_opt *opt)
407{
408	struct ip_set *set = ip_set_list[index];
409	int ret = 0;
410
411	BUG_ON(set == NULL);
412	pr_debug("set %s, index %u\n", set->name, index);
413
414	if (opt->dim < set->type->dimension ||
415	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
416		return 0;
417
418	write_lock_bh(&set->lock);
419	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
420	write_unlock_bh(&set->lock);
421
422	return ret;
423}
424EXPORT_SYMBOL_GPL(ip_set_del);
425
426/*
427 * Find set by name, reference it once. The reference makes sure the
428 * thing pointed to, does not go away under our feet.
429 *
430 */
431ip_set_id_t
432ip_set_get_byname(const char *name, struct ip_set **set)
433{
434	ip_set_id_t i, index = IPSET_INVALID_ID;
435	struct ip_set *s;
436
437	for (i = 0; i < ip_set_max; i++) {
438		s = ip_set_list[i];
439		if (s != NULL && STREQ(s->name, name)) {
440			__ip_set_get(i);
441			index = i;
442			*set = s;
443		}
444	}
445
446	return index;
447}
448EXPORT_SYMBOL_GPL(ip_set_get_byname);
449
450/*
451 * If the given set pointer points to a valid set, decrement
452 * reference count by 1. The caller shall not assume the index
453 * to be valid, after calling this function.
454 *
455 */
456void
457ip_set_put_byindex(ip_set_id_t index)
458{
459	if (ip_set_list[index] != NULL)
460		__ip_set_put(index);
461}
462EXPORT_SYMBOL_GPL(ip_set_put_byindex);
463
464/*
465 * Get the name of a set behind a set index.
466 * We assume the set is referenced, so it does exist and
467 * can't be destroyed. The set cannot be renamed due to
468 * the referencing either.
469 *
470 */
471const char *
472ip_set_name_byindex(ip_set_id_t index)
473{
474	const struct ip_set *set = ip_set_list[index];
475
476	BUG_ON(set == NULL);
477	BUG_ON(set->ref == 0);
478
479	/* Referenced, so it's safe */
480	return set->name;
481}
482EXPORT_SYMBOL_GPL(ip_set_name_byindex);
483
484/*
485 * Routines to call by external subsystems, which do not
486 * call nfnl_lock for us.
487 */
488
489/*
490 * Find set by name, reference it once. The reference makes sure the
491 * thing pointed to, does not go away under our feet.
492 *
493 * The nfnl mutex is used in the function.
494 */
495ip_set_id_t
496ip_set_nfnl_get(const char *name)
497{
498	struct ip_set *s;
499	ip_set_id_t index;
500
501	nfnl_lock();
502	index = ip_set_get_byname(name, &s);
503	nfnl_unlock();
504
505	return index;
506}
507EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
508
509/*
510 * Find set by index, reference it once. The reference makes sure the
511 * thing pointed to, does not go away under our feet.
512 *
513 * The nfnl mutex is used in the function.
514 */
515ip_set_id_t
516ip_set_nfnl_get_byindex(ip_set_id_t index)
517{
518	if (index > ip_set_max)
519		return IPSET_INVALID_ID;
520
521	nfnl_lock();
522	if (ip_set_list[index])
523		__ip_set_get(index);
524	else
525		index = IPSET_INVALID_ID;
526	nfnl_unlock();
527
528	return index;
529}
530EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
531
532/*
533 * If the given set pointer points to a valid set, decrement
534 * reference count by 1. The caller shall not assume the index
535 * to be valid, after calling this function.
536 *
537 * The nfnl mutex is used in the function.
538 */
539void
540ip_set_nfnl_put(ip_set_id_t index)
541{
542	nfnl_lock();
543	ip_set_put_byindex(index);
544	nfnl_unlock();
545}
546EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
547
548/*
549 * Communication protocol with userspace over netlink.
550 *
551 * The commands are serialized by the nfnl mutex.
552 */
553
554static inline bool
555protocol_failed(const struct nlattr * const tb[])
556{
557	return !tb[IPSET_ATTR_PROTOCOL] ||
558	       nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
559}
560
561static inline u32
562flag_exist(const struct nlmsghdr *nlh)
563{
564	return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
565}
566
567static struct nlmsghdr *
568start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
569	  enum ipset_cmd cmd)
570{
571	struct nlmsghdr *nlh;
572	struct nfgenmsg *nfmsg;
573
574	nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
575			sizeof(*nfmsg), flags);
576	if (nlh == NULL)
577		return NULL;
578
579	nfmsg = nlmsg_data(nlh);
580	nfmsg->nfgen_family = NFPROTO_IPV4;
581	nfmsg->version = NFNETLINK_V0;
582	nfmsg->res_id = 0;
583
584	return nlh;
585}
586
587/* Create a set */
588
589static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
590	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
591	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
592				    .len = IPSET_MAXNAMELEN - 1 },
593	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
594				    .len = IPSET_MAXNAMELEN - 1},
595	[IPSET_ATTR_REVISION]	= { .type = NLA_U8 },
596	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
597	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
598};
599
600static ip_set_id_t
601find_set_id(const char *name)
602{
603	ip_set_id_t i, index = IPSET_INVALID_ID;
604	const struct ip_set *set;
605
606	for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) {
607		set = ip_set_list[i];
608		if (set != NULL && STREQ(set->name, name))
609			index = i;
610	}
611	return index;
612}
613
614static inline struct ip_set *
615find_set(const char *name)
616{
617	ip_set_id_t index = find_set_id(name);
618
619	return index == IPSET_INVALID_ID ? NULL : ip_set_list[index];
620}
621
622static int
623find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
624{
625	ip_set_id_t i;
626
627	*index = IPSET_INVALID_ID;
628	for (i = 0;  i < ip_set_max; i++) {
629		if (ip_set_list[i] == NULL) {
630			if (*index == IPSET_INVALID_ID)
631				*index = i;
632		} else if (STREQ(name, ip_set_list[i]->name)) {
633			/* Name clash */
634			*set = ip_set_list[i];
635			return -EEXIST;
636		}
637	}
638	if (*index == IPSET_INVALID_ID)
639		/* No free slot remained */
640		return -IPSET_ERR_MAX_SETS;
641	return 0;
642}
643
644static int
645ip_set_none(struct sock *ctnl, struct sk_buff *skb,
646	    const struct nlmsghdr *nlh,
647	    const struct nlattr * const attr[])
648{
649	return -EOPNOTSUPP;
650}
651
652static int
653ip_set_create(struct sock *ctnl, struct sk_buff *skb,
654	      const struct nlmsghdr *nlh,
655	      const struct nlattr * const attr[])
656{
657	struct ip_set *set, *clash = NULL;
658	ip_set_id_t index = IPSET_INVALID_ID;
659	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
660	const char *name, *typename;
661	u8 family, revision;
662	u32 flags = flag_exist(nlh);
663	int ret = 0;
664
665	if (unlikely(protocol_failed(attr) ||
666		     attr[IPSET_ATTR_SETNAME] == NULL ||
667		     attr[IPSET_ATTR_TYPENAME] == NULL ||
668		     attr[IPSET_ATTR_REVISION] == NULL ||
669		     attr[IPSET_ATTR_FAMILY] == NULL ||
670		     (attr[IPSET_ATTR_DATA] != NULL &&
671		      !flag_nested(attr[IPSET_ATTR_DATA]))))
672		return -IPSET_ERR_PROTOCOL;
673
674	name = nla_data(attr[IPSET_ATTR_SETNAME]);
675	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
676	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
677	revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
678	pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
679		 name, typename, family_name(family), revision);
680
681	/*
682	 * First, and without any locks, allocate and initialize
683	 * a normal base set structure.
684	 */
685	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
686	if (!set)
687		return -ENOMEM;
688	rwlock_init(&set->lock);
689	strlcpy(set->name, name, IPSET_MAXNAMELEN);
690	set->family = family;
691	set->revision = revision;
692
693	/*
694	 * Next, check that we know the type, and take
695	 * a reference on the type, to make sure it stays available
696	 * while constructing our new set.
697	 *
698	 * After referencing the type, we try to create the type
699	 * specific part of the set without holding any locks.
700	 */
701	ret = find_set_type_get(typename, family, revision, &(set->type));
702	if (ret)
703		goto out;
704
705	/*
706	 * Without holding any locks, create private part.
707	 */
708	if (attr[IPSET_ATTR_DATA] &&
709	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
710			     set->type->create_policy)) {
711		ret = -IPSET_ERR_PROTOCOL;
712		goto put_out;
713	}
714
715	ret = set->type->create(set, tb, flags);
716	if (ret != 0)
717		goto put_out;
718
719	/* BTW, ret==0 here. */
720
721	/*
722	 * Here, we have a valid, constructed set and we are protected
723	 * by the nfnl mutex. Find the first free index in ip_set_list
724	 * and check clashing.
725	 */
726	ret = find_free_id(set->name, &index, &clash);
727	if (ret != 0) {
728		/* If this is the same set and requested, ignore error */
729		if (ret == -EEXIST &&
730		    (flags & IPSET_FLAG_EXIST) &&
731		    STREQ(set->type->name, clash->type->name) &&
732		    set->type->family == clash->type->family &&
733		    set->type->revision_min == clash->type->revision_min &&
734		    set->type->revision_max == clash->type->revision_max &&
735		    set->variant->same_set(set, clash))
736			ret = 0;
737		goto cleanup;
738	}
739
740	/*
741	 * Finally! Add our shiny new set to the list, and be done.
742	 */
743	pr_debug("create: '%s' created with index %u!\n", set->name, index);
744	ip_set_list[index] = set;
745
746	return ret;
747
748cleanup:
749	set->variant->destroy(set);
750put_out:
751	module_put(set->type->me);
752out:
753	kfree(set);
754	return ret;
755}
756
757/* Destroy sets */
758
759static const struct nla_policy
760ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
761	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
762	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
763				    .len = IPSET_MAXNAMELEN - 1 },
764};
765
766static void
767ip_set_destroy_set(ip_set_id_t index)
768{
769	struct ip_set *set = ip_set_list[index];
770
771	pr_debug("set: %s\n",  set->name);
772	ip_set_list[index] = NULL;
773
774	/* Must call it without holding any lock */
775	set->variant->destroy(set);
776	module_put(set->type->me);
777	kfree(set);
778}
779
780static int
781ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
782	       const struct nlmsghdr *nlh,
783	       const struct nlattr * const attr[])
784{
785	ip_set_id_t i;
786	int ret = 0;
787
788	if (unlikely(protocol_failed(attr)))
789		return -IPSET_ERR_PROTOCOL;
790
791	/* Commands are serialized and references are
792	 * protected by the ip_set_ref_lock.
793	 * External systems (i.e. xt_set) must call
794	 * ip_set_put|get_nfnl_* functions, that way we
795	 * can safely check references here.
796	 *
797	 * list:set timer can only decrement the reference
798	 * counter, so if it's already zero, we can proceed
799	 * without holding the lock.
800	 */
801	read_lock_bh(&ip_set_ref_lock);
802	if (!attr[IPSET_ATTR_SETNAME]) {
803		for (i = 0; i < ip_set_max; i++) {
804			if (ip_set_list[i] != NULL && ip_set_list[i]->ref) {
805				ret = -IPSET_ERR_BUSY;
806				goto out;
807			}
808		}
809		read_unlock_bh(&ip_set_ref_lock);
810		for (i = 0; i < ip_set_max; i++) {
811			if (ip_set_list[i] != NULL)
812				ip_set_destroy_set(i);
813		}
814	} else {
815		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
816		if (i == IPSET_INVALID_ID) {
817			ret = -ENOENT;
818			goto out;
819		} else if (ip_set_list[i]->ref) {
820			ret = -IPSET_ERR_BUSY;
821			goto out;
822		}
823		read_unlock_bh(&ip_set_ref_lock);
824
825		ip_set_destroy_set(i);
826	}
827	return 0;
828out:
829	read_unlock_bh(&ip_set_ref_lock);
830	return ret;
831}
832
833/* Flush sets */
834
835static void
836ip_set_flush_set(struct ip_set *set)
837{
838	pr_debug("set: %s\n",  set->name);
839
840	write_lock_bh(&set->lock);
841	set->variant->flush(set);
842	write_unlock_bh(&set->lock);
843}
844
845static int
846ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
847	     const struct nlmsghdr *nlh,
848	     const struct nlattr * const attr[])
849{
850	ip_set_id_t i;
851
852	if (unlikely(protocol_failed(attr)))
853		return -IPSET_ERR_PROTOCOL;
854
855	if (!attr[IPSET_ATTR_SETNAME]) {
856		for (i = 0; i < ip_set_max; i++)
857			if (ip_set_list[i] != NULL)
858				ip_set_flush_set(ip_set_list[i]);
859	} else {
860		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
861		if (i == IPSET_INVALID_ID)
862			return -ENOENT;
863
864		ip_set_flush_set(ip_set_list[i]);
865	}
866
867	return 0;
868}
869
870/* Rename a set */
871
872static const struct nla_policy
873ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
874	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
875	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
876				    .len = IPSET_MAXNAMELEN - 1 },
877	[IPSET_ATTR_SETNAME2]	= { .type = NLA_NUL_STRING,
878				    .len = IPSET_MAXNAMELEN - 1 },
879};
880
881static int
882ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
883	      const struct nlmsghdr *nlh,
884	      const struct nlattr * const attr[])
885{
886	struct ip_set *set;
887	const char *name2;
888	ip_set_id_t i;
889	int ret = 0;
890
891	if (unlikely(protocol_failed(attr) ||
892		     attr[IPSET_ATTR_SETNAME] == NULL ||
893		     attr[IPSET_ATTR_SETNAME2] == NULL))
894		return -IPSET_ERR_PROTOCOL;
895
896	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
897	if (set == NULL)
898		return -ENOENT;
899
900	read_lock_bh(&ip_set_ref_lock);
901	if (set->ref != 0) {
902		ret = -IPSET_ERR_REFERENCED;
903		goto out;
904	}
905
906	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
907	for (i = 0; i < ip_set_max; i++) {
908		if (ip_set_list[i] != NULL &&
909		    STREQ(ip_set_list[i]->name, name2)) {
910			ret = -IPSET_ERR_EXIST_SETNAME2;
911			goto out;
912		}
913	}
914	strncpy(set->name, name2, IPSET_MAXNAMELEN);
915
916out:
917	read_unlock_bh(&ip_set_ref_lock);
918	return ret;
919}
920
921/* Swap two sets so that name/index points to the other.
922 * References and set names are also swapped.
923 *
924 * The commands are serialized by the nfnl mutex and references are
925 * protected by the ip_set_ref_lock. The kernel interfaces
926 * do not hold the mutex but the pointer settings are atomic
927 * so the ip_set_list always contains valid pointers to the sets.
928 */
929
930static int
931ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
932	    const struct nlmsghdr *nlh,
933	    const struct nlattr * const attr[])
934{
935	struct ip_set *from, *to;
936	ip_set_id_t from_id, to_id;
937	char from_name[IPSET_MAXNAMELEN];
938
939	if (unlikely(protocol_failed(attr) ||
940		     attr[IPSET_ATTR_SETNAME] == NULL ||
941		     attr[IPSET_ATTR_SETNAME2] == NULL))
942		return -IPSET_ERR_PROTOCOL;
943
944	from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
945	if (from_id == IPSET_INVALID_ID)
946		return -ENOENT;
947
948	to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2]));
949	if (to_id == IPSET_INVALID_ID)
950		return -IPSET_ERR_EXIST_SETNAME2;
951
952	from = ip_set_list[from_id];
953	to = ip_set_list[to_id];
954
955	/* Features must not change.
956	 * Not an artificial restriction anymore, as we must prevent
957	 * possible loops created by swapping in setlist type of sets. */
958	if (!(from->type->features == to->type->features &&
959	      from->type->family == to->type->family))
960		return -IPSET_ERR_TYPE_MISMATCH;
961
962	strncpy(from_name, from->name, IPSET_MAXNAMELEN);
963	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
964	strncpy(to->name, from_name, IPSET_MAXNAMELEN);
965
966	write_lock_bh(&ip_set_ref_lock);
967	swap(from->ref, to->ref);
968	ip_set_list[from_id] = to;
969	ip_set_list[to_id] = from;
970	write_unlock_bh(&ip_set_ref_lock);
971
972	return 0;
973}
974
975/* List/save set data */
976
/* Dump types, stored in the lower 16 bits of the dump state */
#define DUMP_INIT		0
#define DUMP_ALL		1
#define DUMP_ONE		2
#define DUMP_LAST		3

/* Lower 16 bits of the dump state: the dump type */
#define DUMP_TYPE(arg)		(((u32)(arg)) & 0x0000FFFF)
/* Upper 16 bits of the (32 bit) dump state: the dump flags */
#define DUMP_FLAGS(arg)		(((u32)(arg)) >> 16)
984
985static int
986ip_set_dump_done(struct netlink_callback *cb)
987{
988	if (cb->args[2]) {
989		pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
990		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
991	}
992	return 0;
993}
994
995static inline void
996dump_attrs(struct nlmsghdr *nlh)
997{
998	const struct nlattr *attr;
999	int rem;
1000
1001	pr_debug("dump nlmsg\n");
1002	nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) {
1003		pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len);
1004	}
1005}
1006
1007static int
1008dump_init(struct netlink_callback *cb)
1009{
1010	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
1011	int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
1012	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
1013	struct nlattr *attr = (void *)nlh + min_len;
1014	u32 dump_type;
1015	ip_set_id_t index;
1016
1017	/* Second pass, so parser can't fail */
1018	nla_parse(cda, IPSET_ATTR_CMD_MAX,
1019		  attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
1020
1021	/* cb->args[0] : dump single set/all sets
1022	 *         [1] : set index
1023	 *         [..]: type specific
1024	 */
1025
1026	if (cda[IPSET_ATTR_SETNAME]) {
1027		index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
1028		if (index == IPSET_INVALID_ID)
1029			return -ENOENT;
1030
1031		dump_type = DUMP_ONE;
1032		cb->args[1] = index;
1033	} else
1034		dump_type = DUMP_ALL;
1035
1036	if (cda[IPSET_ATTR_FLAGS]) {
1037		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
1038		dump_type |= (f << 16);
1039	}
1040	cb->args[0] = dump_type;
1041
1042	return 0;
1043}
1044
1045static int
1046ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
1047{
1048	ip_set_id_t index = IPSET_INVALID_ID, max;
1049	struct ip_set *set = NULL;
1050	struct nlmsghdr *nlh = NULL;
1051	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
1052	u32 dump_type, dump_flags;
1053	int ret = 0;
1054
1055	if (!cb->args[0]) {
1056		ret = dump_init(cb);
1057		if (ret < 0) {
1058			nlh = nlmsg_hdr(cb->skb);
1059			/* We have to create and send the error message
1060			 * manually :-( */
1061			if (nlh->nlmsg_flags & NLM_F_ACK)
1062				netlink_ack(cb->skb, nlh, ret);
1063			return ret;
1064		}
1065	}
1066
1067	if (cb->args[1] >= ip_set_max)
1068		goto out;
1069
1070	dump_type = DUMP_TYPE(cb->args[0]);
1071	dump_flags = DUMP_FLAGS(cb->args[0]);
1072	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
1073dump_last:
1074	pr_debug("args[0]: %u %u args[1]: %ld\n",
1075		 dump_type, dump_flags, cb->args[1]);
1076	for (; cb->args[1] < max; cb->args[1]++) {
1077		index = (ip_set_id_t) cb->args[1];
1078		set = ip_set_list[index];
1079		if (set == NULL) {
1080			if (dump_type == DUMP_ONE) {
1081				ret = -ENOENT;
1082				goto out;
1083			}
1084			continue;
1085		}
1086		/* When dumping all sets, we must dump "sorted"
1087		 * so that lists (unions of sets) are dumped last.
1088		 */
1089		if (dump_type != DUMP_ONE &&
1090		    ((dump_type == DUMP_ALL) ==
1091		     !!(set->type->features & IPSET_DUMP_LAST)))
1092			continue;
1093		pr_debug("List set: %s\n", set->name);
1094		if (!cb->args[2]) {
1095			/* Start listing: make sure set won't be destroyed */
1096			pr_debug("reference set\n");
1097			__ip_set_get(index);
1098		}
1099		nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
1100				cb->nlh->nlmsg_seq, flags,
1101				IPSET_CMD_LIST);
1102		if (!nlh) {
1103			ret = -EMSGSIZE;
1104			goto release_refcount;
1105		}
1106		if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
1107		    nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
1108			goto nla_put_failure;
1109		if (dump_flags & IPSET_FLAG_LIST_SETNAME)
1110			goto next_set;
1111		switch (cb->args[2]) {
1112		case 0:
1113			/* Core header data */
1114			if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
1115					   set->type->name) ||
1116			    nla_put_u8(skb, IPSET_ATTR_FAMILY,
1117				       set->family) ||
1118			    nla_put_u8(skb, IPSET_ATTR_REVISION,
1119				       set->revision))
1120				goto nla_put_failure;
1121			ret = set->variant->head(set, skb);
1122			if (ret < 0)
1123				goto release_refcount;
1124			if (dump_flags & IPSET_FLAG_LIST_HEADER)
1125				goto next_set;
1126			/* Fall through and add elements */
1127		default:
1128			read_lock_bh(&set->lock);
1129			ret = set->variant->list(set, skb, cb);
1130			read_unlock_bh(&set->lock);
1131			if (!cb->args[2])
1132				/* Set is done, proceed with next one */
1133				goto next_set;
1134			goto release_refcount;
1135		}
1136	}
1137	/* If we dump all sets, continue with dumping last ones */
1138	if (dump_type == DUMP_ALL) {
1139		dump_type = DUMP_LAST;
1140		cb->args[0] = dump_type | (dump_flags << 16);
1141		cb->args[1] = 0;
1142		goto dump_last;
1143	}
1144	goto out;
1145
1146nla_put_failure:
1147	ret = -EFAULT;
1148next_set:
1149	if (dump_type == DUMP_ONE)
1150		cb->args[1] = IPSET_INVALID_ID;
1151	else
1152		cb->args[1]++;
1153release_refcount:
1154	/* If there was an error or set is done, release set */
1155	if (ret || !cb->args[2]) {
1156		pr_debug("release set %s\n", ip_set_list[index]->name);
1157		ip_set_put_byindex(index);
1158		cb->args[2] = 0;
1159	}
1160out:
1161	if (nlh) {
1162		nlmsg_end(skb, nlh);
1163		pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
1164		dump_attrs(nlh);
1165	}
1166
1167	return ret < 0 ? ret : skb->len;
1168}
1169
1170static int
1171ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
1172	    const struct nlmsghdr *nlh,
1173	    const struct nlattr * const attr[])
1174{
1175	if (unlikely(protocol_failed(attr)))
1176		return -IPSET_ERR_PROTOCOL;
1177
1178	{
1179		struct netlink_dump_control c = {
1180			.dump = ip_set_dump_start,
1181			.done = ip_set_dump_done,
1182		};
1183		return netlink_dump_start(ctnl, skb, nlh, &c);
1184	}
1185}
1186
1187/* Add, del and test */
1188
1189static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
1190	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
1191	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
1192				    .len = IPSET_MAXNAMELEN - 1 },
1193	[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
1194	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
1195	[IPSET_ATTR_ADT]	= { .type = NLA_NESTED },
1196};
1197
1198static int
1199call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
1200	struct nlattr *tb[], enum ipset_adt adt,
1201	u32 flags, bool use_lineno)
1202{
1203	int ret;
1204	u32 lineno = 0;
1205	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
1206
1207	do {
1208		write_lock_bh(&set->lock);
1209		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
1210		write_unlock_bh(&set->lock);
1211		retried = true;
1212	} while (ret == -EAGAIN &&
1213		 set->variant->resize &&
1214		 (ret = set->variant->resize(set, retried)) == 0);
1215
1216	if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
1217		return 0;
1218	if (lineno && use_lineno) {
1219		/* Error in restore/batch mode: send back lineno */
1220		struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
1221		struct sk_buff *skb2;
1222		struct nlmsgerr *errmsg;
1223		size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
1224		int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
1225		struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
1226		struct nlattr *cmdattr;
1227		u32 *errline;
1228
1229		skb2 = nlmsg_new(payload, GFP_KERNEL);
1230		if (skb2 == NULL)
1231			return -ENOMEM;
1232		rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
1233				  nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
1234		errmsg = nlmsg_data(rep);
1235		errmsg->error = ret;
1236		memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
1237		cmdattr = (void *)&errmsg->msg + min_len;
1238
1239		nla_parse(cda, IPSET_ATTR_CMD_MAX,
1240			  cmdattr, nlh->nlmsg_len - min_len,
1241			  ip_set_adt_policy);
1242
1243		errline = nla_data(cda[IPSET_ATTR_LINENO]);
1244
1245		*errline = lineno;
1246
1247		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1248		/* Signal netlink not to send its ACK/errmsg.  */
1249		return -EINTR;
1250	}
1251
1252	return ret;
1253}
1254
1255static int
1256ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
1257	    const struct nlmsghdr *nlh,
1258	    const struct nlattr * const attr[])
1259{
1260	struct ip_set *set;
1261	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1262	const struct nlattr *nla;
1263	u32 flags = flag_exist(nlh);
1264	bool use_lineno;
1265	int ret = 0;
1266
1267	if (unlikely(protocol_failed(attr) ||
1268		     attr[IPSET_ATTR_SETNAME] == NULL ||
1269		     !((attr[IPSET_ATTR_DATA] != NULL) ^
1270		       (attr[IPSET_ATTR_ADT] != NULL)) ||
1271		     (attr[IPSET_ATTR_DATA] != NULL &&
1272		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
1273		     (attr[IPSET_ATTR_ADT] != NULL &&
1274		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
1275		       attr[IPSET_ATTR_LINENO] == NULL))))
1276		return -IPSET_ERR_PROTOCOL;
1277
1278	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1279	if (set == NULL)
1280		return -ENOENT;
1281
1282	use_lineno = !!attr[IPSET_ATTR_LINENO];
1283	if (attr[IPSET_ATTR_DATA]) {
1284		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
1285				     attr[IPSET_ATTR_DATA],
1286				     set->type->adt_policy))
1287			return -IPSET_ERR_PROTOCOL;
1288		ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
1289			      use_lineno);
1290	} else {
1291		int nla_rem;
1292
1293		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
1294			memset(tb, 0, sizeof(tb));
1295			if (nla_type(nla) != IPSET_ATTR_DATA ||
1296			    !flag_nested(nla) ||
1297			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
1298					     set->type->adt_policy))
1299				return -IPSET_ERR_PROTOCOL;
1300			ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
1301				      flags, use_lineno);
1302			if (ret < 0)
1303				return ret;
1304		}
1305	}
1306	return ret;
1307}
1308
1309static int
1310ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
1311	    const struct nlmsghdr *nlh,
1312	    const struct nlattr * const attr[])
1313{
1314	struct ip_set *set;
1315	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1316	const struct nlattr *nla;
1317	u32 flags = flag_exist(nlh);
1318	bool use_lineno;
1319	int ret = 0;
1320
1321	if (unlikely(protocol_failed(attr) ||
1322		     attr[IPSET_ATTR_SETNAME] == NULL ||
1323		     !((attr[IPSET_ATTR_DATA] != NULL) ^
1324		       (attr[IPSET_ATTR_ADT] != NULL)) ||
1325		     (attr[IPSET_ATTR_DATA] != NULL &&
1326		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
1327		     (attr[IPSET_ATTR_ADT] != NULL &&
1328		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
1329		       attr[IPSET_ATTR_LINENO] == NULL))))
1330		return -IPSET_ERR_PROTOCOL;
1331
1332	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1333	if (set == NULL)
1334		return -ENOENT;
1335
1336	use_lineno = !!attr[IPSET_ATTR_LINENO];
1337	if (attr[IPSET_ATTR_DATA]) {
1338		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
1339				     attr[IPSET_ATTR_DATA],
1340				     set->type->adt_policy))
1341			return -IPSET_ERR_PROTOCOL;
1342		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
1343			      use_lineno);
1344	} else {
1345		int nla_rem;
1346
1347		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
1348			memset(tb, 0, sizeof(*tb));
1349			if (nla_type(nla) != IPSET_ATTR_DATA ||
1350			    !flag_nested(nla) ||
1351			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
1352					     set->type->adt_policy))
1353				return -IPSET_ERR_PROTOCOL;
1354			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
1355				      flags, use_lineno);
1356			if (ret < 0)
1357				return ret;
1358		}
1359	}
1360	return ret;
1361}
1362
1363static int
1364ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
1365	     const struct nlmsghdr *nlh,
1366	     const struct nlattr * const attr[])
1367{
1368	struct ip_set *set;
1369	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1370	int ret = 0;
1371
1372	if (unlikely(protocol_failed(attr) ||
1373		     attr[IPSET_ATTR_SETNAME] == NULL ||
1374		     attr[IPSET_ATTR_DATA] == NULL ||
1375		     !flag_nested(attr[IPSET_ATTR_DATA])))
1376		return -IPSET_ERR_PROTOCOL;
1377
1378	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1379	if (set == NULL)
1380		return -ENOENT;
1381
1382	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
1383			     set->type->adt_policy))
1384		return -IPSET_ERR_PROTOCOL;
1385
1386	read_lock_bh(&set->lock);
1387	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
1388	read_unlock_bh(&set->lock);
1389	/* Userspace can't trigger element to be re-added */
1390	if (ret == -EAGAIN)
1391		ret = 1;
1392
1393	return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST;
1394}
1395
/* Get header data of a set */
1397
1398static int
1399ip_set_header(struct sock *ctnl, struct sk_buff *skb,
1400	      const struct nlmsghdr *nlh,
1401	      const struct nlattr * const attr[])
1402{
1403	const struct ip_set *set;
1404	struct sk_buff *skb2;
1405	struct nlmsghdr *nlh2;
1406	ip_set_id_t index;
1407	int ret = 0;
1408
1409	if (unlikely(protocol_failed(attr) ||
1410		     attr[IPSET_ATTR_SETNAME] == NULL))
1411		return -IPSET_ERR_PROTOCOL;
1412
1413	index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
1414	if (index == IPSET_INVALID_ID)
1415		return -ENOENT;
1416	set = ip_set_list[index];
1417
1418	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1419	if (skb2 == NULL)
1420		return -ENOMEM;
1421
1422	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1423			 IPSET_CMD_HEADER);
1424	if (!nlh2)
1425		goto nlmsg_failure;
1426	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
1427	    nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
1428	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
1429	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
1430	    nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision))
1431		goto nla_put_failure;
1432	nlmsg_end(skb2, nlh2);
1433
1434	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1435	if (ret < 0)
1436		return ret;
1437
1438	return 0;
1439
1440nla_put_failure:
1441	nlmsg_cancel(skb2, nlh2);
1442nlmsg_failure:
1443	kfree_skb(skb2);
1444	return -EMSGSIZE;
1445}
1446
1447/* Get type data */
1448
1449static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
1450	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
1451	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
1452				    .len = IPSET_MAXNAMELEN - 1 },
1453	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
1454};
1455
1456static int
1457ip_set_type(struct sock *ctnl, struct sk_buff *skb,
1458	    const struct nlmsghdr *nlh,
1459	    const struct nlattr * const attr[])
1460{
1461	struct sk_buff *skb2;
1462	struct nlmsghdr *nlh2;
1463	u8 family, min, max;
1464	const char *typename;
1465	int ret = 0;
1466
1467	if (unlikely(protocol_failed(attr) ||
1468		     attr[IPSET_ATTR_TYPENAME] == NULL ||
1469		     attr[IPSET_ATTR_FAMILY] == NULL))
1470		return -IPSET_ERR_PROTOCOL;
1471
1472	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
1473	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
1474	ret = find_set_type_minmax(typename, family, &min, &max);
1475	if (ret)
1476		return ret;
1477
1478	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1479	if (skb2 == NULL)
1480		return -ENOMEM;
1481
1482	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1483			 IPSET_CMD_TYPE);
1484	if (!nlh2)
1485		goto nlmsg_failure;
1486	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
1487	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
1488	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
1489	    nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
1490	    nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min))
1491		goto nla_put_failure;
1492	nlmsg_end(skb2, nlh2);
1493
1494	pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
1495	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1496	if (ret < 0)
1497		return ret;
1498
1499	return 0;
1500
1501nla_put_failure:
1502	nlmsg_cancel(skb2, nlh2);
1503nlmsg_failure:
1504	kfree_skb(skb2);
1505	return -EMSGSIZE;
1506}
1507
1508/* Get protocol version */
1509
1510static const struct nla_policy
1511ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
1512	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
1513};
1514
1515static int
1516ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
1517		const struct nlmsghdr *nlh,
1518		const struct nlattr * const attr[])
1519{
1520	struct sk_buff *skb2;
1521	struct nlmsghdr *nlh2;
1522	int ret = 0;
1523
1524	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
1525		return -IPSET_ERR_PROTOCOL;
1526
1527	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1528	if (skb2 == NULL)
1529		return -ENOMEM;
1530
1531	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1532			 IPSET_CMD_PROTOCOL);
1533	if (!nlh2)
1534		goto nlmsg_failure;
1535	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
1536		goto nla_put_failure;
1537	nlmsg_end(skb2, nlh2);
1538
1539	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1540	if (ret < 0)
1541		return ret;
1542
1543	return 0;
1544
1545nla_put_failure:
1546	nlmsg_cancel(skb2, nlh2);
1547nlmsg_failure:
1548	kfree_skb(skb2);
1549	return -EMSGSIZE;
1550}
1551
1552static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
1553	[IPSET_CMD_NONE]	= {
1554		.call		= ip_set_none,
1555		.attr_count	= IPSET_ATTR_CMD_MAX,
1556	},
1557	[IPSET_CMD_CREATE]	= {
1558		.call		= ip_set_create,
1559		.attr_count	= IPSET_ATTR_CMD_MAX,
1560		.policy		= ip_set_create_policy,
1561	},
1562	[IPSET_CMD_DESTROY]	= {
1563		.call		= ip_set_destroy,
1564		.attr_count	= IPSET_ATTR_CMD_MAX,
1565		.policy		= ip_set_setname_policy,
1566	},
1567	[IPSET_CMD_FLUSH]	= {
1568		.call		= ip_set_flush,
1569		.attr_count	= IPSET_ATTR_CMD_MAX,
1570		.policy		= ip_set_setname_policy,
1571	},
1572	[IPSET_CMD_RENAME]	= {
1573		.call		= ip_set_rename,
1574		.attr_count	= IPSET_ATTR_CMD_MAX,
1575		.policy		= ip_set_setname2_policy,
1576	},
1577	[IPSET_CMD_SWAP]	= {
1578		.call		= ip_set_swap,
1579		.attr_count	= IPSET_ATTR_CMD_MAX,
1580		.policy		= ip_set_setname2_policy,
1581	},
1582	[IPSET_CMD_LIST]	= {
1583		.call		= ip_set_dump,
1584		.attr_count	= IPSET_ATTR_CMD_MAX,
1585		.policy		= ip_set_setname_policy,
1586	},
1587	[IPSET_CMD_SAVE]	= {
1588		.call		= ip_set_dump,
1589		.attr_count	= IPSET_ATTR_CMD_MAX,
1590		.policy		= ip_set_setname_policy,
1591	},
1592	[IPSET_CMD_ADD]	= {
1593		.call		= ip_set_uadd,
1594		.attr_count	= IPSET_ATTR_CMD_MAX,
1595		.policy		= ip_set_adt_policy,
1596	},
1597	[IPSET_CMD_DEL]	= {
1598		.call		= ip_set_udel,
1599		.attr_count	= IPSET_ATTR_CMD_MAX,
1600		.policy		= ip_set_adt_policy,
1601	},
1602	[IPSET_CMD_TEST]	= {
1603		.call		= ip_set_utest,
1604		.attr_count	= IPSET_ATTR_CMD_MAX,
1605		.policy		= ip_set_adt_policy,
1606	},
1607	[IPSET_CMD_HEADER]	= {
1608		.call		= ip_set_header,
1609		.attr_count	= IPSET_ATTR_CMD_MAX,
1610		.policy		= ip_set_setname_policy,
1611	},
1612	[IPSET_CMD_TYPE]	= {
1613		.call		= ip_set_type,
1614		.attr_count	= IPSET_ATTR_CMD_MAX,
1615		.policy		= ip_set_type_policy,
1616	},
1617	[IPSET_CMD_PROTOCOL]	= {
1618		.call		= ip_set_protocol,
1619		.attr_count	= IPSET_ATTR_CMD_MAX,
1620		.policy		= ip_set_protocol_policy,
1621	},
1622};
1623
1624static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
1625	.name		= "ip_set",
1626	.subsys_id	= NFNL_SUBSYS_IPSET,
1627	.cb_count	= IPSET_MSG_MAX,
1628	.cb		= ip_set_netlink_subsys_cb,
1629};
1630
1631/* Interface to iptables/ip6tables */
1632
1633static int
1634ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
1635{
1636	unsigned int *op;
1637	void *data;
1638	int copylen = *len, ret = 0;
1639
1640	if (!capable(CAP_NET_ADMIN))
1641		return -EPERM;
1642	if (optval != SO_IP_SET)
1643		return -EBADF;
1644	if (*len < sizeof(unsigned int))
1645		return -EINVAL;
1646
1647	data = vmalloc(*len);
1648	if (!data)
1649		return -ENOMEM;
1650	if (copy_from_user(data, user, *len) != 0) {
1651		ret = -EFAULT;
1652		goto done;
1653	}
1654	op = (unsigned int *) data;
1655
1656	if (*op < IP_SET_OP_VERSION) {
1657		/* Check the version at the beginning of operations */
1658		struct ip_set_req_version *req_version = data;
1659		if (req_version->version != IPSET_PROTOCOL) {
1660			ret = -EPROTO;
1661			goto done;
1662		}
1663	}
1664
1665	switch (*op) {
1666	case IP_SET_OP_VERSION: {
1667		struct ip_set_req_version *req_version = data;
1668
1669		if (*len != sizeof(struct ip_set_req_version)) {
1670			ret = -EINVAL;
1671			goto done;
1672		}
1673
1674		req_version->version = IPSET_PROTOCOL;
1675		ret = copy_to_user(user, req_version,
1676				   sizeof(struct ip_set_req_version));
1677		goto done;
1678	}
1679	case IP_SET_OP_GET_BYNAME: {
1680		struct ip_set_req_get_set *req_get = data;
1681
1682		if (*len != sizeof(struct ip_set_req_get_set)) {
1683			ret = -EINVAL;
1684			goto done;
1685		}
1686		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
1687		nfnl_lock();
1688		req_get->set.index = find_set_id(req_get->set.name);
1689		nfnl_unlock();
1690		goto copy;
1691	}
1692	case IP_SET_OP_GET_BYINDEX: {
1693		struct ip_set_req_get_set *req_get = data;
1694
1695		if (*len != sizeof(struct ip_set_req_get_set) ||
1696		    req_get->set.index >= ip_set_max) {
1697			ret = -EINVAL;
1698			goto done;
1699		}
1700		nfnl_lock();
1701		strncpy(req_get->set.name,
1702			ip_set_list[req_get->set.index]
1703				? ip_set_list[req_get->set.index]->name : "",
1704			IPSET_MAXNAMELEN);
1705		nfnl_unlock();
1706		goto copy;
1707	}
1708	default:
1709		ret = -EBADMSG;
1710		goto done;
1711	}	/* end of switch(op) */
1712
1713copy:
1714	ret = copy_to_user(user, data, copylen);
1715
1716done:
1717	vfree(data);
1718	if (ret > 0)
1719		ret = 0;
1720	return ret;
1721}
1722
1723static struct nf_sockopt_ops so_set __read_mostly = {
1724	.pf		= PF_INET,
1725	.get_optmin	= SO_IP_SET,
1726	.get_optmax	= SO_IP_SET + 1,
1727	.get		= &ip_set_sockfn_get,
1728	.owner		= THIS_MODULE,
1729};
1730
1731static int __init
1732ip_set_init(void)
1733{
1734	int ret;
1735
1736	if (max_sets)
1737		ip_set_max = max_sets;
1738	if (ip_set_max >= IPSET_INVALID_ID)
1739		ip_set_max = IPSET_INVALID_ID - 1;
1740
1741	ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max,
1742			      GFP_KERNEL);
1743	if (!ip_set_list)
1744		return -ENOMEM;
1745
1746	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
1747	if (ret != 0) {
1748		pr_err("ip_set: cannot register with nfnetlink.\n");
1749		kfree(ip_set_list);
1750		return ret;
1751	}
1752	ret = nf_register_sockopt(&so_set);
1753	if (ret != 0) {
1754		pr_err("SO_SET registry failed: %d\n", ret);
1755		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
1756		kfree(ip_set_list);
1757		return ret;
1758	}
1759
1760	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
1761	return 0;
1762}
1763
1764static void __exit
1765ip_set_fini(void)
1766{
1767	/* There can't be any existing set */
1768	nf_unregister_sockopt(&so_set);
1769	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
1770	kfree(ip_set_list);
1771	pr_debug("these are the famous last words\n");
1772}
1773
/* Module entry and exit points */
module_init(ip_set_init);
module_exit(ip_set_fini);
1776