1/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
2 *                         Patrick Schaaf <bof@bof.de>
3 * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10/* Kernel module for IP set management */
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/ip.h>
16#include <linux/skbuff.h>
17#include <linux/spinlock.h>
18#include <linux/rculist.h>
19#include <net/netlink.h>
20
21#include <linux/netfilter.h>
22#include <linux/netfilter/x_tables.h>
23#include <linux/netfilter/nfnetlink.h>
24#include <linux/netfilter/ipset/ip_set.h>
25
26static LIST_HEAD(ip_set_type_list);		/* all registered set types */
27static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
28static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
29
30static struct ip_set * __rcu *ip_set_list;	/* all individual sets */
31static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
32
33#define IP_SET_INC	64
34#define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
35
36static unsigned int max_sets;
37
38module_param(max_sets, int, 0600);
39MODULE_PARM_DESC(max_sets, "maximal number of sets");
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
42MODULE_DESCRIPTION("core IP set support");
43MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
44
/*
 * Accessors to be used while the nfnl mutex is held: ip_set_list is
 * dereferenced with the protection condition hard-wired to 1, and
 * nfnl_set(id) names the array slot of set "id" (usable as an lvalue).
 */
#define nfnl_dereference(p)	rcu_dereference_protected(p, 1)
#define nfnl_set(id)		nfnl_dereference(ip_set_list)[id]
50
51/*
52 * The set types are implemented in modules and registered set types
53 * can be found in ip_set_type_list. Adding/deleting types is
54 * serialized by ip_set_type_mutex.
55 */
56
57static inline void
58ip_set_type_lock(void)
59{
60	mutex_lock(&ip_set_type_mutex);
61}
62
63static inline void
64ip_set_type_unlock(void)
65{
66	mutex_unlock(&ip_set_type_mutex);
67}
68
69/* Register and deregister settype */
70
71static struct ip_set_type *
72find_set_type(const char *name, u8 family, u8 revision)
73{
74	struct ip_set_type *type;
75
76	list_for_each_entry_rcu(type, &ip_set_type_list, list)
77		if (STREQ(type->name, name) &&
78		    (type->family == family ||
79		     type->family == NFPROTO_UNSPEC) &&
80		    revision >= type->revision_min &&
81		    revision <= type->revision_max)
82			return type;
83	return NULL;
84}
85
86/* Unlock, try to load a set type module and lock again */
87static bool
88load_settype(const char *name)
89{
90	nfnl_unlock(NFNL_SUBSYS_IPSET);
91	pr_debug("try to load ip_set_%s\n", name);
92	if (request_module("ip_set_%s", name) < 0) {
93		pr_warning("Can't find ip_set type %s\n", name);
94		nfnl_lock(NFNL_SUBSYS_IPSET);
95		return false;
96	}
97	nfnl_lock(NFNL_SUBSYS_IPSET);
98	return true;
99}
100
101/* Find a set type and reference it */
102#define find_set_type_get(name, family, revision, found)	\
103	__find_set_type_get(name, family, revision, found, false)
104
105static int
106__find_set_type_get(const char *name, u8 family, u8 revision,
107		    struct ip_set_type **found, bool retry)
108{
109	struct ip_set_type *type;
110	int err;
111
112	if (retry && !load_settype(name))
113		return -IPSET_ERR_FIND_TYPE;
114
115	rcu_read_lock();
116	*found = find_set_type(name, family, revision);
117	if (*found) {
118		err = !try_module_get((*found)->me) ? -EFAULT : 0;
119		goto unlock;
120	}
121	/* Make sure the type is already loaded
122	 * but we don't support the revision */
123	list_for_each_entry_rcu(type, &ip_set_type_list, list)
124		if (STREQ(type->name, name)) {
125			err = -IPSET_ERR_FIND_TYPE;
126			goto unlock;
127		}
128	rcu_read_unlock();
129
130	return retry ? -IPSET_ERR_FIND_TYPE :
131		__find_set_type_get(name, family, revision, found, true);
132
133unlock:
134	rcu_read_unlock();
135	return err;
136}
137
138/* Find a given set type by name and family.
139 * If we succeeded, the supported minimal and maximum revisions are
140 * filled out.
141 */
142#define find_set_type_minmax(name, family, min, max) \
143	__find_set_type_minmax(name, family, min, max, false)
144
145static int
146__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
147		       bool retry)
148{
149	struct ip_set_type *type;
150	bool found = false;
151
152	if (retry && !load_settype(name))
153		return -IPSET_ERR_FIND_TYPE;
154
155	*min = 255; *max = 0;
156	rcu_read_lock();
157	list_for_each_entry_rcu(type, &ip_set_type_list, list)
158		if (STREQ(type->name, name) &&
159		    (type->family == family ||
160		     type->family == NFPROTO_UNSPEC)) {
161			found = true;
162			if (type->revision_min < *min)
163				*min = type->revision_min;
164			if (type->revision_max > *max)
165				*max = type->revision_max;
166		}
167	rcu_read_unlock();
168	if (found)
169		return 0;
170
171	return retry ? -IPSET_ERR_FIND_TYPE :
172		__find_set_type_minmax(name, family, min, max, true);
173}
174
/* Printable name of a protocol family; anything but IPv4/IPv6 is "any" */
#define family_name(f)						\
	((f) == NFPROTO_IPV4 ? "inet" :				\
	 ((f) == NFPROTO_IPV6 ? "inet6" : "any"))
177
178/* Register a set type structure. The type is identified by
179 * the unique triple of name, family and revision.
180 */
181int
182ip_set_type_register(struct ip_set_type *type)
183{
184	int ret = 0;
185
186	if (type->protocol != IPSET_PROTOCOL) {
187		pr_warning("ip_set type %s, family %s, revision %u:%u uses "
188			   "wrong protocol version %u (want %u)\n",
189			   type->name, family_name(type->family),
190			   type->revision_min, type->revision_max,
191			   type->protocol, IPSET_PROTOCOL);
192		return -EINVAL;
193	}
194
195	ip_set_type_lock();
196	if (find_set_type(type->name, type->family, type->revision_min)) {
197		/* Duplicate! */
198		pr_warning("ip_set type %s, family %s with revision min %u "
199			   "already registered!\n", type->name,
200			   family_name(type->family), type->revision_min);
201		ret = -EINVAL;
202		goto unlock;
203	}
204	list_add_rcu(&type->list, &ip_set_type_list);
205	pr_debug("type %s, family %s, revision %u:%u registered.\n",
206		 type->name, family_name(type->family),
207		 type->revision_min, type->revision_max);
208unlock:
209	ip_set_type_unlock();
210	return ret;
211}
212EXPORT_SYMBOL_GPL(ip_set_type_register);
213
214/* Unregister a set type. There's a small race with ip_set_create */
215void
216ip_set_type_unregister(struct ip_set_type *type)
217{
218	ip_set_type_lock();
219	if (!find_set_type(type->name, type->family, type->revision_min)) {
220		pr_warning("ip_set type %s, family %s with revision min %u "
221			   "not registered\n", type->name,
222			   family_name(type->family), type->revision_min);
223		goto unlock;
224	}
225	list_del_rcu(&type->list);
226	pr_debug("type %s, family %s with revision min %u unregistered.\n",
227		 type->name, family_name(type->family), type->revision_min);
228unlock:
229	ip_set_type_unlock();
230
231	synchronize_rcu();
232}
233EXPORT_SYMBOL_GPL(ip_set_type_unregister);
234
235/* Utility functions */
236void *
237ip_set_alloc(size_t size)
238{
239	void *members = NULL;
240
241	if (size < KMALLOC_MAX_SIZE)
242		members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
243
244	if (members) {
245		pr_debug("%p: allocated with kmalloc\n", members);
246		return members;
247	}
248
249	members = vzalloc(size);
250	if (!members)
251		return NULL;
252	pr_debug("%p: allocated with vmalloc\n", members);
253
254	return members;
255}
256EXPORT_SYMBOL_GPL(ip_set_alloc);
257
258void
259ip_set_free(void *members)
260{
261	pr_debug("%p: free with %s\n", members,
262		 is_vmalloc_addr(members) ? "vfree" : "kfree");
263	if (is_vmalloc_addr(members))
264		vfree(members);
265	else
266		kfree(members);
267}
268EXPORT_SYMBOL_GPL(ip_set_free);
269
270static inline bool
271flag_nested(const struct nlattr *nla)
272{
273	return nla->nla_type & NLA_F_NESTED;
274}
275
276static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
277	[IPSET_ATTR_IPADDR_IPV4]	= { .type = NLA_U32 },
278	[IPSET_ATTR_IPADDR_IPV6]	= { .type = NLA_BINARY,
279					    .len = sizeof(struct in6_addr) },
280};
281
282int
283ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr)
284{
285	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
286
287	if (unlikely(!flag_nested(nla)))
288		return -IPSET_ERR_PROTOCOL;
289	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
290		return -IPSET_ERR_PROTOCOL;
291	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
292		return -IPSET_ERR_PROTOCOL;
293
294	*ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
295	return 0;
296}
297EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
298
299int
300ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
301{
302	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
303
304	if (unlikely(!flag_nested(nla)))
305		return -IPSET_ERR_PROTOCOL;
306
307	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
308		return -IPSET_ERR_PROTOCOL;
309	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
310		return -IPSET_ERR_PROTOCOL;
311
312	memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
313		sizeof(struct in6_addr));
314	return 0;
315}
316EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
317
318int
319ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
320		      struct ip_set_ext *ext)
321{
322	if (tb[IPSET_ATTR_TIMEOUT]) {
323		if (!(set->extensions & IPSET_EXT_TIMEOUT))
324			return -IPSET_ERR_TIMEOUT;
325		ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
326	}
327	if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) {
328		if (!(set->extensions & IPSET_EXT_COUNTER))
329			return -IPSET_ERR_COUNTER;
330		if (tb[IPSET_ATTR_BYTES])
331			ext->bytes = be64_to_cpu(nla_get_be64(
332						 tb[IPSET_ATTR_BYTES]));
333		if (tb[IPSET_ATTR_PACKETS])
334			ext->packets = be64_to_cpu(nla_get_be64(
335						   tb[IPSET_ATTR_PACKETS]));
336	}
337	return 0;
338}
339EXPORT_SYMBOL_GPL(ip_set_get_extensions);
340
341/*
342 * Creating/destroying/renaming/swapping affect the existence and
343 * the properties of a set. All of these can be executed from userspace
344 * only and serialized by the nfnl mutex indirectly from nfnetlink.
345 *
346 * Sets are identified by their index in ip_set_list and the index
347 * is used by the external references (set/SET netfilter modules).
348 *
349 * The set behind an index may change by swapping only, from userspace.
350 */
351
352static inline void
353__ip_set_get(struct ip_set *set)
354{
355	write_lock_bh(&ip_set_ref_lock);
356	set->ref++;
357	write_unlock_bh(&ip_set_ref_lock);
358}
359
360static inline void
361__ip_set_put(struct ip_set *set)
362{
363	write_lock_bh(&ip_set_ref_lock);
364	BUG_ON(set->ref == 0);
365	set->ref--;
366	write_unlock_bh(&ip_set_ref_lock);
367}
368
369/*
370 * Add, del and test set entries from kernel.
371 *
 * The set behind the index must exist and must be referenced
 * so it can't be destroyed (or changed) under our feet.
374 */
375
376static inline struct ip_set *
377ip_set_rcu_get(ip_set_id_t index)
378{
379	struct ip_set *set;
380
381	rcu_read_lock();
382	/* ip_set_list itself needs to be protected */
383	set = rcu_dereference(ip_set_list)[index];
384	rcu_read_unlock();
385
386	return set;
387}
388
389int
390ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
391	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
392{
393	struct ip_set *set = ip_set_rcu_get(index);
394	int ret = 0;
395
396	BUG_ON(set == NULL);
397	pr_debug("set %s, index %u\n", set->name, index);
398
399	if (opt->dim < set->type->dimension ||
400	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
401		return 0;
402
403	read_lock_bh(&set->lock);
404	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
405	read_unlock_bh(&set->lock);
406
407	if (ret == -EAGAIN) {
408		/* Type requests element to be completed */
409		pr_debug("element must be competed, ADD is triggered\n");
410		write_lock_bh(&set->lock);
411		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
412		write_unlock_bh(&set->lock);
413		ret = 1;
414	} else {
415		/* --return-nomatch: invert matched element */
416		if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
417		    (set->type->features & IPSET_TYPE_NOMATCH) &&
418		    (ret > 0 || ret == -ENOTEMPTY))
419			ret = -ret;
420	}
421
422	/* Convert error codes to nomatch */
423	return (ret < 0 ? 0 : ret);
424}
425EXPORT_SYMBOL_GPL(ip_set_test);
426
427int
428ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
429	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
430{
431	struct ip_set *set = ip_set_rcu_get(index);
432	int ret;
433
434	BUG_ON(set == NULL);
435	pr_debug("set %s, index %u\n", set->name, index);
436
437	if (opt->dim < set->type->dimension ||
438	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
439		return 0;
440
441	write_lock_bh(&set->lock);
442	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
443	write_unlock_bh(&set->lock);
444
445	return ret;
446}
447EXPORT_SYMBOL_GPL(ip_set_add);
448
449int
450ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
451	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
452{
453	struct ip_set *set = ip_set_rcu_get(index);
454	int ret = 0;
455
456	BUG_ON(set == NULL);
457	pr_debug("set %s, index %u\n", set->name, index);
458
459	if (opt->dim < set->type->dimension ||
460	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
461		return 0;
462
463	write_lock_bh(&set->lock);
464	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
465	write_unlock_bh(&set->lock);
466
467	return ret;
468}
469EXPORT_SYMBOL_GPL(ip_set_del);
470
471/*
472 * Find set by name, reference it once. The reference makes sure the
473 * thing pointed to, does not go away under our feet.
474 *
475 */
476ip_set_id_t
477ip_set_get_byname(const char *name, struct ip_set **set)
478{
479	ip_set_id_t i, index = IPSET_INVALID_ID;
480	struct ip_set *s;
481
482	rcu_read_lock();
483	for (i = 0; i < ip_set_max; i++) {
484		s = rcu_dereference(ip_set_list)[i];
485		if (s != NULL && STREQ(s->name, name)) {
486			__ip_set_get(s);
487			index = i;
488			*set = s;
489			break;
490		}
491	}
492	rcu_read_unlock();
493
494	return index;
495}
496EXPORT_SYMBOL_GPL(ip_set_get_byname);
497
498/*
499 * If the given set pointer points to a valid set, decrement
500 * reference count by 1. The caller shall not assume the index
501 * to be valid, after calling this function.
502 *
503 */
504void
505ip_set_put_byindex(ip_set_id_t index)
506{
507	struct ip_set *set;
508
509	rcu_read_lock();
510	set = rcu_dereference(ip_set_list)[index];
511	if (set != NULL)
512		__ip_set_put(set);
513	rcu_read_unlock();
514}
515EXPORT_SYMBOL_GPL(ip_set_put_byindex);
516
517/*
518 * Get the name of a set behind a set index.
519 * We assume the set is referenced, so it does exist and
520 * can't be destroyed. The set cannot be renamed due to
521 * the referencing either.
522 *
523 */
524const char *
525ip_set_name_byindex(ip_set_id_t index)
526{
527	const struct ip_set *set = ip_set_rcu_get(index);
528
529	BUG_ON(set == NULL);
530	BUG_ON(set->ref == 0);
531
532	/* Referenced, so it's safe */
533	return set->name;
534}
535EXPORT_SYMBOL_GPL(ip_set_name_byindex);
536
537/*
538 * Routines to call by external subsystems, which do not
539 * call nfnl_lock for us.
540 */
541
542/*
543 * Find set by name, reference it once. The reference makes sure the
544 * thing pointed to, does not go away under our feet.
545 *
546 * The nfnl mutex is used in the function.
547 */
548ip_set_id_t
549ip_set_nfnl_get(const char *name)
550{
551	ip_set_id_t i, index = IPSET_INVALID_ID;
552	struct ip_set *s;
553
554	nfnl_lock(NFNL_SUBSYS_IPSET);
555	for (i = 0; i < ip_set_max; i++) {
556		s = nfnl_set(i);
557		if (s != NULL && STREQ(s->name, name)) {
558			__ip_set_get(s);
559			index = i;
560			break;
561		}
562	}
563	nfnl_unlock(NFNL_SUBSYS_IPSET);
564
565	return index;
566}
567EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
568
569/*
570 * Find set by index, reference it once. The reference makes sure the
571 * thing pointed to, does not go away under our feet.
572 *
573 * The nfnl mutex is used in the function.
574 */
575ip_set_id_t
576ip_set_nfnl_get_byindex(ip_set_id_t index)
577{
578	struct ip_set *set;
579
580	if (index > ip_set_max)
581		return IPSET_INVALID_ID;
582
583	nfnl_lock(NFNL_SUBSYS_IPSET);
584	set = nfnl_set(index);
585	if (set)
586		__ip_set_get(set);
587	else
588		index = IPSET_INVALID_ID;
589	nfnl_unlock(NFNL_SUBSYS_IPSET);
590
591	return index;
592}
593EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
594
595/*
596 * If the given set pointer points to a valid set, decrement
597 * reference count by 1. The caller shall not assume the index
598 * to be valid, after calling this function.
599 *
600 * The nfnl mutex is used in the function.
601 */
602void
603ip_set_nfnl_put(ip_set_id_t index)
604{
605	struct ip_set *set;
606	nfnl_lock(NFNL_SUBSYS_IPSET);
607	set = nfnl_set(index);
608	if (set != NULL)
609		__ip_set_put(set);
610	nfnl_unlock(NFNL_SUBSYS_IPSET);
611}
612EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
613
614/*
615 * Communication protocol with userspace over netlink.
616 *
617 * The commands are serialized by the nfnl mutex.
618 */
619
620static inline bool
621protocol_failed(const struct nlattr * const tb[])
622{
623	return !tb[IPSET_ATTR_PROTOCOL] ||
624	       nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
625}
626
627static inline u32
628flag_exist(const struct nlmsghdr *nlh)
629{
630	return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
631}
632
633static struct nlmsghdr *
634start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
635	  enum ipset_cmd cmd)
636{
637	struct nlmsghdr *nlh;
638	struct nfgenmsg *nfmsg;
639
640	nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
641			sizeof(*nfmsg), flags);
642	if (nlh == NULL)
643		return NULL;
644
645	nfmsg = nlmsg_data(nlh);
646	nfmsg->nfgen_family = NFPROTO_IPV4;
647	nfmsg->version = NFNETLINK_V0;
648	nfmsg->res_id = 0;
649
650	return nlh;
651}
652
653/* Create a set */
654
655static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
656	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
657	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
658				    .len = IPSET_MAXNAMELEN - 1 },
659	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
660				    .len = IPSET_MAXNAMELEN - 1},
661	[IPSET_ATTR_REVISION]	= { .type = NLA_U8 },
662	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
663	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
664};
665
666static struct ip_set *
667find_set_and_id(const char *name, ip_set_id_t *id)
668{
669	struct ip_set *set = NULL;
670	ip_set_id_t i;
671
672	*id = IPSET_INVALID_ID;
673	for (i = 0; i < ip_set_max; i++) {
674		set = nfnl_set(i);
675		if (set != NULL && STREQ(set->name, name)) {
676			*id = i;
677			break;
678		}
679	}
680	return (*id == IPSET_INVALID_ID ? NULL : set);
681}
682
683static inline struct ip_set *
684find_set(const char *name)
685{
686	ip_set_id_t id;
687
688	return find_set_and_id(name, &id);
689}
690
691static int
692find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
693{
694	struct ip_set *s;
695	ip_set_id_t i;
696
697	*index = IPSET_INVALID_ID;
698	for (i = 0;  i < ip_set_max; i++) {
699		s = nfnl_set(i);
700		if (s == NULL) {
701			if (*index == IPSET_INVALID_ID)
702				*index = i;
703		} else if (STREQ(name, s->name)) {
704			/* Name clash */
705			*set = s;
706			return -EEXIST;
707		}
708	}
709	if (*index == IPSET_INVALID_ID)
710		/* No free slot remained */
711		return -IPSET_ERR_MAX_SETS;
712	return 0;
713}
714
/* Command handler which rejects the request: always -EOPNOTSUPP. */
static int ip_set_none(struct sock *ctnl, struct sk_buff *skb,
		       const struct nlmsghdr *nlh,
		       const struct nlattr * const attr[])
{
	return -EOPNOTSUPP;
}
722
723static int
724ip_set_create(struct sock *ctnl, struct sk_buff *skb,
725	      const struct nlmsghdr *nlh,
726	      const struct nlattr * const attr[])
727{
728	struct ip_set *set, *clash = NULL;
729	ip_set_id_t index = IPSET_INVALID_ID;
730	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
731	const char *name, *typename;
732	u8 family, revision;
733	u32 flags = flag_exist(nlh);
734	int ret = 0;
735
736	if (unlikely(protocol_failed(attr) ||
737		     attr[IPSET_ATTR_SETNAME] == NULL ||
738		     attr[IPSET_ATTR_TYPENAME] == NULL ||
739		     attr[IPSET_ATTR_REVISION] == NULL ||
740		     attr[IPSET_ATTR_FAMILY] == NULL ||
741		     (attr[IPSET_ATTR_DATA] != NULL &&
742		      !flag_nested(attr[IPSET_ATTR_DATA]))))
743		return -IPSET_ERR_PROTOCOL;
744
745	name = nla_data(attr[IPSET_ATTR_SETNAME]);
746	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
747	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
748	revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
749	pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
750		 name, typename, family_name(family), revision);
751
752	/*
753	 * First, and without any locks, allocate and initialize
754	 * a normal base set structure.
755	 */
756	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
757	if (!set)
758		return -ENOMEM;
759	rwlock_init(&set->lock);
760	strlcpy(set->name, name, IPSET_MAXNAMELEN);
761	set->family = family;
762	set->revision = revision;
763
764	/*
765	 * Next, check that we know the type, and take
766	 * a reference on the type, to make sure it stays available
767	 * while constructing our new set.
768	 *
769	 * After referencing the type, we try to create the type
770	 * specific part of the set without holding any locks.
771	 */
772	ret = find_set_type_get(typename, family, revision, &(set->type));
773	if (ret)
774		goto out;
775
776	/*
777	 * Without holding any locks, create private part.
778	 */
779	if (attr[IPSET_ATTR_DATA] &&
780	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
781			     set->type->create_policy)) {
782		ret = -IPSET_ERR_PROTOCOL;
783		goto put_out;
784	}
785
786	ret = set->type->create(set, tb, flags);
787	if (ret != 0)
788		goto put_out;
789
790	/* BTW, ret==0 here. */
791
792	/*
793	 * Here, we have a valid, constructed set and we are protected
794	 * by the nfnl mutex. Find the first free index in ip_set_list
795	 * and check clashing.
796	 */
797	ret = find_free_id(set->name, &index, &clash);
798	if (ret == -EEXIST) {
799		/* If this is the same set and requested, ignore error */
800		if ((flags & IPSET_FLAG_EXIST) &&
801		    STREQ(set->type->name, clash->type->name) &&
802		    set->type->family == clash->type->family &&
803		    set->type->revision_min == clash->type->revision_min &&
804		    set->type->revision_max == clash->type->revision_max &&
805		    set->variant->same_set(set, clash))
806			ret = 0;
807		goto cleanup;
808	} else if (ret == -IPSET_ERR_MAX_SETS) {
809		struct ip_set **list, **tmp;
810		ip_set_id_t i = ip_set_max + IP_SET_INC;
811
812		if (i < ip_set_max || i == IPSET_INVALID_ID)
813			/* Wraparound */
814			goto cleanup;
815
816		list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
817		if (!list)
818			goto cleanup;
819		/* nfnl mutex is held, both lists are valid */
820		tmp = nfnl_dereference(ip_set_list);
821		memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max);
822		rcu_assign_pointer(ip_set_list, list);
823		/* Make sure all current packets have passed through */
824		synchronize_net();
825		/* Use new list */
826		index = ip_set_max;
827		ip_set_max = i;
828		kfree(tmp);
829		ret = 0;
830	} else if (ret)
831		goto cleanup;
832
833	/*
834	 * Finally! Add our shiny new set to the list, and be done.
835	 */
836	pr_debug("create: '%s' created with index %u!\n", set->name, index);
837	nfnl_set(index) = set;
838
839	return ret;
840
841cleanup:
842	set->variant->destroy(set);
843put_out:
844	module_put(set->type->me);
845out:
846	kfree(set);
847	return ret;
848}
849
850/* Destroy sets */
851
852static const struct nla_policy
853ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
854	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
855	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
856				    .len = IPSET_MAXNAMELEN - 1 },
857};
858
859static void
860ip_set_destroy_set(ip_set_id_t index)
861{
862	struct ip_set *set = nfnl_set(index);
863
864	pr_debug("set: %s\n",  set->name);
865	nfnl_set(index) = NULL;
866
867	/* Must call it without holding any lock */
868	set->variant->destroy(set);
869	module_put(set->type->me);
870	kfree(set);
871}
872
873static int
874ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
875	       const struct nlmsghdr *nlh,
876	       const struct nlattr * const attr[])
877{
878	struct ip_set *s;
879	ip_set_id_t i;
880	int ret = 0;
881
882	if (unlikely(protocol_failed(attr)))
883		return -IPSET_ERR_PROTOCOL;
884
885	/* Commands are serialized and references are
886	 * protected by the ip_set_ref_lock.
887	 * External systems (i.e. xt_set) must call
888	 * ip_set_put|get_nfnl_* functions, that way we
889	 * can safely check references here.
890	 *
891	 * list:set timer can only decrement the reference
892	 * counter, so if it's already zero, we can proceed
893	 * without holding the lock.
894	 */
895	read_lock_bh(&ip_set_ref_lock);
896	if (!attr[IPSET_ATTR_SETNAME]) {
897		for (i = 0; i < ip_set_max; i++) {
898			s = nfnl_set(i);
899			if (s != NULL && s->ref) {
900				ret = -IPSET_ERR_BUSY;
901				goto out;
902			}
903		}
904		read_unlock_bh(&ip_set_ref_lock);
905		for (i = 0; i < ip_set_max; i++) {
906			s = nfnl_set(i);
907			if (s != NULL)
908				ip_set_destroy_set(i);
909		}
910	} else {
911		s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i);
912		if (s == NULL) {
913			ret = -ENOENT;
914			goto out;
915		} else if (s->ref) {
916			ret = -IPSET_ERR_BUSY;
917			goto out;
918		}
919		read_unlock_bh(&ip_set_ref_lock);
920
921		ip_set_destroy_set(i);
922	}
923	return 0;
924out:
925	read_unlock_bh(&ip_set_ref_lock);
926	return ret;
927}
928
929/* Flush sets */
930
931static void
932ip_set_flush_set(struct ip_set *set)
933{
934	pr_debug("set: %s\n",  set->name);
935
936	write_lock_bh(&set->lock);
937	set->variant->flush(set);
938	write_unlock_bh(&set->lock);
939}
940
941static int
942ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
943	     const struct nlmsghdr *nlh,
944	     const struct nlattr * const attr[])
945{
946	struct ip_set *s;
947	ip_set_id_t i;
948
949	if (unlikely(protocol_failed(attr)))
950		return -IPSET_ERR_PROTOCOL;
951
952	if (!attr[IPSET_ATTR_SETNAME]) {
953		for (i = 0; i < ip_set_max; i++) {
954			s = nfnl_set(i);
955			if (s != NULL)
956				ip_set_flush_set(s);
957		}
958	} else {
959		s = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
960		if (s == NULL)
961			return -ENOENT;
962
963		ip_set_flush_set(s);
964	}
965
966	return 0;
967}
968
969/* Rename a set */
970
971static const struct nla_policy
972ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
973	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
974	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
975				    .len = IPSET_MAXNAMELEN - 1 },
976	[IPSET_ATTR_SETNAME2]	= { .type = NLA_NUL_STRING,
977				    .len = IPSET_MAXNAMELEN - 1 },
978};
979
980static int
981ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
982	      const struct nlmsghdr *nlh,
983	      const struct nlattr * const attr[])
984{
985	struct ip_set *set, *s;
986	const char *name2;
987	ip_set_id_t i;
988	int ret = 0;
989
990	if (unlikely(protocol_failed(attr) ||
991		     attr[IPSET_ATTR_SETNAME] == NULL ||
992		     attr[IPSET_ATTR_SETNAME2] == NULL))
993		return -IPSET_ERR_PROTOCOL;
994
995	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
996	if (set == NULL)
997		return -ENOENT;
998
999	read_lock_bh(&ip_set_ref_lock);
1000	if (set->ref != 0) {
1001		ret = -IPSET_ERR_REFERENCED;
1002		goto out;
1003	}
1004
1005	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
1006	for (i = 0; i < ip_set_max; i++) {
1007		s = nfnl_set(i);
1008		if (s != NULL && STREQ(s->name, name2)) {
1009			ret = -IPSET_ERR_EXIST_SETNAME2;
1010			goto out;
1011		}
1012	}
1013	strncpy(set->name, name2, IPSET_MAXNAMELEN);
1014
1015out:
1016	read_unlock_bh(&ip_set_ref_lock);
1017	return ret;
1018}
1019
1020/* Swap two sets so that name/index points to the other.
1021 * References and set names are also swapped.
1022 *
1023 * The commands are serialized by the nfnl mutex and references are
1024 * protected by the ip_set_ref_lock. The kernel interfaces
1025 * do not hold the mutex but the pointer settings are atomic
1026 * so the ip_set_list always contains valid pointers to the sets.
1027 */
1028
1029static int
1030ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
1031	    const struct nlmsghdr *nlh,
1032	    const struct nlattr * const attr[])
1033{
1034	struct ip_set *from, *to;
1035	ip_set_id_t from_id, to_id;
1036	char from_name[IPSET_MAXNAMELEN];
1037
1038	if (unlikely(protocol_failed(attr) ||
1039		     attr[IPSET_ATTR_SETNAME] == NULL ||
1040		     attr[IPSET_ATTR_SETNAME2] == NULL))
1041		return -IPSET_ERR_PROTOCOL;
1042
1043	from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id);
1044	if (from == NULL)
1045		return -ENOENT;
1046
1047	to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id);
1048	if (to == NULL)
1049		return -IPSET_ERR_EXIST_SETNAME2;
1050
1051	/* Features must not change.
1052	 * Not an artificial restriction anymore, as we must prevent
1053	 * possible loops created by swapping in setlist type of sets. */
1054	if (!(from->type->features == to->type->features &&
1055	      from->type->family == to->type->family))
1056		return -IPSET_ERR_TYPE_MISMATCH;
1057
1058	strncpy(from_name, from->name, IPSET_MAXNAMELEN);
1059	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
1060	strncpy(to->name, from_name, IPSET_MAXNAMELEN);
1061
1062	write_lock_bh(&ip_set_ref_lock);
1063	swap(from->ref, to->ref);
1064	nfnl_set(from_id) = to;
1065	nfnl_set(to_id) = from;
1066	write_unlock_bh(&ip_set_ref_lock);
1067
1068	return 0;
1069}
1070
/* List/save set data */

/* Dump modes, kept in the low 16 bits of the dump state word */
#define DUMP_INIT	0
#define DUMP_ALL	1
#define DUMP_ONE	2
#define DUMP_LAST	3

/* Low 16 bits: dump mode; high 16 bits: dump flags */
#define DUMP_TYPE(arg)		((u32)(arg) & 0x0000FFFF)
#define DUMP_FLAGS(arg)		((u32)(arg) >> 16)
1080
1081static int
1082ip_set_dump_done(struct netlink_callback *cb)
1083{
1084	if (cb->args[2]) {
1085		pr_debug("release set %s\n", nfnl_set(cb->args[1])->name);
1086		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
1087	}
1088	return 0;
1089}
1090
1091static inline void
1092dump_attrs(struct nlmsghdr *nlh)
1093{
1094	const struct nlattr *attr;
1095	int rem;
1096
1097	pr_debug("dump nlmsg\n");
1098	nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) {
1099		pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len);
1100	}
1101}
1102
1103static int
1104dump_init(struct netlink_callback *cb)
1105{
1106	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
1107	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
1108	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
1109	struct nlattr *attr = (void *)nlh + min_len;
1110	u32 dump_type;
1111	ip_set_id_t index;
1112
1113	/* Second pass, so parser can't fail */
1114	nla_parse(cda, IPSET_ATTR_CMD_MAX,
1115		  attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
1116
1117	/* cb->args[0] : dump single set/all sets
1118	 *         [1] : set index
1119	 *         [..]: type specific
1120	 */
1121
1122	if (cda[IPSET_ATTR_SETNAME]) {
1123		struct ip_set *set;
1124
1125		set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]),
1126				      &index);
1127		if (set == NULL)
1128			return -ENOENT;
1129
1130		dump_type = DUMP_ONE;
1131		cb->args[1] = index;
1132	} else
1133		dump_type = DUMP_ALL;
1134
1135	if (cda[IPSET_ATTR_FLAGS]) {
1136		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
1137		dump_type |= (f << 16);
1138	}
1139	cb->args[0] = dump_type;
1140
1141	return 0;
1142}
1143
/* Netlink dump callback: emit (part of) the listing of one set into @skb.
 *
 * State kept across invocations in cb->args[]:
 *   [0]: dump type and request flags, see DUMP_TYPE()/DUMP_FLAGS()
 *   [1]: index of the set currently being dumped
 *   [2]: zero when a set is about to be started; non-zero while the
 *        listing of the current set is unfinished (this function only
 *        ever resets it to zero, so it is presumably advanced by the
 *        set type's list() method - TODO confirm)
 *
 * At most one netlink message is started per invocation. Returns a
 * negative error code, or skb->len otherwise.
 */
static int
ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
{
	ip_set_id_t index = IPSET_INVALID_ID, max;
	struct ip_set *set = NULL;
	struct nlmsghdr *nlh = NULL;
	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
	u32 dump_type, dump_flags;
	int ret = 0;

	/* First invocation: args[0] is still zero, set up the dump state */
	if (!cb->args[0]) {
		ret = dump_init(cb);
		if (ret < 0) {
			nlh = nlmsg_hdr(cb->skb);
			/* We have to create and send the error message
			 * manually :-( */
			if (nlh->nlmsg_flags & NLM_F_ACK)
				netlink_ack(cb->skb, nlh, ret);
			return ret;
		}
	}

	/* Nothing (more) to dump; next_set parks args[1] at
	 * IPSET_INVALID_ID once a single-set dump has finished.
	 */
	if (cb->args[1] >= ip_set_max)
		goto out;

	dump_type = DUMP_TYPE(cb->args[0]);
	dump_flags = DUMP_FLAGS(cb->args[0]);
	/* A single-set dump visits exactly one index */
	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
dump_last:
	pr_debug("args[0]: %u %u args[1]: %ld\n",
		 dump_type, dump_flags, cb->args[1]);
	for (; cb->args[1] < max; cb->args[1]++) {
		index = (ip_set_id_t) cb->args[1];
		set = nfnl_set(index);
		if (set == NULL) {
			if (dump_type == DUMP_ONE) {
				ret = -ENOENT;
				goto out;
			}
			continue;
		}
		/* When dumping all sets, we must dump "sorted"
		 * so that lists (unions of sets) are dumped last.
		 */
		if (dump_type != DUMP_ONE &&
		    ((dump_type == DUMP_ALL) ==
		     !!(set->type->features & IPSET_DUMP_LAST)))
			continue;
		pr_debug("List set: %s\n", set->name);
		if (!cb->args[2]) {
			/* Start listing: make sure set won't be destroyed */
			pr_debug("reference set\n");
			__ip_set_get(set);
		}
		nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
				cb->nlh->nlmsg_seq, flags,
				IPSET_CMD_LIST);
		if (!nlh) {
			ret = -EMSGSIZE;
			goto release_refcount;
		}
		if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
		    nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
			goto nla_put_failure;
		/* Only the set names were requested */
		if (dump_flags & IPSET_FLAG_LIST_SETNAME)
			goto next_set;
		switch (cb->args[2]) {
		case 0:
			/* Core header data */
			if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
					   set->type->name) ||
			    nla_put_u8(skb, IPSET_ATTR_FAMILY,
				       set->family) ||
			    nla_put_u8(skb, IPSET_ATTR_REVISION,
				       set->revision))
				goto nla_put_failure;
			ret = set->variant->head(set, skb);
			if (ret < 0)
				goto release_refcount;
			/* Only the headers were requested */
			if (dump_flags & IPSET_FLAG_LIST_HEADER)
				goto next_set;
			/* Fall through and add elements */
		default:
			read_lock_bh(&set->lock);
			ret = set->variant->list(set, skb, cb);
			read_unlock_bh(&set->lock);
			if (!cb->args[2])
				/* Set is done, proceed with next one */
				goto next_set;
			/* Listing unfinished: stay on this set; the
			 * reference is kept unless ret signals an error.
			 */
			goto release_refcount;
		}
	}
	/* If we dump all sets, continue with dumping last ones */
	if (dump_type == DUMP_ALL) {
		dump_type = DUMP_LAST;
		cb->args[0] = dump_type | (dump_flags << 16);
		cb->args[1] = 0;
		goto dump_last;
	}
	goto out;

nla_put_failure:
	ret = -EFAULT;
next_set:
	/* Advance the cursor; a single-set dump is marked as finished */
	if (dump_type == DUMP_ONE)
		cb->args[1] = IPSET_INVALID_ID;
	else
		cb->args[1]++;
release_refcount:
	/* If there was an error or set is done, release set */
	if (ret || !cb->args[2]) {
		pr_debug("release set %s\n", nfnl_set(index)->name);
		ip_set_put_byindex(index);
		cb->args[2] = 0;
	}
out:
	/* Finalise the message if one was started in this invocation */
	if (nlh) {
		nlmsg_end(skb, nlh);
		pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
		dump_attrs(nlh);
	}

	return ret < 0 ? ret : skb->len;
}
1268
1269static int
1270ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
1271	    const struct nlmsghdr *nlh,
1272	    const struct nlattr * const attr[])
1273{
1274	if (unlikely(protocol_failed(attr)))
1275		return -IPSET_ERR_PROTOCOL;
1276
1277	{
1278		struct netlink_dump_control c = {
1279			.dump = ip_set_dump_start,
1280			.done = ip_set_dump_done,
1281		};
1282		return netlink_dump_start(ctnl, skb, nlh, &c);
1283	}
1284}
1285
1286/* Add, del and test */
1287
/* Attribute policy of the IPSET_CMD_ADD, IPSET_CMD_DEL and IPSET_CMD_TEST
 * commands. call_ad() also uses it to re-parse the copy of the request
 * embedded in its error reply.
 */
static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
				    .len = IPSET_MAXNAMELEN - 1 },
	/* Line number of the command in restore/batch mode */
	[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
	/* A single element ... */
	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
	/* ... or a batch of nested IPSET_ATTR_DATA elements */
	[IPSET_ATTR_ADT]	= { .type = NLA_NESTED },
};
1296
/* Run one add/del operation (@adt) with the parsed element attributes
 * @tb on @set.
 *
 * The type's uadt() method is called with the set write-locked. When it
 * returns -EAGAIN and the type provides resize(), the set is resized and
 * the operation retried for as long as resizing succeeds.
 *
 * Returns 0 on success, and also when the result is -IPSET_ERR_EXIST and
 * IPSET_FLAG_EXIST is set in @flags. On any other error, if uadt()
 * reported a line number and @use_lineno is true, an NLMSG_ERROR reply
 * carrying that line number is sent directly and -EINTR is returned;
 * otherwise the error code itself is returned.
 */
static int
call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
	struct nlattr *tb[], enum ipset_adt adt,
	u32 flags, bool use_lineno)
{
	int ret;
	u32 lineno = 0;
	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;

	do {
		write_lock_bh(&set->lock);
		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
		write_unlock_bh(&set->lock);
		/* Every further attempt (and resize) is flagged as a retry */
		retried = true;
	} while (ret == -EAGAIN &&
		 set->variant->resize &&
		 (ret = set->variant->resize(set, retried)) == 0);

	if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
		return 0;
	if (lineno && use_lineno) {
		/* Error in restore/batch mode: send back lineno */
		struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
		struct sk_buff *skb2;
		struct nlmsgerr *errmsg;
		size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
		struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
		struct nlattr *cmdattr;
		u32 *errline;

		skb2 = nlmsg_new(payload, GFP_KERNEL);
		if (skb2 == NULL)
			return -ENOMEM;
		rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
				  nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
		errmsg = nlmsg_data(rep);
		errmsg->error = ret;
		/* Embed a copy of the original request in the error reply */
		memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
		cmdattr = (void *)&errmsg->msg + min_len;

		/* Parse the copy so that the attribute pointers refer to
		 * the reply and not to the request.
		 */
		nla_parse(cda, IPSET_ATTR_CMD_MAX,
			  cmdattr, nlh->nlmsg_len - min_len,
			  ip_set_adt_policy);

		/* NOTE(review): relies on IPSET_ATTR_LINENO being present;
		 * the callers in this file derive use_lineno from that
		 * attribute - confirm for any other caller.
		 */
		errline = nla_data(cda[IPSET_ATTR_LINENO]);

		/* Overwrite the line number in the copy with the failing one */
		*errline = lineno;

		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
		/* Signal netlink not to send its ACK/errmsg.  */
		return -EINTR;
	}

	return ret;
}
1353
1354static int
1355ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
1356	    const struct nlmsghdr *nlh,
1357	    const struct nlattr * const attr[])
1358{
1359	struct ip_set *set;
1360	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1361	const struct nlattr *nla;
1362	u32 flags = flag_exist(nlh);
1363	bool use_lineno;
1364	int ret = 0;
1365
1366	if (unlikely(protocol_failed(attr) ||
1367		     attr[IPSET_ATTR_SETNAME] == NULL ||
1368		     !((attr[IPSET_ATTR_DATA] != NULL) ^
1369		       (attr[IPSET_ATTR_ADT] != NULL)) ||
1370		     (attr[IPSET_ATTR_DATA] != NULL &&
1371		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
1372		     (attr[IPSET_ATTR_ADT] != NULL &&
1373		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
1374		       attr[IPSET_ATTR_LINENO] == NULL))))
1375		return -IPSET_ERR_PROTOCOL;
1376
1377	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1378	if (set == NULL)
1379		return -ENOENT;
1380
1381	use_lineno = !!attr[IPSET_ATTR_LINENO];
1382	if (attr[IPSET_ATTR_DATA]) {
1383		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
1384				     attr[IPSET_ATTR_DATA],
1385				     set->type->adt_policy))
1386			return -IPSET_ERR_PROTOCOL;
1387		ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
1388			      use_lineno);
1389	} else {
1390		int nla_rem;
1391
1392		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
1393			memset(tb, 0, sizeof(tb));
1394			if (nla_type(nla) != IPSET_ATTR_DATA ||
1395			    !flag_nested(nla) ||
1396			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
1397					     set->type->adt_policy))
1398				return -IPSET_ERR_PROTOCOL;
1399			ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
1400				      flags, use_lineno);
1401			if (ret < 0)
1402				return ret;
1403		}
1404	}
1405	return ret;
1406}
1407
1408static int
1409ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
1410	    const struct nlmsghdr *nlh,
1411	    const struct nlattr * const attr[])
1412{
1413	struct ip_set *set;
1414	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1415	const struct nlattr *nla;
1416	u32 flags = flag_exist(nlh);
1417	bool use_lineno;
1418	int ret = 0;
1419
1420	if (unlikely(protocol_failed(attr) ||
1421		     attr[IPSET_ATTR_SETNAME] == NULL ||
1422		     !((attr[IPSET_ATTR_DATA] != NULL) ^
1423		       (attr[IPSET_ATTR_ADT] != NULL)) ||
1424		     (attr[IPSET_ATTR_DATA] != NULL &&
1425		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
1426		     (attr[IPSET_ATTR_ADT] != NULL &&
1427		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
1428		       attr[IPSET_ATTR_LINENO] == NULL))))
1429		return -IPSET_ERR_PROTOCOL;
1430
1431	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1432	if (set == NULL)
1433		return -ENOENT;
1434
1435	use_lineno = !!attr[IPSET_ATTR_LINENO];
1436	if (attr[IPSET_ATTR_DATA]) {
1437		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
1438				     attr[IPSET_ATTR_DATA],
1439				     set->type->adt_policy))
1440			return -IPSET_ERR_PROTOCOL;
1441		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
1442			      use_lineno);
1443	} else {
1444		int nla_rem;
1445
1446		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
1447			memset(tb, 0, sizeof(*tb));
1448			if (nla_type(nla) != IPSET_ATTR_DATA ||
1449			    !flag_nested(nla) ||
1450			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
1451					     set->type->adt_policy))
1452				return -IPSET_ERR_PROTOCOL;
1453			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
1454				      flags, use_lineno);
1455			if (ret < 0)
1456				return ret;
1457		}
1458	}
1459	return ret;
1460}
1461
1462static int
1463ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
1464	     const struct nlmsghdr *nlh,
1465	     const struct nlattr * const attr[])
1466{
1467	struct ip_set *set;
1468	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
1469	int ret = 0;
1470
1471	if (unlikely(protocol_failed(attr) ||
1472		     attr[IPSET_ATTR_SETNAME] == NULL ||
1473		     attr[IPSET_ATTR_DATA] == NULL ||
1474		     !flag_nested(attr[IPSET_ATTR_DATA])))
1475		return -IPSET_ERR_PROTOCOL;
1476
1477	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1478	if (set == NULL)
1479		return -ENOENT;
1480
1481	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
1482			     set->type->adt_policy))
1483		return -IPSET_ERR_PROTOCOL;
1484
1485	read_lock_bh(&set->lock);
1486	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
1487	read_unlock_bh(&set->lock);
1488	/* Userspace can't trigger element to be re-added */
1489	if (ret == -EAGAIN)
1490		ret = 1;
1491
1492	return (ret < 0 && ret != -ENOTEMPTY) ? ret :
1493		ret > 0 ? 0 : -IPSET_ERR_EXIST;
1494}
1495
/* Get header data of a set */
1497
1498static int
1499ip_set_header(struct sock *ctnl, struct sk_buff *skb,
1500	      const struct nlmsghdr *nlh,
1501	      const struct nlattr * const attr[])
1502{
1503	const struct ip_set *set;
1504	struct sk_buff *skb2;
1505	struct nlmsghdr *nlh2;
1506	int ret = 0;
1507
1508	if (unlikely(protocol_failed(attr) ||
1509		     attr[IPSET_ATTR_SETNAME] == NULL))
1510		return -IPSET_ERR_PROTOCOL;
1511
1512	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
1513	if (set == NULL)
1514		return -ENOENT;
1515
1516	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1517	if (skb2 == NULL)
1518		return -ENOMEM;
1519
1520	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1521			 IPSET_CMD_HEADER);
1522	if (!nlh2)
1523		goto nlmsg_failure;
1524	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
1525	    nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
1526	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
1527	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
1528	    nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision))
1529		goto nla_put_failure;
1530	nlmsg_end(skb2, nlh2);
1531
1532	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1533	if (ret < 0)
1534		return ret;
1535
1536	return 0;
1537
1538nla_put_failure:
1539	nlmsg_cancel(skb2, nlh2);
1540nlmsg_failure:
1541	kfree_skb(skb2);
1542	return -EMSGSIZE;
1543}
1544
1545/* Get type data */
1546
/* Attribute policy of the IPSET_CMD_TYPE command: ip_set_type() requires
 * both the type name and the family to be present.
 */
static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
				    .len = IPSET_MAXNAMELEN - 1 },
	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
};
1553
1554static int
1555ip_set_type(struct sock *ctnl, struct sk_buff *skb,
1556	    const struct nlmsghdr *nlh,
1557	    const struct nlattr * const attr[])
1558{
1559	struct sk_buff *skb2;
1560	struct nlmsghdr *nlh2;
1561	u8 family, min, max;
1562	const char *typename;
1563	int ret = 0;
1564
1565	if (unlikely(protocol_failed(attr) ||
1566		     attr[IPSET_ATTR_TYPENAME] == NULL ||
1567		     attr[IPSET_ATTR_FAMILY] == NULL))
1568		return -IPSET_ERR_PROTOCOL;
1569
1570	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
1571	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
1572	ret = find_set_type_minmax(typename, family, &min, &max);
1573	if (ret)
1574		return ret;
1575
1576	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1577	if (skb2 == NULL)
1578		return -ENOMEM;
1579
1580	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1581			 IPSET_CMD_TYPE);
1582	if (!nlh2)
1583		goto nlmsg_failure;
1584	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
1585	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
1586	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
1587	    nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
1588	    nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min))
1589		goto nla_put_failure;
1590	nlmsg_end(skb2, nlh2);
1591
1592	pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
1593	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1594	if (ret < 0)
1595		return ret;
1596
1597	return 0;
1598
1599nla_put_failure:
1600	nlmsg_cancel(skb2, nlh2);
1601nlmsg_failure:
1602	kfree_skb(skb2);
1603	return -EMSGSIZE;
1604}
1605
1606/* Get protocol version */
1607
/* Attribute policy of the IPSET_CMD_PROTOCOL command: only the protocol
 * attribute is described.
 */
static const struct nla_policy
ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
};
1612
1613static int
1614ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
1615		const struct nlmsghdr *nlh,
1616		const struct nlattr * const attr[])
1617{
1618	struct sk_buff *skb2;
1619	struct nlmsghdr *nlh2;
1620	int ret = 0;
1621
1622	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
1623		return -IPSET_ERR_PROTOCOL;
1624
1625	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1626	if (skb2 == NULL)
1627		return -ENOMEM;
1628
1629	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
1630			 IPSET_CMD_PROTOCOL);
1631	if (!nlh2)
1632		goto nlmsg_failure;
1633	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
1634		goto nla_put_failure;
1635	nlmsg_end(skb2, nlh2);
1636
1637	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
1638	if (ret < 0)
1639		return ret;
1640
1641	return 0;
1642
1643nla_put_failure:
1644	nlmsg_cancel(skb2, nlh2);
1645nlmsg_failure:
1646	kfree_skb(skb2);
1647	return -EMSGSIZE;
1648}
1649
/* Command dispatch table, referenced by ip_set_netlink_subsys.cb.
 *
 * Indexed by IPSET_CMD_*: each entry names the handler (.call), the
 * highest attribute number (.attr_count) and the attribute policy
 * (.policy) of the command. The policy is presumably applied by
 * nfnetlink while parsing the request, before .call is invoked - TODO
 * confirm against nfnetlink.
 */
static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
	/* No policy is given for IPSET_CMD_NONE */
	[IPSET_CMD_NONE]	= {
		.call		= ip_set_none,
		.attr_count	= IPSET_ATTR_CMD_MAX,
	},
	[IPSET_CMD_CREATE]	= {
		.call		= ip_set_create,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_create_policy,
	},
	[IPSET_CMD_DESTROY]	= {
		.call		= ip_set_destroy,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname_policy,
	},
	[IPSET_CMD_FLUSH]	= {
		.call		= ip_set_flush,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname_policy,
	},
	[IPSET_CMD_RENAME]	= {
		.call		= ip_set_rename,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname2_policy,
	},
	[IPSET_CMD_SWAP]	= {
		.call		= ip_set_swap,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname2_policy,
	},
	/* LIST and SAVE share the same dump handler */
	[IPSET_CMD_LIST]	= {
		.call		= ip_set_dump,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname_policy,
	},
	[IPSET_CMD_SAVE]	= {
		.call		= ip_set_dump,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname_policy,
	},
	/* ADD, DEL and TEST share the same policy */
	[IPSET_CMD_ADD]	= {
		.call		= ip_set_uadd,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_adt_policy,
	},
	[IPSET_CMD_DEL]	= {
		.call		= ip_set_udel,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_adt_policy,
	},
	[IPSET_CMD_TEST]	= {
		.call		= ip_set_utest,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_adt_policy,
	},
	[IPSET_CMD_HEADER]	= {
		.call		= ip_set_header,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_setname_policy,
	},
	[IPSET_CMD_TYPE]	= {
		.call		= ip_set_type,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_type_policy,
	},
	[IPSET_CMD_PROTOCOL]	= {
		.call		= ip_set_protocol,
		.attr_count	= IPSET_ATTR_CMD_MAX,
		.policy		= ip_set_protocol_policy,
	},
};
1721
/* nfnetlink subsystem descriptor of ipset; registered in ip_set_init()
 * and unregistered in ip_set_fini().
 */
static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
	.name		= "ip_set",
	.subsys_id	= NFNL_SUBSYS_IPSET,
	.cb_count	= IPSET_MSG_MAX,
	.cb		= ip_set_netlink_subsys_cb,
};
1728
1729/* Interface to iptables/ip6tables */
1730
1731static int
1732ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
1733{
1734	unsigned int *op;
1735	void *data;
1736	int copylen = *len, ret = 0;
1737
1738	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1739		return -EPERM;
1740	if (optval != SO_IP_SET)
1741		return -EBADF;
1742	if (*len < sizeof(unsigned int))
1743		return -EINVAL;
1744
1745	data = vmalloc(*len);
1746	if (!data)
1747		return -ENOMEM;
1748	if (copy_from_user(data, user, *len) != 0) {
1749		ret = -EFAULT;
1750		goto done;
1751	}
1752	op = (unsigned int *) data;
1753
1754	if (*op < IP_SET_OP_VERSION) {
1755		/* Check the version at the beginning of operations */
1756		struct ip_set_req_version *req_version = data;
1757		if (req_version->version != IPSET_PROTOCOL) {
1758			ret = -EPROTO;
1759			goto done;
1760		}
1761	}
1762
1763	switch (*op) {
1764	case IP_SET_OP_VERSION: {
1765		struct ip_set_req_version *req_version = data;
1766
1767		if (*len != sizeof(struct ip_set_req_version)) {
1768			ret = -EINVAL;
1769			goto done;
1770		}
1771
1772		req_version->version = IPSET_PROTOCOL;
1773		ret = copy_to_user(user, req_version,
1774				   sizeof(struct ip_set_req_version));
1775		goto done;
1776	}
1777	case IP_SET_OP_GET_BYNAME: {
1778		struct ip_set_req_get_set *req_get = data;
1779		ip_set_id_t id;
1780
1781		if (*len != sizeof(struct ip_set_req_get_set)) {
1782			ret = -EINVAL;
1783			goto done;
1784		}
1785		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
1786		nfnl_lock(NFNL_SUBSYS_IPSET);
1787		find_set_and_id(req_get->set.name, &id);
1788		req_get->set.index = id;
1789		nfnl_unlock(NFNL_SUBSYS_IPSET);
1790		goto copy;
1791	}
1792	case IP_SET_OP_GET_BYINDEX: {
1793		struct ip_set_req_get_set *req_get = data;
1794		struct ip_set *set;
1795
1796		if (*len != sizeof(struct ip_set_req_get_set) ||
1797		    req_get->set.index >= ip_set_max) {
1798			ret = -EINVAL;
1799			goto done;
1800		}
1801		nfnl_lock(NFNL_SUBSYS_IPSET);
1802		set = nfnl_set(req_get->set.index);
1803		strncpy(req_get->set.name, set ? set->name : "",
1804			IPSET_MAXNAMELEN);
1805		nfnl_unlock(NFNL_SUBSYS_IPSET);
1806		goto copy;
1807	}
1808	default:
1809		ret = -EBADMSG;
1810		goto done;
1811	}	/* end of switch(op) */
1812
1813copy:
1814	ret = copy_to_user(user, data, copylen);
1815
1816done:
1817	vfree(data);
1818	if (ret > 0)
1819		ret = 0;
1820	return ret;
1821}
1822
/* Socket option registration for PF_INET sockets; registered in
 * ip_set_init() and unregistered in ip_set_fini(). Only a get handler
 * is provided.
 */
static struct nf_sockopt_ops so_set __read_mostly = {
	.pf		= PF_INET,
	/* Presumably a half-open range covering SO_IP_SET only - TODO
	 * confirm; the handler rejects any other optval itself.
	 */
	.get_optmin	= SO_IP_SET,
	.get_optmax	= SO_IP_SET + 1,
	.get		= &ip_set_sockfn_get,
	.owner		= THIS_MODULE,
};
1830
1831static int __init
1832ip_set_init(void)
1833{
1834	struct ip_set **list;
1835	int ret;
1836
1837	if (max_sets)
1838		ip_set_max = max_sets;
1839	if (ip_set_max >= IPSET_INVALID_ID)
1840		ip_set_max = IPSET_INVALID_ID - 1;
1841
1842	list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL);
1843	if (!list)
1844		return -ENOMEM;
1845
1846	rcu_assign_pointer(ip_set_list, list);
1847	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
1848	if (ret != 0) {
1849		pr_err("ip_set: cannot register with nfnetlink.\n");
1850		kfree(list);
1851		return ret;
1852	}
1853	ret = nf_register_sockopt(&so_set);
1854	if (ret != 0) {
1855		pr_err("SO_SET registry failed: %d\n", ret);
1856		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
1857		kfree(list);
1858		return ret;
1859	}
1860
1861	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
1862	return 0;
1863}
1864
1865static void __exit
1866ip_set_fini(void)
1867{
1868	struct ip_set **list = rcu_dereference_protected(ip_set_list, 1);
1869
1870	/* There can't be any existing set */
1871	nf_unregister_sockopt(&so_set);
1872	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
1873	kfree(list);
1874	pr_debug("these are the famous last words\n");
1875}
1876
/* Module entry and exit points */
module_init(ip_set_init);
module_exit(ip_set_fini);
1879