/* Expectation handling for nf_conntrack. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);

unsigned int nf_ct_expect_max __read_mostly;

static struct kmem_cache *nf_ct_expect_cachep __read_mostly;

/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
				u32 portid, int report)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);
	struct net *net = nf_ct_exp_net(exp);

	NF_CT_ASSERT(master_help);
	NF_CT_ASSERT(!timer_pending(&exp->timeout));

	hlist_del_rcu(&exp->hnode);
	net->ct.expect_count--;

	hlist_del(&exp->lnode);
	master_help->expecting[exp->class]--;

	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
	nf_ct_expect_put(exp);

	NF_CT_STAT_INC(net, expect_delete);
}
EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);

static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
	struct nf_conntrack_expect *exp = (void *)ul_expect;

	spin_lock_bh(&nf_conntrack_expect_lock);
	nf_ct_unlink_expect(exp);
	spin_unlock_bh(&nf_conntrack_expect_lock);
	nf_ct_expect_put(exp);
}

static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	if (unlikely(!nf_conntrack_hash_rnd))
		init_nf_conntrack_hash_rnd();

	hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
		      (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
		       (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);

	return reciprocal_scale(hash, nf_ct_expect_hsize);
}
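
/* Note that only the destination part of the tuple (plus l3num and
 * protonum) feeds the hash, so an expectation whose mask wildcards
 * source fields still lands in the same bucket that a matching
 * packet's tuple hashes to.
 */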

struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, u16 zone,
		    const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;
	unsigned int h;

	if (!net->ct.expect_count)
		return NULL;

	h = nf_ct_expect_dst_hash(tuple);
	hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
		    nf_ct_zone(i->master) == zone)
			return i;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_ct_expect_find);

/* Just find an expectation corresponding to a tuple. */
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, u16 zone,
		      const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	rcu_read_lock();
	i = __nf_ct_expect_find(net, zone, tuple);
	if (i && !atomic_inc_not_zero(&i->use))
		i = NULL;
	rcu_read_unlock();

	return i;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
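
/* Illustrative only: a typical caller of the lookup above.  "tuple"
 * and "zone" are assumed to be set up elsewhere; the reference taken
 * by nf_ct_expect_find_get() must be dropped with nf_ct_expect_put().
 *
 *	struct nf_conntrack_expect *exp;
 *
 *	exp = nf_ct_expect_find_get(net, zone, &tuple);
 *	if (exp != NULL) {
 *		... inspect exp ...
 *		nf_ct_expect_put(exp);
 *	}
 */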

/* If an expectation for this connection is found, it is deleted from
 * the global list and then returned. */
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net, u16 zone,
		       const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i, *exp = NULL;
	unsigned int h;

	if (!net->ct.expect_count)
		return NULL;

	h = nf_ct_expect_dst_hash(tuple);
	hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
		    nf_ct_zone(i->master) == zone) {
			exp = i;
			break;
		}
	}
	if (!exp)
		return NULL;

	/* If the master is not in the hash table yet (i.e. the packet
	   hasn't left this machine yet), the other end cannot know
	   about the expectation.  Hence these are not the droids you
	   are looking for (if the master ct never got confirmed, we'd
	   hold a reference to it and weird things would happen to
	   future packets). */
	if (!nf_ct_is_confirmed(exp->master))
		return NULL;

	/* Avoid a race with other CPUs that may be about to invoke
	 * ->destroy() on the exp->master ct, or nf_ct_delete() via
	 * timeout or early_drop().
	 *
	 * If the atomic_inc_not_zero() fails, we know that the ct is
	 * being destroyed.  If it succeeds, we can be sure the ct
	 * cannot disappear underneath us.
	 */
	if (unlikely(nf_ct_is_dying(exp->master) ||
		     !atomic_inc_not_zero(&exp->master->ct_general.use)))
		return NULL;

	if (exp->flags & NF_CT_EXPECT_PERMANENT) {
		atomic_inc(&exp->use);
		return exp;
	} else if (del_timer(&exp->timeout)) {
		nf_ct_unlink_expect(exp);
		return exp;
	}
	/* Undo exp->master refcnt increase, if del_timer() failed */
	nf_ct_put(exp->master);

	return NULL;
}
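
/* On success, two references travel back to the caller: one on the
 * expectation itself (a permanent one stays linked in the table, a
 * one-shot one is unlinked above) and one on exp->master, taken with
 * atomic_inc_not_zero() before the expectation is handed out.
 */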

/* delete all expectations for this conntrack */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
	struct nf_conn_help *help = nfct_help(ct);
	struct nf_conntrack_expect *exp;
	struct hlist_node *next;

	/* Optimization: most connections never expect any others. */
	if (!help)
		return;

	spin_lock_bh(&nf_conntrack_expect_lock);
	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
		if (del_timer(&exp->timeout)) {
			nf_ct_unlink_expect(exp);
			nf_ct_expect_put(exp);
		}
	}
	spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
			       const struct nf_conntrack_expect *b)
{
	/* The part covered by the intersection of both masks must be
	   unequal, otherwise the two expectations clash. */
	struct nf_conntrack_tuple_mask intersect_mask;
	int count;

	intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.src.u3.all[count] =
			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
	}

	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct nf_conntrack_expect *a,
				 const struct nf_conntrack_expect *b)
{
	return a->master == b->master && a->class == b->class &&
		nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
		nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
		nf_ct_zone(a->master) == nf_ct_zone(b->master);
}
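
/* The two predicates above serve different callers: expect_matches()
 * demands exact equality (same master, class, tuple, mask and zone)
 * and is used to replace a re-sent expectation, while expect_clash()
 * only checks whether two expectations could match the same packet
 * under the intersection of their masks, e.g. an exact 5-tuple
 * expectation clashes with a wildcard-source one for the same
 * destination.
 */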

/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
	spin_lock_bh(&nf_conntrack_expect_lock);
	if (del_timer(&exp->timeout)) {
		nf_ct_unlink_expect(exp);
		nf_ct_expect_put(exp);
	}
	spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);

/* We don't increase the master conntrack refcount for non-fulfilled
 * expectations. During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
	struct nf_conntrack_expect *new;

	new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
	if (!new)
		return NULL;

	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
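
/* The allocation above is GFP_ATOMIC because conntrack helpers create
 * expectations from the packet path (softirq context), where sleeping
 * is not allowed.
 */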

void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
		       u_int8_t family,
		       const union nf_inet_addr *saddr,
		       const union nf_inet_addr *daddr,
		       u_int8_t proto, const __be16 *src, const __be16 *dst)
{
	int len;

	if (family == AF_INET)
		len = 4;
	else
		len = 16;

	exp->flags = 0;
	exp->class = class;
	exp->expectfn = NULL;
	exp->helper = NULL;
	exp->tuple.src.l3num = family;
	exp->tuple.dst.protonum = proto;

	if (saddr) {
		memcpy(&exp->tuple.src.u3, saddr, len);
		if (sizeof(exp->tuple.src.u3) > len)
			/* address needs to be cleared for nf_ct_tuple_equal */
			memset((void *)&exp->tuple.src.u3 + len, 0x00,
			       sizeof(exp->tuple.src.u3) - len);
		memset(&exp->mask.src.u3, 0xFF, len);
		if (sizeof(exp->mask.src.u3) > len)
			memset((void *)&exp->mask.src.u3 + len, 0x00,
			       sizeof(exp->mask.src.u3) - len);
	} else {
		memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
		memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
	}

	if (src) {
		exp->tuple.src.u.all = *src;
		exp->mask.src.u.all = htons(0xFFFF);
	} else {
		exp->tuple.src.u.all = 0;
		exp->mask.src.u.all = 0;
	}

	memcpy(&exp->tuple.dst.u3, daddr, len);
	if (sizeof(exp->tuple.dst.u3) > len)
		/* address needs to be cleared for nf_ct_tuple_equal */
		memset((void *)&exp->tuple.dst.u3 + len, 0x00,
		       sizeof(exp->tuple.dst.u3) - len);

	exp->tuple.dst.u.all = *dst;

#ifdef CONFIG_NF_NAT_NEEDED
	memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
	memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
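
/* Illustrative helper-side sketch (not taken from this file): the
 * canonical sequence a conntrack helper uses to register an
 * expectation.  "ct" is the master connection and "port" the expected
 * destination port; error handling is trimmed.
 *
 *	struct nf_conntrack_expect *exp;
 *
 *	exp = nf_ct_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
 *			  &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
 *			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
 *			  IPPROTO_TCP, NULL, &port);
 *	if (nf_ct_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	nf_ct_expect_put(exp);
 */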

static void nf_ct_expect_free_rcu(struct rcu_head *head)
{
	struct nf_conntrack_expect *exp;

	exp = container_of(head, struct nf_conntrack_expect, rcu);
	kmem_cache_free(nf_ct_expect_cachep, exp);
}

void nf_ct_expect_put(struct nf_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);

static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);
	struct nf_conntrack_helper *helper;
	struct net *net = nf_ct_exp_net(exp);
	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);

	/* two references: one for the hash insert, one for the timer */
	atomic_add(2, &exp->use);

	hlist_add_head(&exp->lnode, &master_help->expectations);
	master_help->expecting[exp->class]++;

	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
	net->ct.expect_count++;

	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
		    (unsigned long)exp);
	helper = rcu_dereference_protected(master_help->helper,
					   lockdep_is_held(&nf_conntrack_expect_lock));
	if (helper) {
		exp->timeout.expires = jiffies +
			helper->expect_policy[exp->class].timeout * HZ;
	}
	add_timer(&exp->timeout);

	NF_CT_STAT_INC(net, expect_create);
	return 0;
}
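
/* Reference accounting: nf_ct_expect_insert() takes two references,
 * one for the hash table and one for the timer.  The hash reference
 * is dropped inside nf_ct_unlink_expect_report(); the timer reference
 * is dropped either by nf_ct_expectation_timed_out() or, when
 * del_timer() wins the race, by the extra nf_ct_expect_put() of
 * whoever stopped the timer (see nf_ct_remove_expectations() above).
 */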

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master,
				struct nf_conntrack_expect *new)
{
	struct nf_conn_help *master_help = nfct_help(master);
	struct nf_conntrack_expect *exp, *last = NULL;

	hlist_for_each_entry(exp, &master_help->expectations, lnode) {
		if (exp->class == new->class)
			last = exp;
	}

	if (last && del_timer(&last->timeout)) {
		nf_ct_unlink_expect(last);
		nf_ct_expect_put(last);
	}
}

static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
	const struct nf_conntrack_expect_policy *p;
	struct nf_conntrack_expect *i;
	struct nf_conn *master = expect->master;
	struct nf_conn_help *master_help = nfct_help(master);
	struct nf_conntrack_helper *helper;
	struct net *net = nf_ct_exp_net(expect);
	struct hlist_node *next;
	unsigned int h;
	int ret = 1;

	if (!master_help) {
		ret = -ESHUTDOWN;
		goto out;
	}
	h = nf_ct_expect_dst_hash(&expect->tuple);
	hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
		if (expect_matches(i, expect)) {
			if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				nf_ct_expect_put(i);
				break;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}
	/* Will be over limit? */
	helper = rcu_dereference_protected(master_help->helper,
					   lockdep_is_held(&nf_conntrack_expect_lock));
	if (helper) {
		p = &helper->expect_policy[expect->class];
		if (p->max_expected &&
		    master_help->expecting[expect->class] >= p->max_expected) {
			evict_oldest_expect(master, expect);
			if (master_help->expecting[expect->class]
						>= p->max_expected) {
				ret = -EMFILE;
				goto out;
			}
		}
	}

	if (net->ct.expect_count >= nf_ct_expect_max) {
		net_warn_ratelimited("nf_conntrack: expectation table full\n");
		ret = -EMFILE;
	}
out:
	return ret;
}
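
/* __nf_ct_expect_check() returns 1 when the expectation may be
 * inserted and a negative errno otherwise; the caller below treats
 * anything <= 0 as "do not insert".
 */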

int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
				u32 portid, int report)
{
	int ret;

	spin_lock_bh(&nf_conntrack_expect_lock);
	ret = __nf_ct_expect_check(expect);
	if (ret <= 0)
		goto out;

	ret = nf_ct_expect_insert(expect);
	if (ret < 0)
		goto out;
	spin_unlock_bh(&nf_conntrack_expect_lock);
	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
	return ret;
out:
	spin_unlock_bh(&nf_conntrack_expect_lock);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
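
/* Most in-kernel users go through the nf_ct_expect_related() inline
 * from <net/netfilter/nf_conntrack_expect.h>, which is simply this
 * function with portid 0 and report 0.
 */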

#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct ct_expect_iter_state {
	struct seq_net_private p;
	unsigned int bucket;
};

static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
	struct net *net = seq_file_net(seq);
	struct ct_expect_iter_state *st = seq->private;
	struct hlist_node *n;

	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
		n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
		if (n)
			return n;
	}
	return NULL;
}

static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
					     struct hlist_node *head)
{
	struct net *net = seq_file_net(seq);
	struct ct_expect_iter_state *st = seq->private;

	head = rcu_dereference(hlist_next_rcu(head));
	while (head == NULL) {
		if (++st->bucket >= nf_ct_expect_hsize)
			return NULL;
		head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
	}
	return head;
}

static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
{
	struct hlist_node *head = ct_expect_get_first(seq);

	if (head)
		while (pos && (head = ct_expect_get_next(seq, head)))
			pos--;
	return pos ? NULL : head;
}

static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return ct_expect_get_idx(seq, *pos);
}

static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return ct_expect_get_next(seq, v);
}

static void exp_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int exp_seq_show(struct seq_file *s, void *v)
{
	struct nf_conntrack_expect *expect;
	struct nf_conntrack_helper *helper;
	struct hlist_node *n = v;
	char *delim = "";

	expect = hlist_entry(n, struct nf_conntrack_expect, hnode);

	if (expect->timeout.function)
		seq_printf(s, "%ld ", timer_pending(&expect->timeout)
			   ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
	else
		seq_printf(s, "- ");
	seq_printf(s, "l3proto = %u proto=%u ",
		   expect->tuple.src.l3num,
		   expect->tuple.dst.protonum);
	print_tuple(s, &expect->tuple,
		    __nf_ct_l3proto_find(expect->tuple.src.l3num),
		    __nf_ct_l4proto_find(expect->tuple.src.l3num,
					 expect->tuple.dst.protonum));

	if (expect->flags & NF_CT_EXPECT_PERMANENT) {
		seq_printf(s, "PERMANENT");
		delim = ",";
	}
	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
		seq_printf(s, "%sINACTIVE", delim);
		delim = ",";
	}
	if (expect->flags & NF_CT_EXPECT_USERSPACE)
		seq_printf(s, "%sUSERSPACE", delim);

	helper = rcu_dereference(nfct_help(expect->master)->helper);
	if (helper) {
		seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
		if (helper->expect_policy[expect->class].name)
			seq_printf(s, "/%s",
				   helper->expect_policy[expect->class].name);
	}

	return seq_putc(s, '\n');
}

static const struct seq_operations exp_seq_ops = {
	.start = exp_seq_start,
	.next = exp_seq_next,
	.stop = exp_seq_stop,
	.show = exp_seq_show
};

static int exp_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &exp_seq_ops,
			    sizeof(struct ct_expect_iter_state));
}

static const struct file_operations exp_file_ops = {
	.owner   = THIS_MODULE,
	.open    = exp_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
#endif /* CONFIG_NF_CONNTRACK_PROCFS */

static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	struct proc_dir_entry *proc;

	proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
			   &exp_file_ops);
	if (!proc)
		return -ENOMEM;
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
	return 0;
}

static void exp_proc_remove(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	remove_proc_entry("nf_conntrack_expect", net->proc_net);
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
}

module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);

int nf_conntrack_expect_pernet_init(struct net *net)
{
	int err = -ENOMEM;

	net->ct.expect_count = 0;
	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
	if (net->ct.expect_hash == NULL)
		goto err1;

	err = exp_proc_init(net);
	if (err < 0)
		goto err2;

	return 0;
err2:
	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
	return err;
}

void nf_conntrack_expect_pernet_fini(struct net *net)
{
	exp_proc_remove(net);
	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}

int nf_conntrack_expect_init(void)
{
	if (!nf_ct_expect_hsize) {
		nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
		if (!nf_ct_expect_hsize)
			nf_ct_expect_hsize = 1;
	}
	nf_ct_expect_max = nf_ct_expect_hsize * 4;
	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
				sizeof(struct nf_conntrack_expect),
				0, 0, NULL);
	if (!nf_ct_expect_cachep)
		return -ENOMEM;
	return 0;
}
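
/* Sizing example: with a conntrack hash table of, say,
 * nf_conntrack_htable_size = 16384, the defaults above come out to
 * nf_ct_expect_hsize = 16384 / 256 = 64 buckets and
 * nf_ct_expect_max = 64 * 4 = 256 concurrent expectations, unless
 * overridden via the expect_hashsize module parameter.
 */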

void nf_conntrack_expect_fini(void)
{
	rcu_barrier(); /* Wait for call_rcu() before destroy */
	kmem_cache_destroy(nf_ct_expect_cachep);
}