nf_conntrack_core.c revision 7c9728c393dceb724d66d696cfabce82151a78e5
1/* Connection state tracking for netfilter.  This is separated from,
2   but required by, the NAT layer; it can also be used by an iptables
3   extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 *	- new API and handling of conntrack/nat helpers
15 *	- now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 *	- add usage/reference counts to ip_conntrack_expect
18 *	- export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 20 *	- generalize L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 22 *	- add support for various sizes of conntrack structures.
23 * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24 * 	- restructure nf_conn (introduce nf_conn_help)
25 * 	- redesign 'features' how they were originally intended
26 * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27 * 	- add support for L3 protocol module load on demand.
28 *
29 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30 */
31
32#include <linux/config.h>
33#include <linux/types.h>
34#include <linux/netfilter.h>
35#include <linux/module.h>
36#include <linux/skbuff.h>
37#include <linux/proc_fs.h>
38#include <linux/vmalloc.h>
39#include <linux/stddef.h>
40#include <linux/slab.h>
41#include <linux/random.h>
42#include <linux/jhash.h>
43#include <linux/err.h>
44#include <linux/percpu.h>
45#include <linux/moduleparam.h>
46#include <linux/notifier.h>
47#include <linux/kernel.h>
48#include <linux/netdevice.h>
49#include <linux/socket.h>
50
51/* This rwlock protects the main hash table, protocol/helper/expected
52   registrations, conntrack timers. */
53#define ASSERT_READ_LOCK(x)
54#define ASSERT_WRITE_LOCK(x)
55
56#include <net/netfilter/nf_conntrack.h>
57#include <net/netfilter/nf_conntrack_l3proto.h>
58#include <net/netfilter/nf_conntrack_protocol.h>
59#include <net/netfilter/nf_conntrack_helper.h>
60#include <net/netfilter/nf_conntrack_core.h>
61#include <linux/netfilter_ipv4/listhelp.h>
62
63#define NF_CONNTRACK_VERSION	"0.5.0"
64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(format, args...)
69#endif
70
71DEFINE_RWLOCK(nf_conntrack_lock);
72
73/* nf_conntrack_standalone needs this */
74atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77LIST_HEAD(nf_conntrack_expect_list);
78struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80static LIST_HEAD(helpers);
81unsigned int nf_conntrack_htable_size = 0;
82int nf_conntrack_max;
83struct list_head *nf_conntrack_hash;
84static kmem_cache_t *nf_conntrack_expect_cachep;
85struct nf_conn nf_conntrack_untracked;
86unsigned int nf_ct_log_invalid;
87static LIST_HEAD(unconfirmed);
88static int nf_conntrack_vmalloc;
89
90static unsigned int nf_conntrack_next_id;
91static unsigned int nf_conntrack_expect_next_id;
92#ifdef CONFIG_NF_CONNTRACK_EVENTS
93ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
94ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
95
96DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98/* deliver cached events and clear cache entry - must be called with locally
99 * disabled softirqs */
100static inline void
101__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102{
103	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105	    && ecache->events)
106		atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
107				    ecache->ct);
108
109	ecache->events = 0;
110	nf_ct_put(ecache->ct);
111	ecache->ct = NULL;
112}
113
114/* Deliver all cached events for a particular conntrack. This is called
115 * by code prior to async packet handling for freeing the skb */
116void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117{
118	struct nf_conntrack_ecache *ecache;
119
120	local_bh_disable();
121	ecache = &__get_cpu_var(nf_conntrack_ecache);
122	if (ecache->ct == ct)
123		__nf_ct_deliver_cached_events(ecache);
124	local_bh_enable();
125}
126
127/* Deliver events still cached for an old conntrack before caching for this one */
128void __nf_ct_event_cache_init(struct nf_conn *ct)
129{
130	struct nf_conntrack_ecache *ecache;
131
132	/* take care of delivering potentially old events */
133	ecache = &__get_cpu_var(nf_conntrack_ecache);
134	BUG_ON(ecache->ct == ct);
135	if (ecache->ct)
136		__nf_ct_deliver_cached_events(ecache);
137	/* initialize for this conntrack/packet */
138	ecache->ct = ct;
139	nf_conntrack_get(&ct->ct_general);
140}
141
142/* flush the event cache - touches other CPUs' data and must not be called
143 * while packets are still passing through the code */
144static void nf_ct_event_cache_flush(void)
145{
146	struct nf_conntrack_ecache *ecache;
147	int cpu;
148
149	for_each_possible_cpu(cpu) {
150		ecache = &per_cpu(nf_conntrack_ecache, cpu);
151		if (ecache->ct)
152			nf_ct_put(ecache->ct);
153	}
154}
155#else
156static inline void nf_ct_event_cache_flush(void) {}
157#endif /* CONFIG_NF_CONNTRACK_EVENTS */
158
159DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162/*
163 * This scheme offers various sizes of "struct nf_conn" depending on
164 * the features (helper, nat, ...)
165 */
166
167#define NF_CT_FEATURES_NAMELEN	256
168static struct {
169	/* name of slab cache. printed in /proc/slabinfo */
170	char *name;
171
172	/* size of slab cache */
173	size_t size;
174
175	/* slab cache pointer */
176	kmem_cache_t *cachep;
177
178	/* allocated slab cache + modules which use this slab cache */
179	int use;
180
181} nf_ct_cache[NF_CT_F_NUM];
182
183/* protect members of nf_ct_cache except "use" */
184DEFINE_RWLOCK(nf_ct_cache_lock);
185
186/* This avoids calling kmem_cache_create() with the same name simultaneously */
187static DEFINE_MUTEX(nf_ct_cache_mutex);
188
189extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
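/* Look up the L4 protocol tracker registered for (l3proto, protocol).
 * Falls back to the generic protocol if the L3 family is out of range
 * or has no protocol table registered.  The caller must keep the
 * returned tracker from going away (preemption disabled or a module
 * reference, as nf_ct_proto_find_get() does). */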
190struct nf_conntrack_protocol *
191__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
192{
193	if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
194		return &nf_conntrack_generic_protocol;
195
196	return nf_ct_protos[l3proto][protocol];
197}
198
199/* this is guaranteed to always return a valid protocol helper, since
200 * it falls back to generic_protocol */
201struct nf_conntrack_protocol *
202nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
203{
204	struct nf_conntrack_protocol *p;
205
206	preempt_disable();
207	p = __nf_ct_proto_find(l3proto, protocol);
208	if (!try_module_get(p->me))
209		p = &nf_conntrack_generic_protocol;
210	preempt_enable();
211
212	return p;
213}
214
215void nf_ct_proto_put(struct nf_conntrack_protocol *p)
216{
217	module_put(p->me);
218}
219
220struct nf_conntrack_l3proto *
221nf_ct_l3proto_find_get(u_int16_t l3proto)
222{
223	struct nf_conntrack_l3proto *p;
224
225	preempt_disable();
226	p = __nf_ct_l3proto_find(l3proto);
227	if (!try_module_get(p->me))
228		p = &nf_conntrack_generic_l3proto;
229	preempt_enable();
230
231	return p;
232}
233
234void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
235{
236	module_put(p->me);
237}
238
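/* nf_ct_l3proto_try_module_get()/nf_ct_l3proto_module_put() bracket
 * on-demand use of an L3 tracker: the get requests the
 * "nf_conntrack-%d" module for that family if only the generic L3
 * tracker is registered and pins it; the put drops the reference. */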
239int
240nf_ct_l3proto_try_module_get(unsigned short l3proto)
241{
242	int ret;
243	struct nf_conntrack_l3proto *p;
244
245retry:	p = nf_ct_l3proto_find_get(l3proto);
246	if (p == &nf_conntrack_generic_l3proto) {
247		ret = request_module("nf_conntrack-%d", l3proto);
248		if (!ret)
249			goto retry;
250
251		return -EPROTOTYPE;
252	}
253
254	return 0;
255}
256
257void nf_ct_l3proto_module_put(unsigned short l3proto)
258{
259	struct nf_conntrack_l3proto *p;
260
261	preempt_disable();
262	p = __nf_ct_l3proto_find(l3proto);
263	preempt_enable();
264
265	module_put(p->me);
266}
267
268static int nf_conntrack_hash_rnd_initted;
269static unsigned int nf_conntrack_hash_rnd;
270
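/* Map a tuple to a hash bucket in [0, size): jhash the source address
 * keyed by (l3num << 16 | protonum), jhash the destination address
 * keyed by the source/destination protocol ids, then mix the two with
 * the random seed (nf_conntrack_hash_rnd) so that chain placement is
 * hard for an attacker to predict. */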
271static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
272				  unsigned int size, unsigned int rnd)
273{
274	unsigned int a, b;
275	a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
276		  ((tuple->src.l3num) << 16) | tuple->dst.protonum);
277	b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
278			(tuple->src.u.all << 16) | tuple->dst.u.all);
279
280	return jhash_2words(a, b, rnd) % size;
281}
282
283static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
284{
285	return __hash_conntrack(tuple, nf_conntrack_htable_size,
286				nf_conntrack_hash_rnd);
287}
288
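/* Create (or reuse) the slab cache used for conntracks with the given
 * feature combination.  If a cache for these features already exists
 * it is reused only when name and size match, otherwise -EBUSY is
 * returned; the use count tracks how many callers registered it. */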
289int nf_conntrack_register_cache(u_int32_t features, const char *name,
290				size_t size)
291{
292	int ret = 0;
293	char *cache_name;
294	kmem_cache_t *cachep;
295
296	DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
297	       features, name, size);
298
299	if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
300		DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
301			features);
302		return -EINVAL;
303	}
304
305	mutex_lock(&nf_ct_cache_mutex);
306
307	write_lock_bh(&nf_ct_cache_lock);
308	/* e.g.: multiple helpers are loaded */
309	if (nf_ct_cache[features].use > 0) {
310		DEBUGP("nf_conntrack_register_cache: already registered.\n");
311		if ((!strncmp(nf_ct_cache[features].name, name,
312			      NF_CT_FEATURES_NAMELEN))
313		    && nf_ct_cache[features].size == size) {
314			DEBUGP("nf_conntrack_register_cache: reusing.\n");
315			nf_ct_cache[features].use++;
316			ret = 0;
317		} else
318			ret = -EBUSY;
319
320		write_unlock_bh(&nf_ct_cache_lock);
321		mutex_unlock(&nf_ct_cache_mutex);
322		return ret;
323	}
324	write_unlock_bh(&nf_ct_cache_lock);
325
326	/*
327	 * The memory space for the name of the slab cache must stay alive
328	 * until the cache is destroyed.
329	 */
330	cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
331	if (cache_name == NULL) {
332		DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
333		ret = -ENOMEM;
334		goto out_up_mutex;
335	}
336
337	if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
338						>= NF_CT_FEATURES_NAMELEN) {
339		printk("nf_conntrack_register_cache: name too long\n");
340		ret = -EINVAL;
341		goto out_free_name;
342	}
343
344	cachep = kmem_cache_create(cache_name, size, 0, 0,
345				   NULL, NULL);
346	if (!cachep) {
347		printk("nf_conntrack_register_cache: Can't create slab cache "
348		       "for the features = 0x%x\n", features);
349		ret = -ENOMEM;
350		goto out_free_name;
351	}
352
353	write_lock_bh(&nf_ct_cache_lock);
354	nf_ct_cache[features].use = 1;
355	nf_ct_cache[features].size = size;
356	nf_ct_cache[features].cachep = cachep;
357	nf_ct_cache[features].name = cache_name;
358	write_unlock_bh(&nf_ct_cache_lock);
359
360	goto out_up_mutex;
361
362out_free_name:
363	kfree(cache_name);
364out_up_mutex:
365	mutex_unlock(&nf_ct_cache_mutex);
366	return ret;
367}
368
369/* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
370void nf_conntrack_unregister_cache(u_int32_t features)
371{
372	kmem_cache_t *cachep;
373	char *name;
374
375	/*
376	 * This ensures that kmem_cache_create() isn't called before the
377	 * slab cache is destroyed.
378	 */
379	DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
380	mutex_lock(&nf_ct_cache_mutex);
381
382	write_lock_bh(&nf_ct_cache_lock);
383	if (--nf_ct_cache[features].use > 0) {
384		write_unlock_bh(&nf_ct_cache_lock);
385		mutex_unlock(&nf_ct_cache_mutex);
386		return;
387	}
388	cachep = nf_ct_cache[features].cachep;
389	name = nf_ct_cache[features].name;
390	nf_ct_cache[features].cachep = NULL;
391	nf_ct_cache[features].name = NULL;
392	nf_ct_cache[features].size = 0;
393	write_unlock_bh(&nf_ct_cache_lock);
394
395	synchronize_net();
396
397	kmem_cache_destroy(cachep);
398	kfree(name);
399
400	mutex_unlock(&nf_ct_cache_mutex);
401}
402
403int
404nf_ct_get_tuple(const struct sk_buff *skb,
405		unsigned int nhoff,
406		unsigned int dataoff,
407		u_int16_t l3num,
408		u_int8_t protonum,
409		struct nf_conntrack_tuple *tuple,
410		const struct nf_conntrack_l3proto *l3proto,
411		const struct nf_conntrack_protocol *protocol)
412{
413	NF_CT_TUPLE_U_BLANK(tuple);
414
415	tuple->src.l3num = l3num;
416	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
417		return 0;
418
419	tuple->dst.protonum = protonum;
420	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
421
422	return protocol->pkt_to_tuple(skb, dataoff, tuple);
423}
424
425int
426nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
427		   const struct nf_conntrack_tuple *orig,
428		   const struct nf_conntrack_l3proto *l3proto,
429		   const struct nf_conntrack_protocol *protocol)
430{
431	NF_CT_TUPLE_U_BLANK(inverse);
432
433	inverse->src.l3num = orig->src.l3num;
434	if (l3proto->invert_tuple(inverse, orig) == 0)
435		return 0;
436
437	inverse->dst.dir = !orig->dst.dir;
438
439	inverse->dst.protonum = orig->dst.protonum;
440	return protocol->invert_tuple(inverse, orig);
441}
442
443/* nf_conntrack_expect helper functions */
444void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
445{
446	struct nf_conn_help *master_help = nfct_help(exp->master);
447
448	NF_CT_ASSERT(master_help);
449	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
450	NF_CT_ASSERT(!timer_pending(&exp->timeout));
451
452	list_del(&exp->list);
453	NF_CT_STAT_INC(expect_delete);
454	master_help->expecting--;
455	nf_conntrack_expect_put(exp);
456}
457
458static void expectation_timed_out(unsigned long ul_expect)
459{
460	struct nf_conntrack_expect *exp = (void *)ul_expect;
461
462	write_lock_bh(&nf_conntrack_lock);
463	nf_ct_unlink_expect(exp);
464	write_unlock_bh(&nf_conntrack_lock);
465	nf_conntrack_expect_put(exp);
466}
467
468struct nf_conntrack_expect *
469__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
470{
471	struct nf_conntrack_expect *i;
472
473	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
474		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
475			atomic_inc(&i->use);
476			return i;
477		}
478	}
479	return NULL;
480}
481
482/* Just find an expectation corresponding to a tuple. */
483struct nf_conntrack_expect *
484nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
485{
486	struct nf_conntrack_expect *i;
487
488	read_lock_bh(&nf_conntrack_lock);
489	i = __nf_conntrack_expect_find(tuple);
490	read_unlock_bh(&nf_conntrack_lock);
491
492	return i;
493}
494
495/* If an expectation for this connection is found, it gets deleted from
496 * the global list and then returned. */
497static struct nf_conntrack_expect *
498find_expectation(const struct nf_conntrack_tuple *tuple)
499{
500	struct nf_conntrack_expect *i;
501
502	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
503	/* If master is not in the hash table yet (i.e. the packet hasn't left
504	   this machine yet), how can the other end know about the expectation?
505	   Hence these are not the droids you are looking for (if
506	   master ct never got confirmed, we'd hold a reference to it
507	   and weird things would happen to future packets). */
508		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
509		    && nf_ct_is_confirmed(i->master)) {
510			if (i->flags & NF_CT_EXPECT_PERMANENT) {
511				atomic_inc(&i->use);
512				return i;
513			} else if (del_timer(&i->timeout)) {
514				nf_ct_unlink_expect(i);
515				return i;
516			}
517		}
518	}
519	return NULL;
520}
521
522/* delete all expectations for this conntrack */
523void nf_ct_remove_expectations(struct nf_conn *ct)
524{
525	struct nf_conntrack_expect *i, *tmp;
526	struct nf_conn_help *help = nfct_help(ct);
527
528	/* Optimization: most connections never expect any others. */
529	if (!help || help->expecting == 0)
530		return;
531
532	list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
533		if (i->master == ct && del_timer(&i->timeout)) {
534			nf_ct_unlink_expect(i);
535			nf_conntrack_expect_put(i);
536 		}
537	}
538}
539
540static void
541clean_from_lists(struct nf_conn *ct)
542{
543	unsigned int ho, hr;
544
545	DEBUGP("clean_from_lists(%p)\n", ct);
546	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
547
548	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
549	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
550	LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
551	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
552
553	/* Destroy all pending expectations */
554	nf_ct_remove_expectations(ct);
555}
556
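/* Final destructor, invoked via ct_general.destroy once the reference
 * count drops to zero: delivers IPCT_DESTROY, runs the L3/L4 destroy
 * hooks, removes any expectations that are still around, unlinks an
 * unconfirmed conntrack from the unconfirmed list, drops the master
 * reference and returns the object to its slab cache. */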
557static void
558destroy_conntrack(struct nf_conntrack *nfct)
559{
560	struct nf_conn *ct = (struct nf_conn *)nfct;
561	struct nf_conntrack_l3proto *l3proto;
562	struct nf_conntrack_protocol *proto;
563
564	DEBUGP("destroy_conntrack(%p)\n", ct);
565	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
566	NF_CT_ASSERT(!timer_pending(&ct->timeout));
567
568	nf_conntrack_event(IPCT_DESTROY, ct);
569	set_bit(IPS_DYING_BIT, &ct->status);
570
571	/* To make sure we don't get any weird locking issues here:
572	 * destroy_conntrack() MUST NOT be called with a write lock
573	 * to nf_conntrack_lock!!! -HW */
574	l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
575	if (l3proto && l3proto->destroy)
576		l3proto->destroy(ct);
577
578	proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
579	if (proto && proto->destroy)
580		proto->destroy(ct);
581
582	if (nf_conntrack_destroyed)
583		nf_conntrack_destroyed(ct);
584
585	write_lock_bh(&nf_conntrack_lock);
586	/* Expectations will have been removed in clean_from_lists,
587	 * except TFTP can create an expectation on the first packet,
588	 * before the connection is in the list, so we need to clean here,
589	 * too. */
590	nf_ct_remove_expectations(ct);
591
592	/* We overload first tuple to link into unconfirmed list. */
593	if (!nf_ct_is_confirmed(ct)) {
594		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
595		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
596	}
597
598	NF_CT_STAT_INC(delete);
599	write_unlock_bh(&nf_conntrack_lock);
600
601	if (ct->master)
602		nf_ct_put(ct->master);
603
604	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
605	nf_conntrack_free(ct);
606}
607
608static void death_by_timeout(unsigned long ul_conntrack)
609{
610	struct nf_conn *ct = (void *)ul_conntrack;
611
612	write_lock_bh(&nf_conntrack_lock);
613	/* Inside lock so preempt is disabled on module removal path.
614	 * Otherwise we can get spurious warnings. */
615	NF_CT_STAT_INC(delete_list);
616	clean_from_lists(ct);
617	write_unlock_bh(&nf_conntrack_lock);
618	nf_ct_put(ct);
619}
620
621static inline int
622conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
623		    const struct nf_conntrack_tuple *tuple,
624		    const struct nf_conn *ignored_conntrack)
625{
626	ASSERT_READ_LOCK(&nf_conntrack_lock);
627	return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
628		&& nf_ct_tuple_equal(tuple, &i->tuple);
629}
630
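/* Look up a tuple in its hash chain, skipping ignored_conntrack (used
 * by NAT to exclude the conntrack currently being set up).  Must be
 * called with nf_conntrack_lock held at least for reading; no
 * reference is taken, see nf_conntrack_find_get() for that. */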
631struct nf_conntrack_tuple_hash *
632__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
633		    const struct nf_conn *ignored_conntrack)
634{
635	struct nf_conntrack_tuple_hash *h;
636	unsigned int hash = hash_conntrack(tuple);
637
638	ASSERT_READ_LOCK(&nf_conntrack_lock);
639	list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
640		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
641			NF_CT_STAT_INC(found);
642			return h;
643		}
644		NF_CT_STAT_INC(searched);
645	}
646
647	return NULL;
648}
649
650/* Find a connection corresponding to a tuple. */
651struct nf_conntrack_tuple_hash *
652nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
653		      const struct nf_conn *ignored_conntrack)
654{
655	struct nf_conntrack_tuple_hash *h;
656
657	read_lock_bh(&nf_conntrack_lock);
658	h = __nf_conntrack_find(tuple, ignored_conntrack);
659	if (h)
660		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
661	read_unlock_bh(&nf_conntrack_lock);
662
663	return h;
664}
665
666static void __nf_conntrack_hash_insert(struct nf_conn *ct,
667				       unsigned int hash,
668				       unsigned int repl_hash)
669{
670	ct->id = ++nf_conntrack_next_id;
671	list_prepend(&nf_conntrack_hash[hash],
672		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
673	list_prepend(&nf_conntrack_hash[repl_hash],
674		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
675}
676
677void nf_conntrack_hash_insert(struct nf_conn *ct)
678{
679	unsigned int hash, repl_hash;
680
681	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
682	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
683
684	write_lock_bh(&nf_conntrack_lock);
685	__nf_conntrack_hash_insert(ct, hash, repl_hash);
686	write_unlock_bh(&nf_conntrack_lock);
687}
688
689/* Confirm a connection given skb; places it in hash table */
690int
691__nf_conntrack_confirm(struct sk_buff **pskb)
692{
693	unsigned int hash, repl_hash;
694	struct nf_conn *ct;
695	enum ip_conntrack_info ctinfo;
696
697	ct = nf_ct_get(*pskb, &ctinfo);
698
699	/* ipt_REJECT uses nf_conntrack_attach to attach related
700	   ICMP/TCP RST packets in the other direction.  The actual packet
701	   which created the connection will be IP_CT_NEW, or IP_CT_RELATED
702	   for an expected connection. */
703	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
704		return NF_ACCEPT;
705
706	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
707	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
708
709	/* We're not in the hash table, and we refuse to set up related
710	   connections for unconfirmed conns.  But packet copies and
711	   REJECT will give spurious warnings here. */
712	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
713
714	/* No external references means no one else could have
715	   confirmed us. */
716	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
717	DEBUGP("Confirming conntrack %p\n", ct);
718
719	write_lock_bh(&nf_conntrack_lock);
720
721	/* See if there's one in the list already, including reverse:
722	   NAT could have grabbed it without realizing, since we're
723	   not in the hash.  If there is, we lost the race. */
724	if (!LIST_FIND(&nf_conntrack_hash[hash],
725		       conntrack_tuple_cmp,
726		       struct nf_conntrack_tuple_hash *,
727		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
728	    && !LIST_FIND(&nf_conntrack_hash[repl_hash],
729			  conntrack_tuple_cmp,
730			  struct nf_conntrack_tuple_hash *,
731			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
732		struct nf_conn_help *help;
733		/* Remove from unconfirmed list */
734		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
735
736		__nf_conntrack_hash_insert(ct, hash, repl_hash);
737		/* Timer relative to confirmation time, not original
738		   setting time, otherwise we'd get timer wrap in
739		   weird delay cases. */
740		ct->timeout.expires += jiffies;
741		add_timer(&ct->timeout);
742		atomic_inc(&ct->ct_general.use);
743		set_bit(IPS_CONFIRMED_BIT, &ct->status);
744		NF_CT_STAT_INC(insert);
745		write_unlock_bh(&nf_conntrack_lock);
746		help = nfct_help(ct);
747		if (help && help->helper)
748			nf_conntrack_event_cache(IPCT_HELPER, *pskb);
749#ifdef CONFIG_NF_NAT_NEEDED
750		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
751		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
752			nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
753#endif
754		nf_conntrack_event_cache(master_ct(ct) ?
755					 IPCT_RELATED : IPCT_NEW, *pskb);
756		return NF_ACCEPT;
757	}
758
759	NF_CT_STAT_INC(insert_failed);
760	write_unlock_bh(&nf_conntrack_lock);
761	return NF_DROP;
762}
763
764/* Returns true if a connection corresponds to the tuple (required
765   for NAT). */
766int
767nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
768			 const struct nf_conn *ignored_conntrack)
769{
770	struct nf_conntrack_tuple_hash *h;
771
772	read_lock_bh(&nf_conntrack_lock);
773	h = __nf_conntrack_find(tuple, ignored_conntrack);
774	read_unlock_bh(&nf_conntrack_lock);
775
776	return h != NULL;
777}
778
779/* There's a small race here where we may free a just-assured
780   connection.  Too bad: we're in trouble anyway. */
781static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
782{
783	return !(test_bit(IPS_ASSURED_BIT,
784			  &nf_ct_tuplehash_to_ctrack(i)->status));
785}
786
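/* The table is full: try to free one not-yet-assured conntrack from
 * the given hash chain so a new connection can be tracked.  Returns
 * 1 if an entry was dropped, 0 otherwise. */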
787static int early_drop(struct list_head *chain)
788{
789	/* Traverse backwards: gives us oldest, which is roughly LRU */
790	struct nf_conntrack_tuple_hash *h;
791	struct nf_conn *ct = NULL;
792	int dropped = 0;
793
794	read_lock_bh(&nf_conntrack_lock);
795	h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
796	if (h) {
797		ct = nf_ct_tuplehash_to_ctrack(h);
798		atomic_inc(&ct->ct_general.use);
799	}
800	read_unlock_bh(&nf_conntrack_lock);
801
802	if (!ct)
803		return dropped;
804
805	if (del_timer(&ct->timeout)) {
806		death_by_timeout((unsigned long)ct);
807		dropped = 1;
808		NF_CT_STAT_INC(early_drop);
809	}
810	nf_ct_put(ct);
811	return dropped;
812}
813
814static inline int helper_cmp(const struct nf_conntrack_helper *i,
815			     const struct nf_conntrack_tuple *rtuple)
816{
817	return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
818}
819
820static struct nf_conntrack_helper *
821__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
822{
823	return LIST_FIND(&helpers, helper_cmp,
824			 struct nf_conntrack_helper *,
825			 tuple);
826}
827
828struct nf_conntrack_helper *
829nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
830{
831	struct nf_conntrack_helper *helper;
832
833	/* need nf_conntrack_lock to ensure that the helper exists until
834	 * try_module_get() is called */
835	read_lock_bh(&nf_conntrack_lock);
836
837	helper = __nf_ct_helper_find(tuple);
838	if (helper) {
839		/* need to increase module usage count to ensure the helper will
840		 * not go away while the caller is e.g. busy putting a
841		 * conntrack in the hash that uses the helper */
842		if (!try_module_get(helper->me))
843			helper = NULL;
844	}
845
846	read_unlock_bh(&nf_conntrack_lock);
847
848	return helper;
849}
850
851void nf_ct_helper_put(struct nf_conntrack_helper *helper)
852{
853	module_put(helper->me);
854}
855
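/* Allocate a conntrack for the given original/reply tuples.  When the
 * table is at nf_conntrack_max, one unassured entry from the target
 * hash chain is dropped first; if that fails, ERR_PTR(-ENOMEM) is
 * returned.  The slab cache is chosen by the feature bits from the L3
 * protocol plus NF_CT_F_HELP if a helper matches the reply tuple. */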
856static struct nf_conn *
857__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
858		     const struct nf_conntrack_tuple *repl,
859		     const struct nf_conntrack_l3proto *l3proto)
860{
861	struct nf_conn *conntrack = NULL;
862	u_int32_t features = 0;
863	struct nf_conntrack_helper *helper;
864
865	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
866		get_random_bytes(&nf_conntrack_hash_rnd, 4);
867		nf_conntrack_hash_rnd_initted = 1;
868	}
869
870	if (nf_conntrack_max
871	    && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
872		unsigned int hash = hash_conntrack(orig);
873		/* Try dropping from this hash chain. */
874		if (!early_drop(&nf_conntrack_hash[hash])) {
875			if (net_ratelimit())
876				printk(KERN_WARNING
877				       "nf_conntrack: table full, dropping"
878				       " packet.\n");
879			return ERR_PTR(-ENOMEM);
880		}
881	}
882
883	/*  find features needed by this conntrack. */
884	features = l3proto->get_features(orig);
885
886	/* FIXME: protect helper list per RCU */
887	read_lock_bh(&nf_conntrack_lock);
888	helper = __nf_ct_helper_find(repl);
889	if (helper)
890		features |= NF_CT_F_HELP;
891	read_unlock_bh(&nf_conntrack_lock);
892
893	DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
894
895	read_lock_bh(&nf_ct_cache_lock);
896
897	if (unlikely(!nf_ct_cache[features].use)) {
898		DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
899			features);
900		goto out;
901	}
902
903	conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
904	if (conntrack == NULL) {
905		DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
906		goto out;
907	}
908
909	memset(conntrack, 0, nf_ct_cache[features].size);
910	conntrack->features = features;
911	if (helper) {
912		struct nf_conn_help *help = nfct_help(conntrack);
913		NF_CT_ASSERT(help);
914		help->helper = helper;
915	}
916
917	atomic_set(&conntrack->ct_general.use, 1);
918	conntrack->ct_general.destroy = destroy_conntrack;
919	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
920	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
921	/* Don't set timer yet: wait for confirmation */
922	init_timer(&conntrack->timeout);
923	conntrack->timeout.data = (unsigned long)conntrack;
924	conntrack->timeout.function = death_by_timeout;
925
926	atomic_inc(&nf_conntrack_count);
927out:
928	read_unlock_bh(&nf_ct_cache_lock);
929	return conntrack;
930}
931
932struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
933				   const struct nf_conntrack_tuple *repl)
934{
935	struct nf_conntrack_l3proto *l3proto;
936
937	l3proto = __nf_ct_l3proto_find(orig->src.l3num);
938	return __nf_conntrack_alloc(orig, repl, l3proto);
939}
940
941void nf_conntrack_free(struct nf_conn *conntrack)
942{
943	u_int32_t features = conntrack->features;
944	NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
945	DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
946	       conntrack);
947	kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
948	atomic_dec(&nf_conntrack_count);
949}
950
951/* Allocate a new conntrack: we return -ENOMEM if classification
952   failed due to stress.  Otherwise it really is unclassifiable. */
953static struct nf_conntrack_tuple_hash *
954init_conntrack(const struct nf_conntrack_tuple *tuple,
955	       struct nf_conntrack_l3proto *l3proto,
956	       struct nf_conntrack_protocol *protocol,
957	       struct sk_buff *skb,
958	       unsigned int dataoff)
959{
960	struct nf_conn *conntrack;
961	struct nf_conntrack_tuple repl_tuple;
962	struct nf_conntrack_expect *exp;
963
964	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
965		DEBUGP("Can't invert tuple.\n");
966		return NULL;
967	}
968
969	conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
970	if (conntrack == NULL || IS_ERR(conntrack)) {
971		DEBUGP("Can't allocate conntrack.\n");
972		return (struct nf_conntrack_tuple_hash *)conntrack;
973	}
974
975	if (!protocol->new(conntrack, skb, dataoff)) {
976		nf_conntrack_free(conntrack);
977		DEBUGP("init conntrack: can't track with proto module\n");
978		return NULL;
979	}
980
981	write_lock_bh(&nf_conntrack_lock);
982	exp = find_expectation(tuple);
983
984	if (exp) {
985		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
986			conntrack, exp);
987		/* Welcome, Mr. Bond.  We've been expecting you... */
988		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
989		conntrack->master = exp->master;
990#ifdef CONFIG_NF_CONNTRACK_MARK
991		conntrack->mark = exp->master->mark;
992#endif
993#ifdef CONFIG_NF_CONNTRACK_SECMARK
994		conntrack->secmark = exp->master->secmark;
995#endif
996		nf_conntrack_get(&conntrack->master->ct_general);
997		NF_CT_STAT_INC(expect_new);
998	} else
999		NF_CT_STAT_INC(new);
1000
1001	/* Overload tuple linked list to put us in unconfirmed list. */
1002	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1003
1004	write_unlock_bh(&nf_conntrack_lock);
1005
1006	if (exp) {
1007		if (exp->expectfn)
1008			exp->expectfn(conntrack, exp);
1009		nf_conntrack_expect_put(exp);
1010	}
1011
1012	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1013}
1014
1015/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1016static inline struct nf_conn *
1017resolve_normal_ct(struct sk_buff *skb,
1018		  unsigned int dataoff,
1019		  u_int16_t l3num,
1020		  u_int8_t protonum,
1021		  struct nf_conntrack_l3proto *l3proto,
1022		  struct nf_conntrack_protocol *proto,
1023		  int *set_reply,
1024		  enum ip_conntrack_info *ctinfo)
1025{
1026	struct nf_conntrack_tuple tuple;
1027	struct nf_conntrack_tuple_hash *h;
1028	struct nf_conn *ct;
1029
1030	if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1031			     dataoff, l3num, protonum, &tuple, l3proto,
1032			     proto)) {
1033		DEBUGP("resolve_normal_ct: Can't get tuple\n");
1034		return NULL;
1035	}
1036
1037	/* look for tuple match */
1038	h = nf_conntrack_find_get(&tuple, NULL);
1039	if (!h) {
1040		h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1041		if (!h)
1042			return NULL;
1043		if (IS_ERR(h))
1044			return (void *)h;
1045	}
1046	ct = nf_ct_tuplehash_to_ctrack(h);
1047
1048	/* It exists; we have (non-exclusive) reference. */
1049	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1050		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1051		/* Please set the reply bit if this packet is OK */
1052		*set_reply = 1;
1053	} else {
1054		/* Once we've had two way comms, always ESTABLISHED. */
1055		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1056			DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1057			*ctinfo = IP_CT_ESTABLISHED;
1058		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1059			DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1060			*ctinfo = IP_CT_RELATED;
1061		} else {
1062			DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1063			*ctinfo = IP_CT_NEW;
1064		}
1065		*set_reply = 0;
1066	}
1067	skb->nfct = &ct->ct_general;
1068	skb->nfctinfo = *ctinfo;
1069	return ct;
1070}
1071
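/* Main conntrack hook: already-tracked packets are ignored, otherwise
 * the L3 protocol prepares the packet, the L4 error check runs, the
 * conntrack is looked up or created via resolve_normal_ct() and the L4
 * protocol updates its state.  Non-positive return codes from the
 * protocol handlers are negated into netfilter verdicts. */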
1072unsigned int
1073nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1074{
1075	struct nf_conn *ct;
1076	enum ip_conntrack_info ctinfo;
1077	struct nf_conntrack_l3proto *l3proto;
1078	struct nf_conntrack_protocol *proto;
1079	unsigned int dataoff;
1080	u_int8_t protonum;
1081	int set_reply = 0;
1082	int ret;
1083
1084	/* Previously seen (loopback or untracked)?  Ignore. */
1085	if ((*pskb)->nfct) {
1086		NF_CT_STAT_INC(ignore);
1087		return NF_ACCEPT;
1088	}
1089
1090	l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1091	if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1092		DEBUGP("not prepared to track yet or error occurred\n");
1093		return -ret;
1094	}
1095
1096	proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1097
1098	/* It may be a special packet, error, unclean...
1099	 * the inverse of the return code tells the netfilter
1100	 * core what to do with the packet. */
1101	if (proto->error != NULL &&
1102	    (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1103		NF_CT_STAT_INC(error);
1104		NF_CT_STAT_INC(invalid);
1105		return -ret;
1106	}
1107
1108	ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1109			       &set_reply, &ctinfo);
1110	if (!ct) {
1111		/* Not valid part of a connection */
1112		NF_CT_STAT_INC(invalid);
1113		return NF_ACCEPT;
1114	}
1115
1116	if (IS_ERR(ct)) {
1117		/* Too stressed to deal. */
1118		NF_CT_STAT_INC(drop);
1119		return NF_DROP;
1120	}
1121
1122	NF_CT_ASSERT((*pskb)->nfct);
1123
1124	ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1125	if (ret < 0) {
1126		/* Invalid: inverse of the return code tells
1127		 * the netfilter core what to do */
1128		DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1129		nf_conntrack_put((*pskb)->nfct);
1130		(*pskb)->nfct = NULL;
1131		NF_CT_STAT_INC(invalid);
1132		return -ret;
1133	}
1134
1135	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1136		nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1137
1138	return ret;
1139}
1140
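/* Convenience wrapper around nf_ct_invert_tuple() that looks up the
 * L3/L4 protocol handlers from the original tuple itself. */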
1141int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1142			 const struct nf_conntrack_tuple *orig)
1143{
1144	return nf_ct_invert_tuple(inverse, orig,
1145				  __nf_ct_l3proto_find(orig->src.l3num),
1146				  __nf_ct_proto_find(orig->src.l3num,
1147						     orig->dst.protonum));
1148}
1149
1150/* Would two expected things clash? */
1151static inline int expect_clash(const struct nf_conntrack_expect *a,
1152			       const struct nf_conntrack_expect *b)
1153{
1154	/* Part covered by intersection of masks must be unequal,
1155	   otherwise they clash */
1156	struct nf_conntrack_tuple intersect_mask;
1157	int count;
1158
1159	intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1160	intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1161	intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1162	intersect_mask.dst.protonum = a->mask.dst.protonum
1163					& b->mask.dst.protonum;
1164
1165	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1166		intersect_mask.src.u3.all[count] =
1167			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1168	}
1169
1170	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1171		intersect_mask.dst.u3.all[count] =
1172			a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1173	}
1174
1175	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1176}
1177
1178static inline int expect_matches(const struct nf_conntrack_expect *a,
1179				 const struct nf_conntrack_expect *b)
1180{
1181	return a->master == b->master
1182		&& nf_ct_tuple_equal(&a->tuple, &b->tuple)
1183		&& nf_ct_tuple_equal(&a->mask, &b->mask);
1184}
1185
1186/* Generally a bad idea to call this: could have matched already. */
1187void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1188{
1189	struct nf_conntrack_expect *i;
1190
1191	write_lock_bh(&nf_conntrack_lock);
1192	/* choose the oldest expectation to evict */
1193	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1194		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1195			nf_ct_unlink_expect(i);
1196			write_unlock_bh(&nf_conntrack_lock);
1197			nf_conntrack_expect_put(i);
1198			return;
1199		}
1200	}
1201	write_unlock_bh(&nf_conntrack_lock);
1202}
1203
1204/* We don't increase the master conntrack refcount for non-fulfilled
1205 * conntracks. During the conntrack destruction, the expectations are
1206 * always killed before the conntrack itself */
1207struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1208{
1209	struct nf_conntrack_expect *new;
1210
1211	new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1212	if (!new) {
1213		DEBUGP("expect_related: OOM allocating expect\n");
1214		return NULL;
1215	}
1216	new->master = me;
1217	atomic_set(&new->use, 1);
1218	return new;
1219}
1220
1221void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1222{
1223	if (atomic_dec_and_test(&exp->use))
1224		kmem_cache_free(nf_conntrack_expect_cachep, exp);
1225}
1226
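/* Link an expectation into the global list and start its timeout
 * (taken from the master's helper).  Called with nf_conntrack_lock
 * held for writing; takes references for the list and the timer. */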
1227static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1228{
1229	struct nf_conn_help *master_help = nfct_help(exp->master);
1230
1231	atomic_inc(&exp->use);
1232	master_help->expecting++;
1233	list_add(&exp->list, &nf_conntrack_expect_list);
1234
1235	init_timer(&exp->timeout);
1236	exp->timeout.data = (unsigned long)exp;
1237	exp->timeout.function = expectation_timed_out;
1238	exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1239	add_timer(&exp->timeout);
1240
1241	exp->id = ++nf_conntrack_expect_next_id;
1242	atomic_inc(&exp->use);
1243	NF_CT_STAT_INC(expect_create);
1244}
1245
1246/* Race with expectations being used means we could have none to find; OK. */
1247static void evict_oldest_expect(struct nf_conn *master)
1248{
1249	struct nf_conntrack_expect *i;
1250
1251	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1252		if (i->master == master) {
1253			if (del_timer(&i->timeout)) {
1254				nf_ct_unlink_expect(i);
1255				nf_conntrack_expect_put(i);
1256			}
1257			break;
1258		}
1259	}
1260}
1261
1262static inline int refresh_timer(struct nf_conntrack_expect *i)
1263{
1264	struct nf_conn_help *master_help = nfct_help(i->master);
1265
1266	if (!del_timer(&i->timeout))
1267		return 0;
1268
1269	i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1270	add_timer(&i->timeout);
1271	return 1;
1272}
1273
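/* Register an expectation on behalf of a helper: an identical pending
 * expectation just gets its timer refreshed, a clashing one makes this
 * fail with -EBUSY, and when the helper's max_expected limit is hit
 * the master's oldest expectation is evicted first. */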
1274int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1275{
1276	struct nf_conntrack_expect *i;
1277	struct nf_conn *master = expect->master;
1278	struct nf_conn_help *master_help = nfct_help(master);
1279	int ret;
1280
1281	NF_CT_ASSERT(master_help);
1282
1283	DEBUGP("nf_conntrack_expect_related %p\n", expect);
1284	DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1285	DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1286
1287	write_lock_bh(&nf_conntrack_lock);
1288	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1289		if (expect_matches(i, expect)) {
1290			/* Refresh timer: if it's dying, ignore.. */
1291			if (refresh_timer(i)) {
1292				ret = 0;
1293				goto out;
1294			}
1295		} else if (expect_clash(i, expect)) {
1296			ret = -EBUSY;
1297			goto out;
1298		}
1299	}
1300	/* Will be over limit? */
1301	if (master_help->helper->max_expected &&
1302	    master_help->expecting >= master_help->helper->max_expected)
1303		evict_oldest_expect(master);
1304
1305	nf_conntrack_expect_insert(expect);
1306	nf_conntrack_expect_event(IPEXP_NEW, expect);
1307	ret = 0;
1308out:
1309	write_unlock_bh(&nf_conntrack_lock);
1310	return ret;
1311}
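/* Rough usage sketch (illustrative only, not code from this file): a
 * conntrack helper that has parsed a control connection would
 * typically register an expectation roughly like this; the tuple/mask
 * values and the ret variable are placeholders the real helper fills
 * in from the parsed payload:
 *
 *	struct nf_conntrack_expect *exp;
 *
 *	exp = nf_conntrack_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	// fill exp->tuple and exp->mask for the expected data connection
 *	exp->expectfn = NULL;
 *	exp->flags = 0;
 *	if (nf_conntrack_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	nf_conntrack_expect_put(exp);
 */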
1312
1313int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1314{
1315	int ret;
1316	BUG_ON(me->timeout == 0);
1317
1318	ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1319					  sizeof(struct nf_conn)
1320					  + sizeof(struct nf_conn_help)
1321					  + __alignof__(struct nf_conn_help));
1322	if (ret < 0) {
1323		printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1324		return ret;
1325	}
1326	write_lock_bh(&nf_conntrack_lock);
1327	list_prepend(&helpers, me);
1328	write_unlock_bh(&nf_conntrack_lock);
1329
1330	return 0;
1331}
1332
1333struct nf_conntrack_helper *
1334__nf_conntrack_helper_find_byname(const char *name)
1335{
1336	struct nf_conntrack_helper *h;
1337
1338	list_for_each_entry(h, &helpers, list) {
1339		if (!strcmp(h->name, name))
1340			return h;
1341	}
1342
1343	return NULL;
1344}
1345
1346static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1347			 const struct nf_conntrack_helper *me)
1348{
1349	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1350	struct nf_conn_help *help = nfct_help(ct);
1351
1352	if (help && help->helper == me) {
1353		nf_conntrack_event(IPCT_HELPER, ct);
1354		help->helper = NULL;
1355	}
1356	return 0;
1357}
1358
1359void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1360{
1361	unsigned int i;
1362	struct nf_conntrack_expect *exp, *tmp;
1363
1364	/* Need write lock here, to delete helper. */
1365	write_lock_bh(&nf_conntrack_lock);
1366	LIST_DELETE(&helpers, me);
1367
1368	/* Get rid of expectations */
1369	list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1370		struct nf_conn_help *help = nfct_help(exp->master);
1371		if (help->helper == me && del_timer(&exp->timeout)) {
1372			nf_ct_unlink_expect(exp);
1373			nf_conntrack_expect_put(exp);
1374		}
1375	}
1376
1377	/* Get rid of expecteds, set helpers to NULL. */
1378	LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1379	for (i = 0; i < nf_conntrack_htable_size; i++)
1380		LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1381			    struct nf_conntrack_tuple_hash *, me);
1382	write_unlock_bh(&nf_conntrack_lock);
1383
1384	/* Someone could be still looking at the helper in a bh. */
1385	synchronize_net();
1386}
1387
1388/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1389void __nf_ct_refresh_acct(struct nf_conn *ct,
1390			  enum ip_conntrack_info ctinfo,
1391			  const struct sk_buff *skb,
1392			  unsigned long extra_jiffies,
1393			  int do_acct)
1394{
1395	int event = 0;
1396
1397	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1398	NF_CT_ASSERT(skb);
1399
1400	write_lock_bh(&nf_conntrack_lock);
1401
1402	/* Only update if this is not a fixed timeout */
1403	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1404		write_unlock_bh(&nf_conntrack_lock);
1405		return;
1406	}
1407
1408	/* If not in hash table, timer will not be active yet */
1409	if (!nf_ct_is_confirmed(ct)) {
1410		ct->timeout.expires = extra_jiffies;
1411		event = IPCT_REFRESH;
1412	} else {
1413		/* Need del_timer for race avoidance (may already be dying). */
1414		if (del_timer(&ct->timeout)) {
1415			ct->timeout.expires = jiffies + extra_jiffies;
1416			add_timer(&ct->timeout);
1417			event = IPCT_REFRESH;
1418		}
1419	}
1420
1421#ifdef CONFIG_NF_CT_ACCT
1422	if (do_acct) {
1423		ct->counters[CTINFO2DIR(ctinfo)].packets++;
1424		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1425			skb->len - (unsigned int)(skb->nh.raw - skb->data);
1426		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1427		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1428			event |= IPCT_COUNTER_FILLING;
1429	}
1430#endif
1431
1432	write_unlock_bh(&nf_conntrack_lock);
1433
1434	/* must be unlocked when calling event cache */
1435	if (event)
1436		nf_conntrack_event_cache(event, skb);
1437}
1438
1439#if defined(CONFIG_NF_CT_NETLINK) || \
1440    defined(CONFIG_NF_CT_NETLINK_MODULE)
1441
1442#include <linux/netfilter/nfnetlink.h>
1443#include <linux/netfilter/nfnetlink_conntrack.h>
1444#include <linux/mutex.h>
1445
1446
1447/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1448 * in nf_conntrack_core, since we don't want the protocols to autoload
1449 * or depend on ctnetlink. */
1450int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1451			       const struct nf_conntrack_tuple *tuple)
1452{
1453	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1454		&tuple->src.u.tcp.port);
1455	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1456		&tuple->dst.u.tcp.port);
1457	return 0;
1458
1459nfattr_failure:
1460	return -1;
1461}
1462
1463static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1464	[CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1465	[CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1466};
1467
1468int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1469			       struct nf_conntrack_tuple *t)
1470{
1471	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1472		return -EINVAL;
1473
1474	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1475		return -EINVAL;
1476
1477	t->src.u.tcp.port =
1478		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1479	t->dst.u.tcp.port =
1480		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1481
1482	return 0;
1483}
1484#endif
1485
1486/* Used by ipt_REJECT and ip6t_REJECT. */
1487void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1488{
1489	struct nf_conn *ct;
1490	enum ip_conntrack_info ctinfo;
1491
1492	/* This ICMP is in reverse direction to the packet which caused it */
1493	ct = nf_ct_get(skb, &ctinfo);
1494	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1495		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1496	else
1497		ctinfo = IP_CT_RELATED;
1498
1499	/* Attach to new skbuff, and increment count */
1500	nskb->nfct = &ct->ct_general;
1501	nskb->nfctinfo = ctinfo;
1502	nf_conntrack_get(nskb->nfct);
1503}
1504
1505static inline int
1506do_iter(const struct nf_conntrack_tuple_hash *i,
1507	int (*iter)(struct nf_conn *i, void *data),
1508	void *data)
1509{
1510	return iter(nf_ct_tuplehash_to_ctrack(i), data);
1511}
1512
1513/* Bring out ya dead! */
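/* Scan the hash table starting at *bucket (and finally the unconfirmed
 * list) for the first entry on which iter() returns true; the matching
 * conntrack is returned with its reference count raised, and *bucket
 * remembers where to resume the scan. */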
1514static struct nf_conntrack_tuple_hash *
1515get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1516		void *data, unsigned int *bucket)
1517{
1518	struct nf_conntrack_tuple_hash *h = NULL;
1519
1520	write_lock_bh(&nf_conntrack_lock);
1521	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1522		h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1523				struct nf_conntrack_tuple_hash *, iter, data);
1524		if (h)
1525			break;
1526 	}
1527	if (!h)
1528		h = LIST_FIND_W(&unconfirmed, do_iter,
1529				struct nf_conntrack_tuple_hash *, iter, data);
1530	if (h)
1531		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1532	write_unlock_bh(&nf_conntrack_lock);
1533
1534	return h;
1535}
1536
1537void
1538nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1539{
1540	struct nf_conntrack_tuple_hash *h;
1541	unsigned int bucket = 0;
1542
1543	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1544		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1545		/* Time to push up daisies... */
1546		if (del_timer(&ct->timeout))
1547			death_by_timeout((unsigned long)ct);
1548		/* ... else the timer will get him soon. */
1549
1550		nf_ct_put(ct);
1551	}
1552}
1553
1554static int kill_all(struct nf_conn *i, void *data)
1555{
1556	return 1;
1557}
1558
1559static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1560{
1561	if (vmalloced)
1562		vfree(hash);
1563	else
1564		free_pages((unsigned long)hash,
1565			   get_order(sizeof(struct list_head) * size));
1566}
1567
1568void nf_conntrack_flush(void)
1569{
1570	nf_ct_iterate_cleanup(kill_all, NULL);
1571}
1572
1573/* Mishearing the voices in his head, our hero wonders how he's
1574   supposed to kill the mall. */
1575void nf_conntrack_cleanup(void)
1576{
1577	int i;
1578
1579	ip_ct_attach = NULL;
1580
1581	/* This makes sure all current packets have passed through
1582	   netfilter framework.  Roll on, two-stage module
1583	   delete... */
1584	synchronize_net();
1585
1586	nf_ct_event_cache_flush();
1587 i_see_dead_people:
1588	nf_conntrack_flush();
1589	if (atomic_read(&nf_conntrack_count) != 0) {
1590		schedule();
1591		goto i_see_dead_people;
1592	}
1593	/* wait until all references to nf_conntrack_untracked are dropped */
1594	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1595		schedule();
1596
1597	for (i = 0; i < NF_CT_F_NUM; i++) {
1598		if (nf_ct_cache[i].use == 0)
1599			continue;
1600
1601		NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1602		nf_ct_cache[i].use = 1;
1603		nf_conntrack_unregister_cache(i);
1604	}
1605	kmem_cache_destroy(nf_conntrack_expect_cachep);
1606	free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1607			    nf_conntrack_htable_size);
1608
1609	/* free l3proto protocol tables */
1610	for (i = 0; i < PF_MAX; i++)
1611		if (nf_ct_protos[i]) {
1612			kfree(nf_ct_protos[i]);
1613			nf_ct_protos[i] = NULL;
1614		}
1615}
1616
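/* Allocate the hash table: try physically contiguous pages first and
 * fall back to vmalloc, recording in *vmalloced which allocator was
 * used so free_conntrack_hash() can release it the same way. */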
1617static struct list_head *alloc_hashtable(int size, int *vmalloced)
1618{
1619	struct list_head *hash;
1620	unsigned int i;
1621
1622	*vmalloced = 0;
1623	hash = (void*)__get_free_pages(GFP_KERNEL,
1624				       get_order(sizeof(struct list_head)
1625						 * size));
1626	if (!hash) {
1627		*vmalloced = 1;
1628		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1629		hash = vmalloc(sizeof(struct list_head) * size);
1630	}
1631
1632	if (hash)
1633		for (i = 0; i < size; i++)
1634			INIT_LIST_HEAD(&hash[i]);
1635
1636	return hash;
1637}
1638
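/* Module parameter handler for "hashsize": at boot it just stores the
 * value; at runtime it allocates a new table, rehashes every entry
 * into it under the write lock using a fresh random seed, publishes
 * the new table parameters and frees the old table. */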
1639int set_hashsize(const char *val, struct kernel_param *kp)
1640{
1641	int i, bucket, hashsize, vmalloced;
1642	int old_vmalloced, old_size;
1643	int rnd;
1644	struct list_head *hash, *old_hash;
1645	struct nf_conntrack_tuple_hash *h;
1646
1647	/* On boot, we can set this without any fancy locking. */
1648	if (!nf_conntrack_htable_size)
1649		return param_set_uint(val, kp);
1650
1651	hashsize = simple_strtol(val, NULL, 0);
1652	if (!hashsize)
1653		return -EINVAL;
1654
1655	hash = alloc_hashtable(hashsize, &vmalloced);
1656	if (!hash)
1657		return -ENOMEM;
1658
1659	/* We have to rehash for the new table anyway, so we can also
1660	 * use a new random seed */
1661	get_random_bytes(&rnd, 4);
1662
1663	write_lock_bh(&nf_conntrack_lock);
1664	for (i = 0; i < nf_conntrack_htable_size; i++) {
1665		while (!list_empty(&nf_conntrack_hash[i])) {
1666			h = list_entry(nf_conntrack_hash[i].next,
1667				       struct nf_conntrack_tuple_hash, list);
1668			list_del(&h->list);
1669			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1670			list_add_tail(&h->list, &hash[bucket]);
1671		}
1672	}
1673	old_size = nf_conntrack_htable_size;
1674	old_vmalloced = nf_conntrack_vmalloc;
1675	old_hash = nf_conntrack_hash;
1676
1677	nf_conntrack_htable_size = hashsize;
1678	nf_conntrack_vmalloc = vmalloced;
1679	nf_conntrack_hash = hash;
1680	nf_conntrack_hash_rnd = rnd;
1681	write_unlock_bh(&nf_conntrack_lock);
1682
1683	free_conntrack_hash(old_hash, old_vmalloced, old_size);
1684	return 0;
1685}
1686
1687module_param_call(hashsize, set_hashsize, param_get_uint,
1688		  &nf_conntrack_htable_size, 0600);
1689
1690int __init nf_conntrack_init(void)
1691{
1692	unsigned int i;
1693	int ret;
1694
1695	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1696	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1697	if (!nf_conntrack_htable_size) {
1698		nf_conntrack_htable_size
1699			= (((num_physpages << PAGE_SHIFT) / 16384)
1700			   / sizeof(struct list_head));
1701		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1702			nf_conntrack_htable_size = 8192;
1703		if (nf_conntrack_htable_size < 16)
1704			nf_conntrack_htable_size = 16;
1705	}
1706	nf_conntrack_max = 8 * nf_conntrack_htable_size;
1707
1708	printk("nf_conntrack version %s (%u buckets, %d max)\n",
1709	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1710	       nf_conntrack_max);
1711
1712	nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1713					    &nf_conntrack_vmalloc);
1714	if (!nf_conntrack_hash) {
1715		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1716		goto err_out;
1717	}
1718
1719	ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1720					  sizeof(struct nf_conn));
1721	if (ret < 0) {
1722		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1723		goto err_free_hash;
1724	}
1725
1726	nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1727					sizeof(struct nf_conntrack_expect),
1728					0, 0, NULL, NULL);
1729	if (!nf_conntrack_expect_cachep) {
1730		printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1731		goto err_free_conntrack_slab;
1732	}
1733
1734	/* Don't NEED lock here, but good form anyway. */
1735	write_lock_bh(&nf_conntrack_lock);
1736        for (i = 0; i < PF_MAX; i++)
1737		nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1738        write_unlock_bh(&nf_conntrack_lock);
1739
1740	/* For use by REJECT target */
1741	ip_ct_attach = __nf_conntrack_attach;
1742
1743	/* Set up fake conntrack:
1744	    - to never be deleted, not in any hashes */
1745	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1746	/*  - and make it look like a confirmed connection */
1747	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1748
1749	return ret;
1750
1751err_free_conntrack_slab:
1752	nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1753err_free_hash:
1754	free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1755			    nf_conntrack_htable_size);
1756err_out:
1757	return -ENOMEM;
1758}
1759