nf_conntrack_core.c revision ab59b19be78aac65cdd599fb5002c9019885e061
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

DEFINE_SPINLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);

struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);

static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	unsigned int n;
	u_int32_t h;

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	h = jhash2((u32 *)tuple, n,
		   rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
			  tuple->dst.protonum));

	return ((u64)h * size) >> 32;
}

static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, nf_conntrack_htable_size,
				nf_conntrack_hash_rnd);
}
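/* Note on bucket selection: jhash2() above covers the source part and the
 * destination address of the tuple (a whole number of 32-bit words), while
 * the destination port and protocol number are folded into the seed.  The
 * 32-bit hash is then scaled into [0, size) with a 64-bit multiply and
 * shift rather than a modulo; e.g. with size == 16384, h == 0x80000000
 * maps to bucket (0x80000000ULL * 16384) >> 32 == 8192.  The random seed
 * nf_conntrack_hash_rnd keeps bucket placement unpredictable to remote
 * senders, which limits deliberately constructed hash collisions.
 */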
bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	rcu_read_lock();
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto && l4proto->destroy)
		l4proto->destroy(ct);

	rcu_read_unlock();

	spin_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	}

	NF_CT_STAT_INC(net, delete);
	spin_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_helper_destroy(ct);
	spin_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);
	clean_from_lists(ct);
	spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);

static void death_by_event(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;
	struct net *net = nf_ct_net(ct);

	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		/* bad luck, let's retry again */
		ct->timeout.expires = jiffies +
			(random32() % net->ct.sysctl_events_retry_timeout);
		add_timer(&ct->timeout);
		return;
	}
	/* we've got the event delivered, now it's dying */
	set_bit(IPS_DYING_BIT, &ct->status);
	spin_lock(&nf_conntrack_lock);
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&nf_conntrack_lock);
	nf_ct_put(ct);
}

void nf_ct_insert_dying_list(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* add this conntrack to the dying list */
	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	spin_unlock_bh(&nf_conntrack_lock);
	/* set a new timer to retry event delivery */
	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
	ct->timeout.expires = jiffies +
		(random32() % net->ct.sysctl_events_retry_timeout);
	add_timer(&ct->timeout);
}
EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;

	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
		/* destroy event was not delivered */
		nf_ct_delete_from_lists(ct);
		nf_ct_insert_dying_list(ct);
		return;
	}
	set_bit(IPS_DYING_BIT, &ct->status);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
}
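/* Lookup notes: conntrack objects live in a SLAB_DESTROY_BY_RCU cache, so
 * a freed entry can be recycled for a different connection while an RCU
 * reader is still walking the chain.  That is why nf_conntrack_find_get()
 * below takes a reference with atomic_inc_not_zero() and then re-checks
 * the tuple, and why a chain walk that ends on an unexpected nulls marker
 * (an entry was moved to another bucket under us) is simply restarted.
 */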
/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 * OR
 * - Caller must lock nf_conntrack_lock before calling this function
 */
struct nf_conntrack_tuple_hash *
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(tuple);

	/* Disable BHs the entire time since we normally need to disable them
	 * at least once for the stats anyway.
	 */
	local_bh_disable();
begin:
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			local_bh_enable();
			return h;
		}
		NF_CT_STAT_INC(net, searched);
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != hash)
		goto begin;
	local_bh_enable();

	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = __nf_conntrack_find(net, tuple);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	struct net *net = nf_ct_net(ct);

	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &net->ct.hash[repl_hash]);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
	unsigned int hash, repl_hash;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	__nf_conntrack_hash_insert(ct, hash, repl_hash);
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
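/* Confirmation is the second half of the two-phase insert: a conntrack
 * created by init_conntrack() sits only on the per-netns unconfirmed list
 * until the first packet has traversed all hooks.  __nf_conntrack_confirm()
 * then moves it into the hash proper, starts the timeout timer and sets
 * IPS_CONFIRMED, dropping the packet if a racing entry for the same tuple
 * already made it into the table.
 */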
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int hash, repl_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);

	spin_lock_bh(&nf_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      &h->tuple))
			goto out;
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				      &h->tuple))
			goto out;

	/* Remove from unconfirmed list */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	set_bit(IPS_CONFIRMED_BIT, &ct->status);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(tuple);

	/* Disable BHs the entire time since we need to disable them at
	 * least once for the stats anyway.
	 */
	rcu_read_lock_bh();
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
		    nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			rcu_read_unlock_bh();
			return 1;
		}
		NF_CT_STAT_INC(net, searched);
	}
	rcu_read_unlock_bh();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static noinline int early_drop(struct net *net, unsigned int hash)
{
	/* Use oldest entry, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL, *tmp;
	struct hlist_nulls_node *n;
	unsigned int i, cnt = 0;
	int dropped = 0;

	rcu_read_lock();
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
					       hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
				ct = tmp;
			cnt++;
		}

		if (ct != NULL) {
			if (likely(!nf_ct_is_dying(ct) &&
				   atomic_inc_not_zero(&ct->ct_general.use)))
				break;
			else
				ct = NULL;
		}

		if (cnt >= NF_CT_EVICTION_RANGE)
			break;

		hash = (hash + 1) % nf_conntrack_htable_size;
	}
	rcu_read_unlock();

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC_ATOMIC(net, early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}
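/* Allocation keeps net->ct.count as the authoritative table size: the
 * counter is bumped before the limit check so concurrent allocators cannot
 * all squeeze past nf_conntrack_max, and early_drop() is given one chance
 * to evict a non-ASSURED entry near the would-be bucket before the packet
 * is dropped.  Because the cache is SLAB_DESTROY_BY_RCU, nf_conntrack_free()
 * returns the object to the slab immediately (no call_rcu), and the
 * initialization below is careful to set the refcount last.
 */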
struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	struct nf_conn *ct;

	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
		get_random_bytes(&nf_conntrack_hash_rnd,
				 sizeof(nf_conntrack_hash_rnd));
		nf_conntrack_hash_rnd_initted = 1;
	}

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		unsigned int hash = hash_conntrack(orig);
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_DESTROY_BY_RCU.
	 */
	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
	if (ct == NULL) {
		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
		atomic_dec(&net->ct.count);
		return ERR_PTR(-ENOMEM);
	}
	/*
	 * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
	 */
	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
	/* Don't set timer yet: wait for confirmation */
	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
#ifdef CONFIG_NET_NS
	ct->ct_net = net;
#endif

	/*
	 * changes to lookup keys must be done before setting refcnt to 1
	 */
	smp_wmb();
	atomic_set(&ct->ct_general.use, 1);
	return ct;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_ext_destroy(ct);
	atomic_dec(&net->ct.count);
	nf_ct_ext_free(ct);
	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net,
	       const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_l4proto *l4proto,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
	if (IS_ERR(ct)) {
		pr_debug("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)ct;
	}

	if (!l4proto->new(ct, skb, dataoff)) {
		nf_conntrack_free(ct);
		pr_debug("init conntrack: can't track with proto module\n");
		return NULL;
	}

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);

	spin_lock_bh(&nf_conntrack_lock);
	exp = nf_ct_find_expectation(net, tuple);
	if (exp) {
		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
			 ct, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &ct->status);
		ct->master = exp->master;
		if (exp->helper) {
			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
			if (help)
				rcu_assign_pointer(help->helper, exp->helper);
		}

#ifdef CONFIG_NF_CONNTRACK_MARK
		ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
		ct->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&ct->master->ct_general);
		NF_CT_STAT_INC(net, expect_new);
	} else {
		__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
		NF_CT_STAT_INC(net, new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.unconfirmed);

	spin_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
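/* resolve_normal_ct() maps the packet to a conntrack and a ctinfo value:
 * packets in the reply direction get IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 * originals get IP_CT_ESTABLISHED, IP_CT_RELATED or IP_CT_NEW depending on
 * whether a reply has been seen or the entry was expected.  *set_reply
 * tells nf_conntrack_in() to set IPS_SEEN_REPLY on the first reply packet.
 */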
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_l4proto *l4proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     l4proto)) {
		pr_debug("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(net, &tuple);
	if (!h) {
		h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: related packet for %p\n",
				 ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
		struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if (skb->nfct) {
		NF_CT_STAT_INC_ATOMIC(net, ignore);
		return NF_ACCEPT;
	}

	/* rcu_read_lock()ed by nf_hook_slow */
	l3proto = __nf_ct_l3proto_find(pf);
	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
				   &dataoff, &protonum);
	if (ret <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(net, error);
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return -ret;
	}

	l4proto = __nf_ct_l4proto_find(pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells to the netfilter
	 * core what to do with the packet. */
	if (l4proto->error != NULL) {
		ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
		if (ret <= 0) {
			NF_CT_STAT_INC_ATOMIC(net, error);
			NF_CT_STAT_INC_ATOMIC(net, invalid);
			return -ret;
		}
	}

	ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
			       l3proto, l4proto, &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	NF_CT_ASSERT(skb->nfct);

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(skb->nfct);
		skb->nfct = NULL;
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(net, drop);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, ct);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool ret;

	rcu_read_lock();
	ret = nf_ct_invert_tuple(inverse, orig,
				 __nf_ct_l3proto_find(orig->src.l3num),
				 __nf_ct_l4proto_find(orig->src.l3num,
						      orig->dst.protonum));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
	} else {
		unsigned long newtime = jiffies + extra_jiffies;

		/* Only update the timeout if the new timeout is at least
		   HZ jiffies from the old timeout. Need del_timer for race
		   avoidance (may already be dying). */
		if (newtime - ct->timeout.expires >= HZ)
			mod_timer_pending(&ct->timeout, newtime);
	}

acct:
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool __nf_ct_kill_acct(struct nf_conn *ct,
		       enum ip_conntrack_info ctinfo,
		       const struct sk_buff *skb,
		       int do_acct)
{
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}

	if (del_timer(&ct->timeout)) {
		ct->timeout.function((unsigned long)ct);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

int nf_ct_port_nlattr_tuple_size(void)
{
	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				goto found;
		}
	}
	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (iter(ct, data))
			set_bit(IPS_DYING_BIT, &ct->status);
	}
	spin_unlock_bh(&nf_conntrack_lock);
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock_bh(&nf_conntrack_lock);
	return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
			   int (*iter)(struct nf_conn *i, void *data),
			   void *data)
{
	struct nf_conn *ct;
	unsigned int bucket = 0;

	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
struct __nf_ct_flush_report {
	u32 pid;
	int report;
};

static int kill_report(struct nf_conn *i, void *data)
{
	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;

	/* If we fail to deliver the event, death_by_timeout() will retry */
	if (nf_conntrack_event_report(IPCT_DESTROY, i,
				      fr->pid, fr->report) < 0)
		return 1;

	/* Avoid the delivery of the destroy event in death_by_timeout(). */
	set_bit(IPS_DYING_BIT, &i->status);
	return 1;
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
	struct __nf_ct_flush_report fr = {
		.pid	= pid,
		.report	= report,
	};
	nf_ct_iterate_cleanup(net, kill_report, &fr);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);

static void nf_ct_release_dying_list(struct net *net)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		/* never fails to remove them, no listeners at this point */
		nf_ct_kill(ct);
	}
	spin_unlock_bh(&nf_conntrack_lock);
}

static void nf_conntrack_cleanup_init_net(void)
{
	/* wait until all references to nf_conntrack_untracked are dropped */
	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
		schedule();

	nf_conntrack_helper_fini();
	nf_conntrack_proto_fini();
}

static void nf_conntrack_cleanup_net(struct net *net)
{
 i_see_dead_people:
	nf_ct_iterate_cleanup(net, kill_all, NULL);
	nf_ct_release_dying_list(net);
	if (atomic_read(&net->ct.count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     nf_conntrack_htable_size);
	nf_conntrack_ecache_fini(net);
	nf_conntrack_acct_fini(net);
	nf_conntrack_expect_fini(net);
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
	kfree(net->ct.slabname);
	free_percpu(net->ct.stat);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(struct net *net)
{
	if (net_eq(net, &init_net))
		rcu_assign_pointer(ip_ct_attach, NULL);

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_conntrack_cleanup_net(net);

	if (net_eq(net, &init_net)) {
		rcu_assign_pointer(nf_ct_destroy, NULL);
		nf_conntrack_cleanup_init_net();
	}
}
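/* Hash chains are hlist_nulls lists: each bucket is terminated by a nulls
 * marker encoding its own index (see INIT_HLIST_NULLS_HEAD() below), which
 * is what lets __nf_conntrack_find() detect that it drifted into another
 * chain and must restart.  The unconfirmed and dying lists use the
 * out-of-range UNCONFIRMED_NULLS_VAL and DYING_NULLS_VAL markers instead.
 */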
void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	*vmalloced = 0;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
					get_order(sz));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
	}

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, vmalloced, old_vmalloced;
	unsigned int hashsize, old_size;
	int rnd;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtoul(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
	if (!hash)
		return -ENOMEM;

	/* We have to rehash for the new table anyway, so we also can
	 * use a new random seed */
	get_random_bytes(&rnd, sizeof(rnd));

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the lock.
	 */
	spin_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
			h = hlist_nulls_entry(init_net.ct.hash[i].first,
					struct nf_conntrack_tuple_hash, hnnode);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_vmalloced = init_net.ct.hash_vmalloc;
	old_hash = init_net.ct.hash;

	nf_conntrack_htable_size = hashsize;
	init_net.ct.hash_vmalloc = vmalloced;
	init_net.ct.hash = hash;
	nf_conntrack_hash_rnd = rnd;
	spin_unlock_bh(&nf_conntrack_lock);

	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);
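/* Rough sizing example for the auto-computed defaults below (assuming a
 * 64-bit machine, 4 KB pages and 8-byte list heads): 512 MB of RAM gives
 * (512 MB / 16384) / 8 = 4096 buckets and nf_conntrack_max = 4 * 4096 =
 * 16384 entries; anything above 1 GB is capped at 16384 buckets.  Both
 * values can be overridden, via the hashsize module parameter and the
 * nf_conntrack_max sysctl respectively.
 */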
static int nf_conntrack_init_init_net(void)
{
	int max_factor = 8;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((totalram_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
	}
	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	printk("nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	/* Set up fake conntrack: to never be deleted, not in any hashes */
#ifdef CONFIG_NET_NS
	nf_conntrack_untracked.ct_net = &init_net;
#endif
	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
	/* - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

	return 0;

err_helper:
	nf_conntrack_proto_fini();
err_proto:
	return ret;
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

static int nf_conntrack_init_net(struct net *net)
{
	int ret;

	atomic_set(&net->ct.count, 0);
	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat) {
		ret = -ENOMEM;
		goto err_stat;
	}

	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
	if (!net->ct.slabname) {
		ret = -ENOMEM;
		goto err_slabname;
	}

	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
							sizeof(struct nf_conn), 0,
							SLAB_DESTROY_BY_RCU, NULL);
	if (!net->ct.nf_conntrack_cachep) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		ret = -ENOMEM;
		goto err_cache;
	}
	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
					     &net->ct.hash_vmalloc, 1);
	if (!net->ct.hash) {
		ret = -ENOMEM;
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_hash;
	}
	ret = nf_conntrack_expect_init(net);
	if (ret < 0)
		goto err_expect;
	ret = nf_conntrack_acct_init(net);
	if (ret < 0)
		goto err_acct;
	ret = nf_conntrack_ecache_init(net);
	if (ret < 0)
		goto err_ecache;

	return 0;

err_ecache:
	nf_conntrack_acct_fini(net);
err_acct:
	nf_conntrack_expect_fini(net);
err_expect:
	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     nf_conntrack_htable_size);
err_hash:
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
	kfree(net->ct.slabname);
err_slabname:
	free_percpu(net->ct.stat);
err_stat:
	return ret;
}

s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
			enum ip_conntrack_dir dir,
			u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);

int nf_conntrack_init(struct net *net)
{
	int ret;

	if (net_eq(net, &init_net)) {
		ret = nf_conntrack_init_init_net();
		if (ret < 0)
			goto out_init_net;
	}
	ret = nf_conntrack_init_net(net);
	if (ret < 0)
		goto out_net;

	if (net_eq(net, &init_net)) {
		/* For use by REJECT target */
		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

		/* Howto get NAT offsets */
		rcu_assign_pointer(nf_ct_nat_offset, NULL);
	}
	return 0;

out_net:
	if (net_eq(net, &init_net))
		nf_conntrack_cleanup_init_net();
out_init_net:
	return ret;
}