nf_conntrack_core.c revision f205c5e0c28aa7e0fb6eaaa66e97928f9d9e6994
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>

#define NF_CONNTRACK_VERSION    "0.5.0"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

/* nf_conntrack_standalone needs this */
atomic_t nf_conntrack_count = ATOMIC_INIT(0);
EXPORT_SYMBOL_GPL(nf_conntrack_count);

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);

struct hlist_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);

unsigned int nf_ct_log_invalid __read_mostly;
HLIST_HEAD(unconfirmed);
static int nf_conntrack_vmalloc __read_mostly;
static struct kmem_cache *nf_conntrack_cachep __read_mostly;
static unsigned int nf_conntrack_next_id;

DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);

static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
                                  unsigned int size, unsigned int rnd)
{
        unsigned int a, b;

        a = jhash2(tuple->src.u3.all, ARRAY_SIZE(tuple->src.u3.all),
                   (tuple->src.l3num << 16) | tuple->dst.protonum);
        b = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
                   (tuple->src.u.all << 16) | tuple->dst.u.all);

        return jhash_2words(a, b, rnd) % size;
}

static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
{
        return __hash_conntrack(tuple, nf_conntrack_htable_size,
                                nf_conntrack_hash_rnd);
}

int
nf_ct_get_tuple(const struct sk_buff *skb,
                unsigned int nhoff,
                unsigned int dataoff,
                u_int16_t l3num,
                u_int8_t protonum,
                struct nf_conntrack_tuple *tuple,
                const struct nf_conntrack_l3proto *l3proto,
                const struct nf_conntrack_l4proto *l4proto)
{
        NF_CT_TUPLE_U_BLANK(tuple);

        tuple->src.l3num = l3num;
        if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
                return 0;

        tuple->dst.protonum = protonum;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
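/*
 * Example (illustrative sketch, not compiled here): a caller such as
 * resolve_normal_ct() below extracts a tuple roughly like this, where
 * "l3proto" and "l4proto" stand for pointers previously obtained via
 * __nf_ct_l3proto_find() / __nf_ct_l4proto_find():
 *
 *      struct nf_conntrack_tuple tuple;
 *
 *      if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff,
 *                           l3num, protonum, &tuple, l3proto, l4proto))
 *              return NULL;    (no valid tuple; treat packet as untrackable)
 */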
int
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
                   const struct nf_conntrack_tuple *orig,
                   const struct nf_conntrack_l3proto *l3proto,
                   const struct nf_conntrack_l4proto *l4proto)
{
        NF_CT_TUPLE_U_BLANK(inverse);

        inverse->src.l3num = orig->src.l3num;
        if (l3proto->invert_tuple(inverse, orig) == 0)
                return 0;

        inverse->dst.dir = !orig->dst.dir;

        inverse->dst.protonum = orig->dst.protonum;
        return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

static void
clean_from_lists(struct nf_conn *ct)
{
        DEBUGP("clean_from_lists(%p)\n", ct);
        hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
        hlist_del(&ct->tuplehash[IP_CT_DIR_REPLY].hnode);

        /* Destroy all pending expectations */
        nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct nf_conn *ct = (struct nf_conn *)nfct;
        struct nf_conntrack_l4proto *l4proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
        NF_CT_ASSERT(!timer_pending(&ct->timeout));

        nf_conntrack_event(IPCT_DESTROY, ct);
        set_bit(IPS_DYING_BIT, &ct->status);

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to nf_conntrack_lock!!! -HW */
        rcu_read_lock();
        l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
                                       ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (l4proto && l4proto->destroy)
                l4proto->destroy(ct);

        nf_ct_ext_destroy(ct);

        rcu_read_unlock();

        write_lock_bh(&nf_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too. */
        nf_ct_remove_expectations(ct);

        /* We overload first tuple to link into unconfirmed list. */
        if (!nf_ct_is_confirmed(ct)) {
                BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode));
                hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
        }

        NF_CT_STAT_INC(delete);
        write_unlock_bh(&nf_conntrack_lock);

        if (ct->master)
                nf_ct_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        nf_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct nf_conn *ct = (void *)ul_conntrack;
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_helper *helper;

        if (help) {
                rcu_read_lock();
                helper = rcu_dereference(help->helper);
                if (helper && helper->destroy)
                        helper->destroy(ct);
                rcu_read_unlock();
        }

        write_lock_bh(&nf_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        NF_CT_STAT_INC(delete_list);
        clean_from_lists(ct);
        write_unlock_bh(&nf_conntrack_lock);
        nf_ct_put(ct);
}
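/*
 * Example (illustrative sketch): forced teardown elsewhere in this file
 * (early_drop(), nf_ct_iterate_cleanup()) reuses death_by_timeout() by
 * stealing the pending timer; whoever wins del_timer() owns the teardown:
 *
 *      if (del_timer(&ct->timeout))
 *              death_by_timeout((unsigned long)ct);
 *      nf_ct_put(ct);
 */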
struct nf_conntrack_tuple_hash *
__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
                    const struct nf_conn *ignored_conntrack)
{
        struct nf_conntrack_tuple_hash *h;
        struct hlist_node *n;
        unsigned int hash = hash_conntrack(tuple);

        hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) {
                if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
                    nf_ct_tuple_equal(tuple, &h->tuple)) {
                        NF_CT_STAT_INC(found);
                        return h;
                }
                NF_CT_STAT_INC(searched);
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
                      const struct nf_conn *ignored_conntrack)
{
        struct nf_conntrack_tuple_hash *h;

        read_lock_bh(&nf_conntrack_lock);
        h = __nf_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
        read_unlock_bh(&nf_conntrack_lock);

        return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
                                       unsigned int hash,
                                       unsigned int repl_hash)
{
        ct->id = ++nf_conntrack_next_id;
        hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
                       &nf_conntrack_hash[hash]);
        hlist_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnode,
                       &nf_conntrack_hash[repl_hash]);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
        unsigned int hash, repl_hash;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        write_lock_bh(&nf_conntrack_lock);
        __nf_conntrack_hash_insert(ct, hash, repl_hash);
        write_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
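/*
 * Example (illustrative sketch): looking up a connection by tuple.
 * nf_conntrack_find_get() returns the entry with its reference count
 * raised, so a successful lookup must be balanced by nf_ct_put():
 *
 *      struct nf_conntrack_tuple_hash *h;
 *
 *      h = nf_conntrack_find_get(&tuple, NULL);
 *      if (h) {
 *              struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *              (... use ct ...)
 *              nf_ct_put(ct);
 *      }
 */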
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conn_help *help;
        struct hlist_node *n;
        enum ip_conntrack_info ctinfo;

        ct = nf_ct_get(*pskb, &ctinfo);

        /* ipt_REJECT uses nf_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        write_lock_bh(&nf_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      &h->tuple))
                        goto out;
        hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple))
                        goto out;

        /* Remove from unconfirmed list */
        hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);

        __nf_conntrack_hash_insert(ct, hash, repl_hash);
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout.expires += jiffies;
        add_timer(&ct->timeout);
        atomic_inc(&ct->ct_general.use);
        set_bit(IPS_CONFIRMED_BIT, &ct->status);
        NF_CT_STAT_INC(insert);
        write_unlock_bh(&nf_conntrack_lock);
        help = nfct_help(ct);
        if (help && help->helper)
                nf_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_NF_NAT_NEEDED
        if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
            test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
                nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
        nf_conntrack_event_cache(master_ct(ct) ?
                                 IPCT_RELATED : IPCT_NEW, *pskb);
        return NF_ACCEPT;

out:
        NF_CT_STAT_INC(insert_failed);
        write_unlock_bh(&nf_conntrack_lock);
        return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
                         const struct nf_conn *ignored_conntrack)
{
        struct nf_conntrack_tuple_hash *h;

        read_lock_bh(&nf_conntrack_lock);
        h = __nf_conntrack_find(tuple, ignored_conntrack);
        read_unlock_bh(&nf_conntrack_lock);

        return h != NULL;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static int early_drop(struct hlist_head *chain)
{
        /* Use oldest entry, which is roughly LRU */
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct = NULL, *tmp;
        struct hlist_node *n;
        int dropped = 0;

        read_lock_bh(&nf_conntrack_lock);
        hlist_for_each_entry(h, n, chain, hnode) {
                tmp = nf_ct_tuplehash_to_ctrack(h);
                if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
                        ct = tmp;
        }
        if (ct)
                atomic_inc(&ct->ct_general.use);
        read_unlock_bh(&nf_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                NF_CT_STAT_INC_ATOMIC(early_drop);
        }
        nf_ct_put(ct);
        return dropped;
}
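/*
 * Example (illustrative sketch): __nf_conntrack_confirm() is normally not
 * called directly; the nf_conntrack_confirm() inline wrapper (declared in
 * nf_conntrack_core.h) invokes it from the last hook a packet traverses,
 * roughly:
 *
 *      (in a hookfn at NF_IP_POST_ROUTING or NF_IP_LOCAL_IN)
 *      return nf_conntrack_confirm(pskb);
 */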
struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
                                   const struct nf_conntrack_tuple *repl)
{
        struct nf_conn *conntrack = NULL;

        if (unlikely(!nf_conntrack_hash_rnd_initted)) {
                get_random_bytes(&nf_conntrack_hash_rnd, 4);
                nf_conntrack_hash_rnd_initted = 1;
        }

        /* We don't want any race condition at early drop stage */
        atomic_inc(&nf_conntrack_count);

        if (nf_conntrack_max
            && atomic_read(&nf_conntrack_count) > nf_conntrack_max) {
                unsigned int hash = hash_conntrack(orig);
                /* Try dropping from this hash chain. */
                if (!early_drop(&nf_conntrack_hash[hash])) {
                        atomic_dec(&nf_conntrack_count);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "nf_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        conntrack = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC);
        if (conntrack == NULL) {
                DEBUGP("nf_conntrack_alloc: Can't alloc conntrack.\n");
                atomic_dec(&nf_conntrack_count);
                return ERR_PTR(-ENOMEM);
        }

        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        setup_timer(&conntrack->timeout, death_by_timeout,
                    (unsigned long)conntrack);

        return conntrack;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *conntrack)
{
        nf_ct_ext_free(conntrack);
        kmem_cache_free(nf_conntrack_cachep, conntrack);
        atomic_dec(&nf_conntrack_count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
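/*
 * Example (illustrative sketch): nf_conntrack_alloc() signals "table full"
 * with ERR_PTR(-ENOMEM), so callers must check IS_ERR() as well as NULL,
 * exactly as init_conntrack() does below:
 *
 *      ct = nf_conntrack_alloc(&orig, &repl);
 *      if (ct == NULL || IS_ERR(ct))
 *              return (struct nf_conntrack_tuple_hash *)ct;
 *      (on any later failure path: nf_conntrack_free(ct);)
 */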
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(const struct nf_conntrack_tuple *tuple,
               struct nf_conntrack_l3proto *l3proto,
               struct nf_conntrack_l4proto *l4proto,
               struct sk_buff *skb,
               unsigned int dataoff)
{
        struct nf_conn *conntrack;
        struct nf_conn_help *help;
        struct nf_conntrack_tuple repl_tuple;
        struct nf_conntrack_expect *exp;

        if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = nf_conntrack_alloc(tuple, &repl_tuple);
        if (conntrack == NULL || IS_ERR(conntrack)) {
                DEBUGP("Can't allocate conntrack.\n");
                return (struct nf_conntrack_tuple_hash *)conntrack;
        }

        if (!l4proto->new(conntrack, skb, dataoff)) {
                nf_conntrack_free(conntrack);
                DEBUGP("init conntrack: can't track with proto module\n");
                return NULL;
        }

        write_lock_bh(&nf_conntrack_lock);
        exp = find_expectation(tuple);
        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                       conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
                if (exp->helper) {
                        help = nf_ct_ext_add(conntrack, NF_CT_EXT_HELPER,
                                             GFP_ATOMIC);
                        if (help)
                                rcu_assign_pointer(help->helper, exp->helper);
                        else
                                DEBUGP("failed to add helper extension area");
                }

#ifdef CONFIG_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
                conntrack->secmark = exp->master->secmark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                NF_CT_STAT_INC(expect_new);
        } else {
                struct nf_conntrack_helper *helper;

                helper = __nf_ct_helper_find(&repl_tuple);
                if (helper) {
                        help = nf_ct_ext_add(conntrack, NF_CT_EXT_HELPER,
                                             GFP_ATOMIC);
                        if (help)
                                /* not in hash table yet, so not strictly
                                   necessary */
                                rcu_assign_pointer(help->helper, helper);
                        else
                                DEBUGP("failed to add helper extension area");
                }
                NF_CT_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        hlist_add_head(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
                       &unconfirmed);

        write_unlock_bh(&nf_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                nf_conntrack_expect_put(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct sk_buff *skb,
                  unsigned int dataoff,
                  u_int16_t l3num,
                  u_int8_t protonum,
                  struct nf_conntrack_l3proto *l3proto,
                  struct nf_conntrack_l4proto *l4proto,
                  int *set_reply,
                  enum ip_conntrack_info *ctinfo)
{
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;

        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
                             dataoff, l3num, protonum, &tuple, l3proto,
                             l4proto)) {
                DEBUGP("resolve_normal_ct: Can't get tuple\n");
                return NULL;
        }

        /* look for tuple match */
        h = nf_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = nf_ct_tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}
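/*
 * Summary of the ctinfo values set above (a recap of the code, not new
 * behaviour):
 *
 *      reply direction                    IP_CT_ESTABLISHED + IP_CT_IS_REPLY
 *      original direction, reply seen     IP_CT_ESTABLISHED
 *      original direction, expected ct    IP_CT_RELATED
 *      original direction, first packet   IP_CT_NEW
 */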
unsigned int
nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;
        struct nf_conntrack_l3proto *l3proto;
        struct nf_conntrack_l4proto *l4proto;
        unsigned int dataoff;
        u_int8_t protonum;
        int set_reply = 0;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                NF_CT_STAT_INC_ATOMIC(ignore);
                return NF_ACCEPT;
        }

        /* rcu_read_lock()ed by nf_hook_slow */
        l3proto = __nf_ct_l3proto_find((u_int16_t)pf);

        if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
                DEBUGP("not prepared to track yet or error occurred\n");
                return -ret;
        }

        l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum);

        /* It may be a special packet, error, unclean...
         * inverse of the return code tells the netfilter
         * core what to do with the packet. */
        if (l4proto->error != NULL &&
            (ret = l4proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
                NF_CT_STAT_INC_ATOMIC(error);
                NF_CT_STAT_INC_ATOMIC(invalid);
                return -ret;
        }

        ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, l4proto,
                               &set_reply, &ctinfo);
        if (!ct) {
                /* Not valid part of a connection */
                NF_CT_STAT_INC_ATOMIC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                NF_CT_STAT_INC_ATOMIC(drop);
                return NF_DROP;
        }

        NF_CT_ASSERT((*pskb)->nfct);

        ret = l4proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
        if (ret < 0) {
                /* Invalid: inverse of the return code tells
                 * the netfilter core what to do */
                DEBUGP("nf_conntrack_in: Can't track with proto module\n");
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                NF_CT_STAT_INC_ATOMIC(invalid);
                return -ret;
        }

        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                nf_conntrack_event_cache(IPCT_STATUS, *pskb);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
                         const struct nf_conntrack_tuple *orig)
{
        int ret;

        rcu_read_lock();
        ret = nf_ct_invert_tuple(inverse, orig,
                                 __nf_ct_l3proto_find(orig->src.l3num),
                                 __nf_ct_l4proto_find(orig->src.l3num,
                                                      orig->dst.protonum));
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
                              const struct nf_conntrack_tuple *newreply)
{
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_helper *helper;

        write_lock_bh(&nf_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

        DEBUGP("Altering reply tuple of %p to ", ct);
        NF_CT_DUMP_TUPLE(newreply);

        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (ct->master || (help && help->expecting != 0))
                goto out;

        helper = __nf_ct_helper_find(newreply);
        if (helper == NULL) {
                if (help)
                        rcu_assign_pointer(help->helper, NULL);
                goto out;
        }

        if (help == NULL) {
                help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, GFP_ATOMIC);
                if (help == NULL) {
                        DEBUGP("failed to add helper extension area");
                        goto out;
                }
        } else {
                memset(&help->help, 0, sizeof(help->help));
        }

        rcu_assign_pointer(help->helper, helper);
out:
        write_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
                          enum ip_conntrack_info ctinfo,
                          const struct sk_buff *skb,
                          unsigned long extra_jiffies,
                          int do_acct)
{
        int event = 0;

        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
        NF_CT_ASSERT(skb);

        write_lock_bh(&nf_conntrack_lock);

        /* Only update if this is not a fixed timeout */
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                write_unlock_bh(&nf_conntrack_lock);
                return;
        }

        /* If not in hash table, timer will not be active yet */
        if (!nf_ct_is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                event = IPCT_REFRESH;
        } else {
                unsigned long newtime = jiffies + extra_jiffies;

                /* Only update the timeout if the new timeout is at least
                   HZ jiffies from the old timeout.  Need del_timer for race
                   avoidance (may already be dying). */
                if (newtime - ct->timeout.expires >= HZ
                    && del_timer(&ct->timeout)) {
                        ct->timeout.expires = newtime;
                        add_timer(&ct->timeout);
                        event = IPCT_REFRESH;
                }
        }

#ifdef CONFIG_NF_CT_ACCT
        if (do_acct) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                        skb->len - skb_network_offset(skb);

                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
                        event |= IPCT_COUNTER_FILLING;
        }
#endif

        write_unlock_bh(&nf_conntrack_lock);

        /* must be unlocked when calling event cache */
        if (event)
                nf_conntrack_event_cache(event, skb);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
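/*
 * Example (illustrative sketch): protocol trackers do not usually call
 * __nf_ct_refresh_acct() directly; they use the wrappers from
 * nf_conntrack.h, e.g. from a ->packet() handler ("my_timeout" is a
 * hypothetical per-protocol timeout in jiffies):
 *
 *      nf_ct_refresh_acct(ct, ctinfo, skb, my_timeout);
 *              (same as __nf_ct_refresh_acct(..., do_acct = 1))
 */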
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and the like.  This needs to be
 * in nf_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                               const struct nf_conntrack_tuple *tuple)
{
        NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
                &tuple->src.u.tcp.port);
        NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
                &tuple->dst.u.tcp.port);
        return 0;

nfattr_failure:
        return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nfattr);

static const size_t cta_min_proto[CTA_PROTO_MAX] = {
        [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
        [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
};

int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct nf_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
                return -EINVAL;

        t->src.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nfattr_to_tuple);
#endif
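/*
 * Example (illustrative sketch): ctnetlink round-trips the port pair with
 * the two helpers above; the dump side emits the attributes, the parse
 * side validates and reads them back:
 *
 *      if (nf_ct_port_tuple_to_nfattr(skb, tuple) < 0)   (dump)
 *              goto nfattr_failure;
 *
 *      if (nf_ct_port_nfattr_to_tuple(tb, &t) < 0)       (parse)
 *              return -EINVAL;
 */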
/* Used by ipt_REJECT and ip6t_REJECT. */
void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = nf_ct_get(skb, &ctinfo);
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}
EXPORT_SYMBOL_GPL(__nf_conntrack_attach);

static inline int
do_iter(const struct nf_conntrack_tuple_hash *i,
        int (*iter)(struct nf_conn *i, void *data),
        void *data)
{
        return iter(nf_ct_tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
                void *data, unsigned int *bucket)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_node *n;

        write_lock_bh(&nf_conntrack_lock);
        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
                hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) {
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (iter(ct, data))
                                goto found;
                }
        }
        hlist_for_each_entry(h, n, &unconfirmed, hnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (iter(ct, data))
                        set_bit(IPS_DYING_BIT, &ct->status);
        }
        write_unlock_bh(&nf_conntrack_lock);
        return NULL;
found:
        atomic_inc(&ct->ct_general.use);
        write_unlock_bh(&nf_conntrack_lock);
        return ct;
}

void
nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
{
        struct nf_conn *ct;
        unsigned int bucket = 0;

        while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                nf_ct_put(ct);
        }
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);

static int kill_all(struct nf_conn *i, void *data)
{
        return 1;
}

static void free_conntrack_hash(struct hlist_head *hash, int vmalloced,
                                int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct hlist_head) * size));
}

void nf_conntrack_flush(void)
{
        nf_ct_iterate_cleanup(kill_all, NULL);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush);
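/*
 * Example (illustrative sketch): nf_conntrack_flush() above is the simplest
 * user of nf_ct_iterate_cleanup(); a selective variant only has to supply a
 * different predicate ("kill_matching" and "my_data" are hypothetical):
 *
 *      static int kill_matching(struct nf_conn *i, void *data)
 *      {
 *              return (... nonzero if this entry should die ...);
 *      }
 *
 *      nf_ct_iterate_cleanup(kill_matching, my_data);
 */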
/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(void)
{
        rcu_assign_pointer(ip_ct_attach, NULL);

        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        nf_ct_event_cache_flush();
 i_see_dead_people:
        nf_conntrack_flush();
        if (atomic_read(&nf_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to nf_conntrack_untracked are dropped */
        while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
                schedule();

        rcu_assign_pointer(nf_ct_destroy, NULL);

        kmem_cache_destroy(nf_conntrack_cachep);
        kmem_cache_destroy(nf_conntrack_expect_cachep);
        free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
                            nf_conntrack_htable_size);

        nf_conntrack_proto_fini();
        nf_conntrack_helper_fini();
}

static struct hlist_head *alloc_hashtable(int *sizep, int *vmalloced)
{
        struct hlist_head *hash;
        unsigned int size, i;

        *vmalloced = 0;

        size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head));
        hash = (void *)__get_free_pages(GFP_KERNEL,
                                        get_order(sizeof(struct hlist_head)
                                                  * size));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
                hash = vmalloc(sizeof(struct hlist_head) * size);
        }

        if (hash)
                for (i = 0; i < size; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

int set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, hashsize, vmalloced;
        int old_vmalloced, old_size;
        int rnd;
        struct hlist_head *hash, *old_hash;
        struct nf_conntrack_tuple_hash *h;

        /* On boot, we can set this without any fancy locking. */
        if (!nf_conntrack_htable_size)
                return param_set_uint(val, kp);

        hashsize = simple_strtol(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = alloc_hashtable(&hashsize, &vmalloced);
        if (!hash)
                return -ENOMEM;

        /* We have to rehash for the new table anyway, so we can also
         * use a new random seed */
        get_random_bytes(&rnd, 4);

        write_lock_bh(&nf_conntrack_lock);
        for (i = 0; i < nf_conntrack_htable_size; i++) {
                while (!hlist_empty(&nf_conntrack_hash[i])) {
                        h = hlist_entry(nf_conntrack_hash[i].first,
                                        struct nf_conntrack_tuple_hash, hnode);
                        hlist_del(&h->hnode);
                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
                        hlist_add_head(&h->hnode, &hash[bucket]);
                }
        }
        old_size = nf_conntrack_htable_size;
        old_vmalloced = nf_conntrack_vmalloc;
        old_hash = nf_conntrack_hash;

        nf_conntrack_htable_size = hashsize;
        nf_conntrack_vmalloc = vmalloced;
        nf_conntrack_hash = hash;
        nf_conntrack_hash_rnd = rnd;
        write_unlock_bh(&nf_conntrack_lock);

        free_conntrack_hash(old_hash, old_vmalloced, old_size);
        return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
                  &nf_conntrack_htable_size, 0600);
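/*
 * Example (illustrative sketch): because the parameter is registered with
 * mode 0600 above, the table can be resized at runtime; assuming this file
 * is built as the usual nf_conntrack module, that would be via
 *
 *      echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * which ends up in set_hashsize() and rehashes every entry under the lock.
 */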
int __init nf_conntrack_init(void)
{
        int max_factor = 8;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 512 buckets.  >= 1GB machines have 16384 buckets. */
        if (!nf_conntrack_htable_size) {
                nf_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct hlist_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        nf_conntrack_htable_size = 16384;
                if (nf_conntrack_htable_size < 32)
                        nf_conntrack_htable_size = 32;

                /* Use a max. factor of four by default to get the same max as
                 * with the old struct list_heads.  When a table size is given
                 * we use the old value of 8 to avoid reducing the max.
                 * entries. */
                max_factor = 4;
        }
        nf_conntrack_hash = alloc_hashtable(&nf_conntrack_htable_size,
                                            &nf_conntrack_vmalloc);
        if (!nf_conntrack_hash) {
                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
                goto err_out;
        }

        nf_conntrack_max = max_factor * nf_conntrack_htable_size;

        printk("nf_conntrack version %s (%u buckets, %d max)\n",
               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
               nf_conntrack_max);

        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
                                                sizeof(struct nf_conn),
                                                0, 0, NULL, NULL);
        if (!nf_conntrack_cachep) {
                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
                goto err_free_hash;
        }

        nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
                                        sizeof(struct nf_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!nf_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create nf_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        ret = nf_conntrack_proto_init();
        if (ret < 0)
                goto out_free_expect_slab;

        ret = nf_conntrack_helper_init();
        if (ret < 0)
                goto out_fini_proto;

        /* For use by REJECT target */
        rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach);
        rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

        return ret;

out_fini_proto:
        nf_conntrack_proto_fini();
out_free_expect_slab:
        kmem_cache_destroy(nf_conntrack_expect_cachep);
err_free_conntrack_slab:
        kmem_cache_destroy(nf_conntrack_cachep);
err_free_hash:
        free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
                            nf_conntrack_htable_size);
err_out:
        return -ENOMEM;
}