1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25#define KMSG_COMPONENT "IPVS" 26#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 27 28#include <linux/interrupt.h> 29#include <linux/in.h> 30#include <linux/net.h> 31#include <linux/kernel.h> 32#include <linux/module.h> 33#include <linux/vmalloc.h> 34#include <linux/proc_fs.h> /* for proc_net_* */ 35#include <linux/slab.h> 36#include <linux/seq_file.h> 37#include <linux/jhash.h> 38#include <linux/random.h> 39 40#include <net/net_namespace.h> 41#include <net/ip_vs.h> 42 43 44#ifndef CONFIG_IP_VS_TAB_BITS 45#define CONFIG_IP_VS_TAB_BITS 12 46#endif 47 48/* 49 * Connection hash size. Default is what was selected at compile time. 
*/
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");

/* size and mask values, derived from conn_tab_bits in ip_vs_conn_init() */
int ip_vs_conn_tab_size __read_mostly;
static int ip_vs_conn_tab_mask __read_mostly;

/*
 *  Connection hash table: for input and output packets lookups of IPVS
 */
static struct hlist_head *ip_vs_conn_tab __read_mostly;

/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;

/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd __read_mostly;

/*
 *  Fine locking granularity for big connection hash table.
 *  Writers take one lock out of a small array, selected by the low bits
 *  of the bucket index, instead of a single global table lock.
 */
#define CT_LOCKARRAY_BITS  5
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

/* pad each lock to a cache line to avoid false sharing between CPUs */
struct ip_vs_aligned_lock
{
	spinlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

/* Take/release the write-side lock covering hash bucket 'key'.
 * Readers traverse the buckets under RCU only, see the *_rcu list ops.
 */
static inline void ct_write_lock_bh(unsigned int key)
{
	spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned int key)
{
	spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *	Returns hash value for IPVS connection entry.
 *	Mixes netns pointer, protocol, one address and one port with the
 *	boot-time random seed, masked down to the table size.
 */
static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
				       const union nf_inet_addr *addr,
				       __be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		/* hash the full 128-bit address first, then fold in port/proto */
		return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
				    (__force u32)port, proto, ip_vs_conn_rnd) ^
			((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
	return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
			    ip_vs_conn_rnd) ^
		((size_t)net>>8)) & ip_vs_conn_tab_mask;
}

/* Hash a connection parameter block. If a persistence engine supplied
 * raw data, the PE's own hash is used; otherwise hash by client side
 * (or by virtual side when 'inverse' is requested).
 */
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
					     bool inverse)
{
	const union nf_inet_addr *addr;
	__be16 port;

	if (p->pe_data && p->pe->hashkey_raw)
		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
			ip_vs_conn_tab_mask;

	if (likely(!inverse)) {
		addr = p->caddr;
		port = p->cport;
	} else {
		addr = p->vaddr;
		port = p->vport;
	}

	return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}

/* Compute the table hash for an existing connection by rebuilding a
 * parameter block from its client address/port (and PE data, if any).
 */
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
	struct ip_vs_conn_param p;

	ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
			      &cp->caddr, cp->cport, NULL, 0, &p);

	if (cp->pe) {
		p.pe = cp->pe;
		p.pe_data = cp->pe_data;
		p.pe_data_len = cp->pe_data_len;
	}

	return ip_vs_conn_hashkey_param(&p, false);
}

/*
 *	Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
 *	returns bool success.
 *	The table holds one reference on the connection while it is
 *	hashed (refcnt is bumped here, dropped on unhash/unlink).
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned int hash;
	int ret;

	/* one-packet (OPS) connections are never inserted in the table */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return 0;

	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey_conn(cp);

	/* bucket lock first, then per-conn lock to serialize the flag check */
	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
		ret = 1;
	} else {
		/* double-hash indicates a caller bug; report the call site */
		pr_err("%s(): request for already hashed, called from %pF\n",
		       __func__, __builtin_return_address(0));
		ret = 0;
	}

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}


/*
 *	UNhashes ip_vs_conn from ip_vs_conn_tab.
 *	returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned int hash;
	int ret;

	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey_conn(cp);

	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		/* drop the table's reference taken in ip_vs_conn_hash() */
		hlist_del_rcu(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}

/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
 * returns bool success.
 * Unlike ip_vs_conn_unhash(), this only removes the entry when the
 * table holds the *last* reference (refcnt goes 1 -> 0 atomically),
 * so a connection still in use by another CPU is left in place.
 */
static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
{
	unsigned int hash;
	bool ret;

	hash = ip_vs_conn_hashkey_conn(cp);

	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		ret = false;
		/* Decrease refcnt and unlink conn only if we are last user */
		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
			hlist_del_rcu(&cp->c_list);
			cp->flags &= ~IP_VS_CONN_F_HASHED;
			ret = true;
		}
	} else
		/* not hashed (e.g. one-packet conn): success iff unreferenced */
		ret = atomic_read(&cp->refcnt) ? false : true;

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}


/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *	p->caddr, p->cport: pkt source address (foreign host)
 *	p->vaddr, p->vport: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey_param(p, false);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (p->cport == cp->cport && p->vport == cp->vport &&
		    cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
		    /* a zero cport must only match NO_CPORT entries and
		     * vice versa */
		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
		    p->protocol == cp->protocol &&
		    ip_vs_conn_net_eq(cp, p->net)) {
			/* may fail if the entry is concurrently expiring */
			if (!__ip_vs_conn_get(cp))
				continue;
			/* HIT */
			rcu_read_unlock();
			return cp;
		}
	}

	rcu_read_unlock();

	return NULL;
}

/* In-direction lookup; on miss, retry with cport 0 in case the entry
 * was created before the client port was known (IP_VS_CONN_F_NO_CPORT).
 * Returns a referenced connection or NULL.
 */
struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(p);
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
		struct ip_vs_conn_param cport_zero_p = *p;
		cport_zero_p.cport = 0;
		cp = __ip_vs_conn_in_get(&cport_zero_p);
	}

	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}

/* Build a conn parameter block from the packet's transport header.
 * Returns 0 on success, 1 if the ports cannot be read (e.g. fragment).
 */
static int
ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
			    const struct ip_vs_iphdr *iph,
			    int inverse, struct ip_vs_conn_param *p)
{
	__be16 _ports[2], *pptr;
	struct net *net = skb_net(skb);

	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
	if (pptr == NULL)
		return 1;

	if (likely(!inverse))
		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
				      pptr[0], &iph->daddr, pptr[1], p);
	else
		/* swap src/dst for reply-direction packets */
		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
				      pptr[1], &iph->saddr, pptr[0], p);
	return 0;
}

struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
			const struct ip_vs_iphdr *iph, int inverse)
{
	struct ip_vs_conn_param p;

	if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
		return NULL;

	return ip_vs_conn_in_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);

/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey_param(p, false);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		/* persistence engines match templates by their own data */
		if (unlikely(p->pe_data && p->pe->ct_match)) {
			if (!ip_vs_conn_net_eq(cp, p->net))
				continue;
			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
				if (__ip_vs_conn_get(cp))
					goto out;
			}
			continue;
		}

		if (cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
		    /* protocol should only be IPPROTO_IP if
		     * p->vaddr is a fwmark */
		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
				     p->af, p->vaddr, &cp->vaddr) &&
		    p->vport == cp->vport && p->cport == cp->cport &&
		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
		    p->protocol == cp->protocol &&
		    ip_vs_conn_net_eq(cp, p->net)) {
			if (__ip_vs_conn_get(cp))
				goto out;
		}
	}
	cp = NULL;

  out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}

/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 * Called for pkts coming from inside-to-OUTside.
 * p->caddr, p->cport: pkt source address (inside host)
 * p->vaddr, p->vport: pkt dest address (foreign host)
 * Note: the table is keyed by the client side, so the lookup hashes with
 * inverse=true and compares against cp->daddr/dport (the real server). */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp, *ret=NULL;

	/*
	 *	Check for "full" addressed entries
	 */
	hash = ip_vs_conn_hashkey_param(p, true);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (p->vport == cp->cport && p->cport == cp->dport &&
		    cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
		    p->protocol == cp->protocol &&
		    ip_vs_conn_net_eq(cp, p->net)) {
			if (!__ip_vs_conn_get(cp))
				continue;
			/* HIT */
			ret = cp;
			break;
		}
	}

	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      ret ? "hit" : "not hit");

	return ret;
}

struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
			 const struct ip_vs_iphdr *iph, int inverse)
{
	struct ip_vs_conn_param p;

	if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
		return NULL;

	return ip_vs_conn_out_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);

/*
 *      Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* one-packet connections expire immediately */
	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
		0 : cp->timeout;
	mod_timer(&cp->timer, jiffies+t);

	__ip_vs_conn_put(cp);
}


/*
 *	Fill a no_client_port connection with a client port number
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
	/* must leave the table while the key (cport) changes */
	if (ip_vs_conn_unhash(cp)) {
		spin_lock_bh(&cp->lock);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
			atomic_dec(&ip_vs_conn_no_cport_cnt);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
			cp->cport = cport;
		}
		spin_unlock_bh(&cp->lock);

		/* hash on new cport */
		ip_vs_conn_hash(cp);
	}
}


/*
 *	Bind a connection entry with the corresponding packet_xmit.
 *	Called by ip_vs_conn_new.
482 */ 483static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 484{ 485 switch (IP_VS_FWD_METHOD(cp)) { 486 case IP_VS_CONN_F_MASQ: 487 cp->packet_xmit = ip_vs_nat_xmit; 488 break; 489 490 case IP_VS_CONN_F_TUNNEL: 491 cp->packet_xmit = ip_vs_tunnel_xmit; 492 break; 493 494 case IP_VS_CONN_F_DROUTE: 495 cp->packet_xmit = ip_vs_dr_xmit; 496 break; 497 498 case IP_VS_CONN_F_LOCALNODE: 499 cp->packet_xmit = ip_vs_null_xmit; 500 break; 501 502 case IP_VS_CONN_F_BYPASS: 503 cp->packet_xmit = ip_vs_bypass_xmit; 504 break; 505 } 506} 507 508#ifdef CONFIG_IP_VS_IPV6 509static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) 510{ 511 switch (IP_VS_FWD_METHOD(cp)) { 512 case IP_VS_CONN_F_MASQ: 513 cp->packet_xmit = ip_vs_nat_xmit_v6; 514 break; 515 516 case IP_VS_CONN_F_TUNNEL: 517 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 518 break; 519 520 case IP_VS_CONN_F_DROUTE: 521 cp->packet_xmit = ip_vs_dr_xmit_v6; 522 break; 523 524 case IP_VS_CONN_F_LOCALNODE: 525 cp->packet_xmit = ip_vs_null_xmit; 526 break; 527 528 case IP_VS_CONN_F_BYPASS: 529 cp->packet_xmit = ip_vs_bypass_xmit_v6; 530 break; 531 } 532} 533#endif 534 535 536static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 537{ 538 return atomic_read(&dest->activeconns) 539 + atomic_read(&dest->inactconns); 540} 541 542/* 543 * Bind a connection entry with a virtual service destination 544 * Called just after a new connection entry is created. 
*/
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	unsigned int conn_flags;
	__u32 flags;

	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	ip_vs_dest_hold(dest);

	conn_flags = atomic_read(&dest->conn_flags);
	/* ONE_PACKET scheduling only makes sense for UDP */
	if (cp->protocol != IPPROTO_UDP)
		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
	flags = cp->flags;
	/* Bind with the destination and its corresponding transmitter */
	if (flags & IP_VS_CONN_F_SYNC) {
		/* if the connection is not template and is created
		 * by sync, preserve the activity flag.
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
		/* connections inherit forwarding method from dest */
		flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
	}
	flags |= conn_flags;
	cp->flags = flags;
	cp->dest = dest;

	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so modify the counters
		 * according to the flags, later the protocol can
		 * update them on state change
		 */
		if (!(flags & IP_VS_CONN_F_INACTIVE))
			atomic_inc(&dest->activeconns);
		else
			atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	/* mark the dest overloaded once its upper threshold is reached */
	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}


/*
 *	Check if there is a destination for the connection, if so
 *	bind the connection to the destination.
 */
void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest;

	rcu_read_lock();
	dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
			       cp->dport, &cp->vaddr, cp->vport,
			       cp->protocol, cp->fwmark, cp->flags);
	if (dest) {
		struct ip_vs_proto_data *pd;

		spin_lock_bh(&cp->lock);
		/* lost the race: someone else already bound a dest */
		if (cp->dest) {
			spin_unlock_bh(&cp->lock);
			rcu_read_unlock();
			return;
		}

		/* Applications work depending on the forwarding method
		 * but better to reassign them always when binding dest */
		if (cp->app)
			ip_vs_unbind_app(cp);

		ip_vs_bind_dest(cp, dest);
		spin_unlock_bh(&cp->lock);

		/* Update its packet transmitter */
		cp->packet_xmit = NULL;
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			ip_vs_bind_xmit_v6(cp);
		else
#endif
			ip_vs_bind_xmit(cp);

		pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
		if (pd && atomic_read(&pd->appcnt))
			ip_vs_bind_app(cp, pd->pp);
	}
	rcu_read_unlock();
}


/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
*/
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

	/* Clear the overload flag once the load has fallen back:
	 * below the lower threshold, or (lacking one) below 3/4 of the
	 * upper threshold, or unconditionally when no thresholds are set.
	 */
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	/* release the reference taken in ip_vs_bind_dest() */
	ip_vs_dest_put(dest);
}

/* Templates to quiescent (zero-weight) dests expire early only when the
 * expire_quiescent_template sysctl is enabled; without sysctl support
 * they never do.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	return ipvs->sysctl_expire_quiescent_template &&
		(atomic_read(&dest->weight) == 0);
#else
	return 0;
#endif
}

/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
723 */ 724int ip_vs_check_template(struct ip_vs_conn *ct) 725{ 726 struct ip_vs_dest *dest = ct->dest; 727 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); 728 729 /* 730 * Checking the dest server status. 731 */ 732 if ((dest == NULL) || 733 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 734 expire_quiescent_template(ipvs, dest)) { 735 IP_VS_DBG_BUF(9, "check_template: dest not available for " 736 "protocol %s s:%s:%d v:%s:%d " 737 "-> d:%s:%d\n", 738 ip_vs_proto_name(ct->protocol), 739 IP_VS_DBG_ADDR(ct->af, &ct->caddr), 740 ntohs(ct->cport), 741 IP_VS_DBG_ADDR(ct->af, &ct->vaddr), 742 ntohs(ct->vport), 743 IP_VS_DBG_ADDR(ct->af, &ct->daddr), 744 ntohs(ct->dport)); 745 746 /* 747 * Invalidate the connection template 748 */ 749 if (ct->vport != htons(0xffff)) { 750 if (ip_vs_conn_unhash(ct)) { 751 ct->dport = htons(0xffff); 752 ct->vport = htons(0xffff); 753 ct->cport = 0; 754 ip_vs_conn_hash(ct); 755 } 756 } 757 758 /* 759 * Simply decrease the refcnt of the template, 760 * don't restart its timer. 761 */ 762 __ip_vs_conn_put(ct); 763 return 0; 764 } 765 return 1; 766} 767 768static void ip_vs_conn_rcu_free(struct rcu_head *head) 769{ 770 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, 771 rcu_head); 772 773 ip_vs_pe_put(cp->pe); 774 kfree(cp->pe_data); 775 kmem_cache_free(ip_vs_conn_cachep, cp); 776} 777 778static void ip_vs_conn_expire(unsigned long data) 779{ 780 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 781 struct net *net = ip_vs_conn_net(cp); 782 struct netns_ipvs *ipvs = net_ipvs(net); 783 784 /* 785 * do I control anybody? 786 */ 787 if (atomic_read(&cp->n_control)) 788 goto expire_later; 789 790 /* Unlink conn if not referenced anymore */ 791 if (likely(ip_vs_conn_unlink(cp))) { 792 /* delete the timer if it is activated by other users */ 793 del_timer(&cp->timer); 794 795 /* does anybody control me? 
*/ 796 if (cp->control) 797 ip_vs_control_del(cp); 798 799 if (cp->flags & IP_VS_CONN_F_NFCT) { 800 ip_vs_conn_drop_conntrack(cp); 801 /* Do not access conntracks during subsys cleanup 802 * because nf_conntrack_find_get can not be used after 803 * conntrack cleanup for the net. 804 */ 805 smp_rmb(); 806 if (ipvs->enable) 807 ip_vs_conn_drop_conntrack(cp); 808 } 809 810 if (unlikely(cp->app != NULL)) 811 ip_vs_unbind_app(cp); 812 ip_vs_unbind_dest(cp); 813 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 814 atomic_dec(&ip_vs_conn_no_cport_cnt); 815 call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); 816 atomic_dec(&ipvs->conn_count); 817 return; 818 } 819 820 expire_later: 821 IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", 822 atomic_read(&cp->refcnt), 823 atomic_read(&cp->n_control)); 824 825 atomic_inc(&cp->refcnt); 826 cp->timeout = 60*HZ; 827 828 if (ipvs->sync_state & IP_VS_STATE_MASTER) 829 ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); 830 831 ip_vs_conn_put(cp); 832} 833 834/* Modify timer, so that it expires as soon as possible. 835 * Can be called without reference only if under RCU lock. 836 */ 837void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 838{ 839 /* Using mod_timer_pending will ensure the timer is not 840 * modified after the final del_timer in ip_vs_conn_expire. 
841 */ 842 if (timer_pending(&cp->timer) && 843 time_after(cp->timer.expires, jiffies)) 844 mod_timer_pending(&cp->timer, jiffies); 845} 846 847 848/* 849 * Create a new connection entry and hash it into the ip_vs_conn_tab 850 */ 851struct ip_vs_conn * 852ip_vs_conn_new(const struct ip_vs_conn_param *p, 853 const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, 854 struct ip_vs_dest *dest, __u32 fwmark) 855{ 856 struct ip_vs_conn *cp; 857 struct netns_ipvs *ipvs = net_ipvs(p->net); 858 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, 859 p->protocol); 860 861 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 862 if (cp == NULL) { 863 IP_VS_ERR_RL("%s(): no memory\n", __func__); 864 return NULL; 865 } 866 867 INIT_HLIST_NODE(&cp->c_list); 868 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 869 ip_vs_conn_net_set(cp, p->net); 870 cp->af = p->af; 871 cp->protocol = p->protocol; 872 ip_vs_addr_set(p->af, &cp->caddr, p->caddr); 873 cp->cport = p->cport; 874 ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); 875 cp->vport = p->vport; 876 /* proto should only be IPPROTO_IP if d_addr is a fwmark */ 877 ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, 878 &cp->daddr, daddr); 879 cp->dport = dport; 880 cp->flags = flags; 881 cp->fwmark = fwmark; 882 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { 883 ip_vs_pe_get(p->pe); 884 cp->pe = p->pe; 885 cp->pe_data = p->pe_data; 886 cp->pe_data_len = p->pe_data_len; 887 } else { 888 cp->pe = NULL; 889 cp->pe_data = NULL; 890 cp->pe_data_len = 0; 891 } 892 spin_lock_init(&cp->lock); 893 894 /* 895 * Set the entry is referenced by the current thread before hashing 896 * it in the table, so that other thread run ip_vs_random_dropentry 897 * but cannot drop this entry. 
898 */ 899 atomic_set(&cp->refcnt, 1); 900 901 cp->control = NULL; 902 atomic_set(&cp->n_control, 0); 903 atomic_set(&cp->in_pkts, 0); 904 905 cp->packet_xmit = NULL; 906 cp->app = NULL; 907 cp->app_data = NULL; 908 /* reset struct ip_vs_seq */ 909 cp->in_seq.delta = 0; 910 cp->out_seq.delta = 0; 911 912 atomic_inc(&ipvs->conn_count); 913 if (flags & IP_VS_CONN_F_NO_CPORT) 914 atomic_inc(&ip_vs_conn_no_cport_cnt); 915 916 /* Bind the connection with a destination server */ 917 cp->dest = NULL; 918 ip_vs_bind_dest(cp, dest); 919 920 /* Set its state and timeout */ 921 cp->state = 0; 922 cp->old_state = 0; 923 cp->timeout = 3*HZ; 924 cp->sync_endtime = jiffies & ~3UL; 925 926 /* Bind its packet transmitter */ 927#ifdef CONFIG_IP_VS_IPV6 928 if (p->af == AF_INET6) 929 ip_vs_bind_xmit_v6(cp); 930 else 931#endif 932 ip_vs_bind_xmit(cp); 933 934 if (unlikely(pd && atomic_read(&pd->appcnt))) 935 ip_vs_bind_app(cp, pd->pp); 936 937 /* 938 * Allow conntrack to be preserved. By default, conntrack 939 * is created and destroyed for every packet. 940 * Sometimes keeping conntrack can be useful for 941 * IP_VS_CONN_F_ONE_PACKET too. 
942 */ 943 944 if (ip_vs_conntrack_enabled(ipvs)) 945 cp->flags |= IP_VS_CONN_F_NFCT; 946 947 /* Hash it in the ip_vs_conn_tab finally */ 948 ip_vs_conn_hash(cp); 949 950 return cp; 951} 952 953/* 954 * /proc/net/ip_vs_conn entries 955 */ 956#ifdef CONFIG_PROC_FS 957struct ip_vs_iter_state { 958 struct seq_net_private p; 959 struct hlist_head *l; 960}; 961 962static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 963{ 964 int idx; 965 struct ip_vs_conn *cp; 966 struct ip_vs_iter_state *iter = seq->private; 967 968 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 969 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 970 /* __ip_vs_conn_get() is not needed by 971 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show 972 */ 973 if (pos-- == 0) { 974 iter->l = &ip_vs_conn_tab[idx]; 975 return cp; 976 } 977 } 978 rcu_read_unlock(); 979 rcu_read_lock(); 980 } 981 982 return NULL; 983} 984 985static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 986 __acquires(RCU) 987{ 988 struct ip_vs_iter_state *iter = seq->private; 989 990 iter->l = NULL; 991 rcu_read_lock(); 992 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 993} 994 995static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 996{ 997 struct ip_vs_conn *cp = v; 998 struct ip_vs_iter_state *iter = seq->private; 999 struct hlist_node *e; 1000 struct hlist_head *l = iter->l; 1001 int idx; 1002 1003 ++*pos; 1004 if (v == SEQ_START_TOKEN) 1005 return ip_vs_conn_array(seq, 0); 1006 1007 /* more on same hash chain? 
*/ 1008 e = rcu_dereference(hlist_next_rcu(&cp->c_list)); 1009 if (e) 1010 return hlist_entry(e, struct ip_vs_conn, c_list); 1011 1012 idx = l - ip_vs_conn_tab; 1013 while (++idx < ip_vs_conn_tab_size) { 1014 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1015 iter->l = &ip_vs_conn_tab[idx]; 1016 return cp; 1017 } 1018 rcu_read_unlock(); 1019 rcu_read_lock(); 1020 } 1021 iter->l = NULL; 1022 return NULL; 1023} 1024 1025static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 1026 __releases(RCU) 1027{ 1028 rcu_read_unlock(); 1029} 1030 1031static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 1032{ 1033 1034 if (v == SEQ_START_TOKEN) 1035 seq_puts(seq, 1036 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 1037 else { 1038 const struct ip_vs_conn *cp = v; 1039 struct net *net = seq_file_net(seq); 1040 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 1041 size_t len = 0; 1042 1043 if (!ip_vs_conn_net_eq(cp, net)) 1044 return 0; 1045 if (cp->pe_data) { 1046 pe_data[0] = ' '; 1047 len = strlen(cp->pe->name); 1048 memcpy(pe_data + 1, cp->pe->name, len); 1049 pe_data[len + 1] = ' '; 1050 len += 2; 1051 len += cp->pe->show_pe_data(cp, pe_data + len); 1052 } 1053 pe_data[len] = '\0'; 1054 1055#ifdef CONFIG_IP_VS_IPV6 1056 if (cp->af == AF_INET6) 1057 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1058 "%pI6 %04X %-11s %7lu%s\n", 1059 ip_vs_proto_name(cp->protocol), 1060 &cp->caddr.in6, ntohs(cp->cport), 1061 &cp->vaddr.in6, ntohs(cp->vport), 1062 &cp->daddr.in6, ntohs(cp->dport), 1063 ip_vs_state_name(cp->protocol, cp->state), 1064 (cp->timer.expires-jiffies)/HZ, pe_data); 1065 else 1066#endif 1067 seq_printf(seq, 1068 "%-3s %08X %04X %08X %04X" 1069 " %08X %04X %-11s %7lu%s\n", 1070 ip_vs_proto_name(cp->protocol), 1071 ntohl(cp->caddr.ip), ntohs(cp->cport), 1072 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1073 ntohl(cp->daddr.ip), ntohs(cp->dport), 1074 ip_vs_state_name(cp->protocol, cp->state), 1075 
(cp->timer.expires-jiffies)/HZ, pe_data); 1076 } 1077 return 0; 1078} 1079 1080static const struct seq_operations ip_vs_conn_seq_ops = { 1081 .start = ip_vs_conn_seq_start, 1082 .next = ip_vs_conn_seq_next, 1083 .stop = ip_vs_conn_seq_stop, 1084 .show = ip_vs_conn_seq_show, 1085}; 1086 1087static int ip_vs_conn_open(struct inode *inode, struct file *file) 1088{ 1089 return seq_open_net(inode, file, &ip_vs_conn_seq_ops, 1090 sizeof(struct ip_vs_iter_state)); 1091} 1092 1093static const struct file_operations ip_vs_conn_fops = { 1094 .owner = THIS_MODULE, 1095 .open = ip_vs_conn_open, 1096 .read = seq_read, 1097 .llseek = seq_lseek, 1098 .release = seq_release_net, 1099}; 1100 1101static const char *ip_vs_origin_name(unsigned int flags) 1102{ 1103 if (flags & IP_VS_CONN_F_SYNC) 1104 return "SYNC"; 1105 else 1106 return "LOCAL"; 1107} 1108 1109static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 1110{ 1111 1112 if (v == SEQ_START_TOKEN) 1113 seq_puts(seq, 1114 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1115 else { 1116 const struct ip_vs_conn *cp = v; 1117 struct net *net = seq_file_net(seq); 1118 1119 if (!ip_vs_conn_net_eq(cp, net)) 1120 return 0; 1121 1122#ifdef CONFIG_IP_VS_IPV6 1123 if (cp->af == AF_INET6) 1124 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", 1125 ip_vs_proto_name(cp->protocol), 1126 &cp->caddr.in6, ntohs(cp->cport), 1127 &cp->vaddr.in6, ntohs(cp->vport), 1128 &cp->daddr.in6, ntohs(cp->dport), 1129 ip_vs_state_name(cp->protocol, cp->state), 1130 ip_vs_origin_name(cp->flags), 1131 (cp->timer.expires-jiffies)/HZ); 1132 else 1133#endif 1134 seq_printf(seq, 1135 "%-3s %08X %04X %08X %04X " 1136 "%08X %04X %-11s %-6s %7lu\n", 1137 ip_vs_proto_name(cp->protocol), 1138 ntohl(cp->caddr.ip), ntohs(cp->cport), 1139 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1140 ntohl(cp->daddr.ip), ntohs(cp->dport), 1141 ip_vs_state_name(cp->protocol, cp->state), 1142 ip_vs_origin_name(cp->flags), 1143 
(cp->timer.expires-jiffies)/HZ); 1144 } 1145 return 0; 1146} 1147 1148static const struct seq_operations ip_vs_conn_sync_seq_ops = { 1149 .start = ip_vs_conn_seq_start, 1150 .next = ip_vs_conn_seq_next, 1151 .stop = ip_vs_conn_seq_stop, 1152 .show = ip_vs_conn_sync_seq_show, 1153}; 1154 1155static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 1156{ 1157 return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, 1158 sizeof(struct ip_vs_iter_state)); 1159} 1160 1161static const struct file_operations ip_vs_conn_sync_fops = { 1162 .owner = THIS_MODULE, 1163 .open = ip_vs_conn_sync_open, 1164 .read = seq_read, 1165 .llseek = seq_lseek, 1166 .release = seq_release_net, 1167}; 1168 1169#endif 1170 1171 1172/* 1173 * Randomly drop connection entries before running out of memory 1174 */ 1175static inline int todrop_entry(struct ip_vs_conn *cp) 1176{ 1177 /* 1178 * The drop rate array needs tuning for real environments. 1179 * Called from timer bh only => no locking 1180 */ 1181 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 1182 static char todrop_counter[9] = {0}; 1183 int i; 1184 1185 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 1186 This will leave enough time for normal connection to get 1187 through. 
*/ 1188 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 1189 return 0; 1190 1191 /* Don't drop the entry if its number of incoming packets is not 1192 located in [0, 8] */ 1193 i = atomic_read(&cp->in_pkts); 1194 if (i > 8 || i < 0) return 0; 1195 1196 if (!todrop_rate[i]) return 0; 1197 if (--todrop_counter[i] > 0) return 0; 1198 1199 todrop_counter[i] = todrop_rate[i]; 1200 return 1; 1201} 1202 1203/* Called from keventd and must protect itself from softirqs */ 1204void ip_vs_random_dropentry(struct net *net) 1205{ 1206 int idx; 1207 struct ip_vs_conn *cp, *cp_c; 1208 1209 /* 1210 * Randomly scan 1/32 of the whole table every second 1211 */ 1212 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1213 unsigned int hash = net_random() & ip_vs_conn_tab_mask; 1214 1215 /* 1216 * Lock is actually needed in this loop. 1217 */ 1218 rcu_read_lock(); 1219 1220 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1221 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 1222 /* connection template */ 1223 continue; 1224 if (!ip_vs_conn_net_eq(cp, net)) 1225 continue; 1226 if (cp->protocol == IPPROTO_TCP) { 1227 switch(cp->state) { 1228 case IP_VS_TCP_S_SYN_RECV: 1229 case IP_VS_TCP_S_SYNACK: 1230 break; 1231 1232 case IP_VS_TCP_S_ESTABLISHED: 1233 if (todrop_entry(cp)) 1234 break; 1235 continue; 1236 1237 default: 1238 continue; 1239 } 1240 } else { 1241 if (!todrop_entry(cp)) 1242 continue; 1243 } 1244 1245 IP_VS_DBG(4, "del connection\n"); 1246 ip_vs_conn_expire_now(cp); 1247 cp_c = cp->control; 1248 /* cp->control is valid only with reference to cp */ 1249 if (cp_c && __ip_vs_conn_get(cp)) { 1250 IP_VS_DBG(4, "del conn template\n"); 1251 ip_vs_conn_expire_now(cp_c); 1252 __ip_vs_conn_put(cp); 1253 } 1254 } 1255 rcu_read_unlock(); 1256 } 1257} 1258 1259 1260/* 1261 * Flush all the connection entries in the ip_vs_conn_tab 1262 */ 1263static void ip_vs_conn_flush(struct net *net) 1264{ 1265 int idx; 1266 struct ip_vs_conn *cp, *cp_c; 1267 struct 
netns_ipvs *ipvs = net_ipvs(net); 1268 1269flush_again: 1270 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1271 /* 1272 * Lock is actually needed in this loop. 1273 */ 1274 rcu_read_lock(); 1275 1276 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1277 if (!ip_vs_conn_net_eq(cp, net)) 1278 continue; 1279 IP_VS_DBG(4, "del connection\n"); 1280 ip_vs_conn_expire_now(cp); 1281 cp_c = cp->control; 1282 /* cp->control is valid only with reference to cp */ 1283 if (cp_c && __ip_vs_conn_get(cp)) { 1284 IP_VS_DBG(4, "del conn template\n"); 1285 ip_vs_conn_expire_now(cp_c); 1286 __ip_vs_conn_put(cp); 1287 } 1288 } 1289 rcu_read_unlock(); 1290 } 1291 1292 /* the counter may be not NULL, because maybe some conn entries 1293 are run by slow timer handler or unhashed but still referred */ 1294 if (atomic_read(&ipvs->conn_count) != 0) { 1295 schedule(); 1296 goto flush_again; 1297 } 1298} 1299/* 1300 * per netns init and exit 1301 */ 1302int __net_init ip_vs_conn_net_init(struct net *net) 1303{ 1304 struct netns_ipvs *ipvs = net_ipvs(net); 1305 1306 atomic_set(&ipvs->conn_count, 0); 1307 1308 proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); 1309 proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); 1310 return 0; 1311} 1312 1313void __net_exit ip_vs_conn_net_cleanup(struct net *net) 1314{ 1315 /* flush all the connection entries first */ 1316 ip_vs_conn_flush(net); 1317 remove_proc_entry("ip_vs_conn", net->proc_net); 1318 remove_proc_entry("ip_vs_conn_sync", net->proc_net); 1319} 1320 1321int __init ip_vs_conn_init(void) 1322{ 1323 int idx; 1324 1325 /* Compute size and mask */ 1326 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1327 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1328 1329 /* 1330 * Allocate the connection hash table and initialize its list heads 1331 */ 1332 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); 1333 if (!ip_vs_conn_tab) 1334 return -ENOMEM; 1335 1336 /* Allocate ip_vs_conn 
slab cache */ 1337 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 1338 sizeof(struct ip_vs_conn), 0, 1339 SLAB_HWCACHE_ALIGN, NULL); 1340 if (!ip_vs_conn_cachep) { 1341 vfree(ip_vs_conn_tab); 1342 return -ENOMEM; 1343 } 1344 1345 pr_info("Connection hash table configured " 1346 "(size=%d, memory=%ldKbytes)\n", 1347 ip_vs_conn_tab_size, 1348 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1349 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1350 sizeof(struct ip_vs_conn)); 1351 1352 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1353 INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); 1354 1355 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1356 spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); 1357 } 1358 1359 /* calculate the random value for connection hash */ 1360 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1361 1362 return 0; 1363} 1364 1365void ip_vs_conn_cleanup(void) 1366{ 1367 /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ 1368 rcu_barrier(); 1369 /* Release the empty cache */ 1370 kmem_cache_destroy(ip_vs_conn_cachep); 1371 vfree(ip_vs_conn_tab); 1372} 1373