1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25#define KMSG_COMPONENT "IPVS" 26#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 27 28#include <linux/interrupt.h> 29#include <linux/in.h> 30#include <linux/inet.h> 31#include <linux/net.h> 32#include <linux/kernel.h> 33#include <linux/module.h> 34#include <linux/vmalloc.h> 35#include <linux/proc_fs.h> /* for proc_net_* */ 36#include <linux/slab.h> 37#include <linux/seq_file.h> 38#include <linux/jhash.h> 39#include <linux/random.h> 40 41#include <net/net_namespace.h> 42#include <net/ip_vs.h> 43 44 45#ifndef CONFIG_IP_VS_TAB_BITS 46#define CONFIG_IP_VS_TAB_BITS 12 47#endif 48 49/* 50 * Connection hash size. Default is what was selected at compile time. 51*/ 52static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 53module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 54MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 55 56/* size and mask values */ 57int ip_vs_conn_tab_size __read_mostly; 58static int ip_vs_conn_tab_mask __read_mostly; 59 60/* 61 * Connection hash table: for input and output packets lookups of IPVS 62 */ 63static struct hlist_head *ip_vs_conn_tab __read_mostly; 64 65/* SLAB cache for IPVS connections */ 66static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 67 68/* counter for no client port connections */ 69static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 70 71/* random value for IPVS connection hash */ 72static unsigned int ip_vs_conn_rnd __read_mostly; 73 74/* 75 * Fine locking granularity for big connection hash table 76 */ 77#define CT_LOCKARRAY_BITS 5 78#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 79#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 80 81/* We need an addrstrlen that works with or without v6 */ 82#ifdef CONFIG_IP_VS_IPV6 83#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN 84#else 85#define IP_VS_ADDRSTRLEN (8+1) 86#endif 87 88struct ip_vs_aligned_lock 89{ 90 spinlock_t l; 91} __attribute__((__aligned__(SMP_CACHE_BYTES))); 92 93/* lock array for conn table */ 94static struct ip_vs_aligned_lock 95__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 96 97static inline void ct_write_lock_bh(unsigned int key) 98{ 99 spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 100} 101 102static inline void ct_write_unlock_bh(unsigned int key) 103{ 104 spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 105} 106 107 108/* 109 * Returns hash value for IPVS connection entry 110 */ 111static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, 112 const union nf_inet_addr *addr, 113 __be16 port) 114{ 115#ifdef CONFIG_IP_VS_IPV6 116 if (af == AF_INET6) 117 return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 118 (__force u32)port, proto, ip_vs_conn_rnd) ^ 119 ((size_t)net>>8)) & ip_vs_conn_tab_mask; 120#endif 121 return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 122 ip_vs_conn_rnd) ^ 123 ((size_t)net>>8)) & ip_vs_conn_tab_mask; 124} 125 126static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, 127 bool inverse) 128{ 129 const union nf_inet_addr *addr; 130 __be16 port; 131 132 if (p->pe_data && p->pe->hashkey_raw) 133 return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & 134 ip_vs_conn_tab_mask; 135 136 if (likely(!inverse)) { 137 addr = p->caddr; 138 port = p->cport; 139 } else { 140 addr = p->vaddr; 141 port = p->vport; 142 } 143 144 return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); 145} 146 147static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) 148{ 149 struct ip_vs_conn_param p; 150 151 ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, 152 &cp->caddr, cp->cport, NULL, 0, &p); 153 154 if (cp->pe) { 155 p.pe = cp->pe; 156 p.pe_data = cp->pe_data; 157 p.pe_data_len = cp->pe_data_len; 158 } 159 160 return ip_vs_conn_hashkey_param(&p, false); 161} 162 163/* 164 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 165 * returns bool success. 166 */ 167static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 168{ 169 unsigned int hash; 170 int ret; 171 172 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 173 return 0; 174 175 /* Hash by protocol, client address and port */ 176 hash = ip_vs_conn_hashkey_conn(cp); 177 178 ct_write_lock_bh(hash); 179 spin_lock(&cp->lock); 180 181 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 182 cp->flags |= IP_VS_CONN_F_HASHED; 183 atomic_inc(&cp->refcnt); 184 hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); 185 ret = 1; 186 } else { 187 pr_err("%s(): request for already hashed, called from %pF\n", 188 __func__, __builtin_return_address(0)); 189 ret = 0; 190 } 191 192 spin_unlock(&cp->lock); 193 ct_write_unlock_bh(hash); 194 195 return ret; 196} 197 198 199/* 200 * UNhashes ip_vs_conn from ip_vs_conn_tab. 201 * returns bool success. Caller should hold conn reference. 202 */ 203static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 204{ 205 unsigned int hash; 206 int ret; 207 208 /* unhash it and decrease its reference counter */ 209 hash = ip_vs_conn_hashkey_conn(cp); 210 211 ct_write_lock_bh(hash); 212 spin_lock(&cp->lock); 213 214 if (cp->flags & IP_VS_CONN_F_HASHED) { 215 hlist_del_rcu(&cp->c_list); 216 cp->flags &= ~IP_VS_CONN_F_HASHED; 217 atomic_dec(&cp->refcnt); 218 ret = 1; 219 } else 220 ret = 0; 221 222 spin_unlock(&cp->lock); 223 ct_write_unlock_bh(hash); 224 225 return ret; 226} 227 228/* Try to unlink ip_vs_conn from ip_vs_conn_tab. 229 * returns bool success. 230 */ 231static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) 232{ 233 unsigned int hash; 234 bool ret; 235 236 hash = ip_vs_conn_hashkey_conn(cp); 237 238 ct_write_lock_bh(hash); 239 spin_lock(&cp->lock); 240 241 if (cp->flags & IP_VS_CONN_F_HASHED) { 242 ret = false; 243 /* Decrease refcnt and unlink conn only if we are last user */ 244 if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { 245 hlist_del_rcu(&cp->c_list); 246 cp->flags &= ~IP_VS_CONN_F_HASHED; 247 ret = true; 248 } 249 } else 250 ret = atomic_read(&cp->refcnt) ? false : true; 251 252 spin_unlock(&cp->lock); 253 ct_write_unlock_bh(hash); 254 255 return ret; 256} 257 258 259/* 260 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 261 * Called for pkts coming from OUTside-to-INside. 262 * p->caddr, p->cport: pkt source address (foreign host) 263 * p->vaddr, p->vport: pkt dest address (load balancer) 264 */ 265static inline struct ip_vs_conn * 266__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 267{ 268 unsigned int hash; 269 struct ip_vs_conn *cp; 270 271 hash = ip_vs_conn_hashkey_param(p, false); 272 273 rcu_read_lock(); 274 275 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 276 if (p->cport == cp->cport && p->vport == cp->vport && 277 cp->af == p->af && 278 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 279 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && 280 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 281 p->protocol == cp->protocol && 282 ip_vs_conn_net_eq(cp, p->net)) { 283 if (!__ip_vs_conn_get(cp)) 284 continue; 285 /* HIT */ 286 rcu_read_unlock(); 287 return cp; 288 } 289 } 290 291 rcu_read_unlock(); 292 293 return NULL; 294} 295 296struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 297{ 298 struct ip_vs_conn *cp; 299 300 cp = __ip_vs_conn_in_get(p); 301 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { 302 struct ip_vs_conn_param cport_zero_p = *p; 303 cport_zero_p.cport = 0; 304 cp = __ip_vs_conn_in_get(&cport_zero_p); 305 } 306 307 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", 308 ip_vs_proto_name(p->protocol), 309 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 310 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 311 cp ? "hit" : "not hit"); 312 313 return cp; 314} 315 316static int 317ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, 318 const struct ip_vs_iphdr *iph, 319 int inverse, struct ip_vs_conn_param *p) 320{ 321 __be16 _ports[2], *pptr; 322 struct net *net = skb_net(skb); 323 324 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 325 if (pptr == NULL) 326 return 1; 327 328 if (likely(!inverse)) 329 ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, 330 pptr[0], &iph->daddr, pptr[1], p); 331 else 332 ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, 333 pptr[1], &iph->saddr, pptr[0], p); 334 return 0; 335} 336 337struct ip_vs_conn * 338ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, 339 const struct ip_vs_iphdr *iph, int inverse) 340{ 341 struct ip_vs_conn_param p; 342 343 if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) 344 return NULL; 345 346 return ip_vs_conn_in_get(&p); 347} 348EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); 349 350/* Get reference to connection template */ 351struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) 352{ 353 unsigned int hash; 354 struct ip_vs_conn *cp; 355 356 hash = ip_vs_conn_hashkey_param(p, false); 357 358 rcu_read_lock(); 359 360 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 361 if (unlikely(p->pe_data && p->pe->ct_match)) { 362 if (!ip_vs_conn_net_eq(cp, p->net)) 363 continue; 364 if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { 365 if (__ip_vs_conn_get(cp)) 366 goto out; 367 } 368 continue; 369 } 370 371 if (cp->af == p->af && 372 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 373 /* protocol should only be IPPROTO_IP if 374 * p->vaddr is a fwmark */ 375 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : 376 p->af, p->vaddr, &cp->vaddr) && 377 p->vport == cp->vport && p->cport == cp->cport && 378 cp->flags & IP_VS_CONN_F_TEMPLATE && 379 p->protocol == cp->protocol && 380 ip_vs_conn_net_eq(cp, p->net)) { 381 if (__ip_vs_conn_get(cp)) 382 goto out; 383 } 384 } 385 cp = NULL; 386 387 out: 388 rcu_read_unlock(); 389 390 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", 391 ip_vs_proto_name(p->protocol), 392 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 393 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 394 cp ? "hit" : "not hit"); 395 396 return cp; 397} 398 399/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 400 * Called for pkts coming from inside-to-OUTside. 401 * p->caddr, p->cport: pkt source address (inside host) 402 * p->vaddr, p->vport: pkt dest address (foreign host) */ 403struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) 404{ 405 unsigned int hash; 406 struct ip_vs_conn *cp, *ret=NULL; 407 408 /* 409 * Check for "full" addressed entries 410 */ 411 hash = ip_vs_conn_hashkey_param(p, true); 412 413 rcu_read_lock(); 414 415 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 416 if (p->vport == cp->cport && p->cport == cp->dport && 417 cp->af == p->af && 418 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && 419 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && 420 p->protocol == cp->protocol && 421 ip_vs_conn_net_eq(cp, p->net)) { 422 if (!__ip_vs_conn_get(cp)) 423 continue; 424 /* HIT */ 425 ret = cp; 426 break; 427 } 428 } 429 430 rcu_read_unlock(); 431 432 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 433 ip_vs_proto_name(p->protocol), 434 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 435 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 436 ret ? "hit" : "not hit"); 437 438 return ret; 439} 440 441struct ip_vs_conn * 442ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, 443 const struct ip_vs_iphdr *iph, int inverse) 444{ 445 struct ip_vs_conn_param p; 446 447 if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) 448 return NULL; 449 450 return ip_vs_conn_out_get(&p); 451} 452EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); 453 454/* 455 * Put back the conn and restart its timer with its timeout 456 */ 457void ip_vs_conn_put(struct ip_vs_conn *cp) 458{ 459 unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 460 0 : cp->timeout; 461 mod_timer(&cp->timer, jiffies+t); 462 463 __ip_vs_conn_put(cp); 464} 465 466 467/* 468 * Fill a no_client_port connection with a client port number 469 */ 470void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 471{ 472 if (ip_vs_conn_unhash(cp)) { 473 spin_lock_bh(&cp->lock); 474 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 475 atomic_dec(&ip_vs_conn_no_cport_cnt); 476 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 477 cp->cport = cport; 478 } 479 spin_unlock_bh(&cp->lock); 480 481 /* hash on new dport */ 482 ip_vs_conn_hash(cp); 483 } 484} 485 486 487/* 488 * Bind a connection entry with the corresponding packet_xmit. 489 * Called by ip_vs_conn_new. 490 */ 491static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 492{ 493 switch (IP_VS_FWD_METHOD(cp)) { 494 case IP_VS_CONN_F_MASQ: 495 cp->packet_xmit = ip_vs_nat_xmit; 496 break; 497 498 case IP_VS_CONN_F_TUNNEL: 499#ifdef CONFIG_IP_VS_IPV6 500 if (cp->daf == AF_INET6) 501 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 502 else 503#endif 504 cp->packet_xmit = ip_vs_tunnel_xmit; 505 break; 506 507 case IP_VS_CONN_F_DROUTE: 508 cp->packet_xmit = ip_vs_dr_xmit; 509 break; 510 511 case IP_VS_CONN_F_LOCALNODE: 512 cp->packet_xmit = ip_vs_null_xmit; 513 break; 514 515 case IP_VS_CONN_F_BYPASS: 516 cp->packet_xmit = ip_vs_bypass_xmit; 517 break; 518 } 519} 520 521#ifdef CONFIG_IP_VS_IPV6 522static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) 523{ 524 switch (IP_VS_FWD_METHOD(cp)) { 525 case IP_VS_CONN_F_MASQ: 526 cp->packet_xmit = ip_vs_nat_xmit_v6; 527 break; 528 529 case IP_VS_CONN_F_TUNNEL: 530 if (cp->daf == AF_INET6) 531 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 532 else 533 cp->packet_xmit = ip_vs_tunnel_xmit; 534 break; 535 536 case IP_VS_CONN_F_DROUTE: 537 cp->packet_xmit = ip_vs_dr_xmit_v6; 538 break; 539 540 case IP_VS_CONN_F_LOCALNODE: 541 cp->packet_xmit = ip_vs_null_xmit; 542 break; 543 544 case IP_VS_CONN_F_BYPASS: 545 cp->packet_xmit = ip_vs_bypass_xmit_v6; 546 break; 547 } 548} 549#endif 550 551 552static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 553{ 554 return atomic_read(&dest->activeconns) 555 + atomic_read(&dest->inactconns); 556} 557 558/* 559 * Bind a connection entry with a virtual service destination 560 * Called just after a new connection entry is created. 561 */ 562static inline void 563ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 564{ 565 unsigned int conn_flags; 566 __u32 flags; 567 568 /* if dest is NULL, then return directly */ 569 if (!dest) 570 return; 571 572 /* Increase the refcnt counter of the dest */ 573 ip_vs_dest_hold(dest); 574 575 conn_flags = atomic_read(&dest->conn_flags); 576 if (cp->protocol != IPPROTO_UDP) 577 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; 578 flags = cp->flags; 579 /* Bind with the destination and its corresponding transmitter */ 580 if (flags & IP_VS_CONN_F_SYNC) { 581 /* if the connection is not template and is created 582 * by sync, preserve the activity flag. 583 */ 584 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 585 conn_flags &= ~IP_VS_CONN_F_INACTIVE; 586 /* connections inherit forwarding method from dest */ 587 flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); 588 } 589 flags |= conn_flags; 590 cp->flags = flags; 591 cp->dest = dest; 592 593 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " 594 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 595 "dest->refcnt:%d\n", 596 ip_vs_proto_name(cp->protocol), 597 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 598 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 599 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 600 ip_vs_fwd_tag(cp), cp->state, 601 cp->flags, atomic_read(&cp->refcnt), 602 atomic_read(&dest->refcnt)); 603 604 /* Update the connection counters */ 605 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 606 /* It is a normal connection, so modify the counters 607 * according to the flags, later the protocol can 608 * update them on state change 609 */ 610 if (!(flags & IP_VS_CONN_F_INACTIVE)) 611 atomic_inc(&dest->activeconns); 612 else 613 atomic_inc(&dest->inactconns); 614 } else { 615 /* It is a persistent connection/template, so increase 616 the persistent connection counter */ 617 atomic_inc(&dest->persistconns); 618 } 619 620 if (dest->u_threshold != 0 && 621 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 622 dest->flags |= IP_VS_DEST_F_OVERLOAD; 623} 624 625 626/* 627 * Check if there is a destination for the connection, if so 628 * bind the connection to the destination. 629 */ 630void ip_vs_try_bind_dest(struct ip_vs_conn *cp) 631{ 632 struct ip_vs_dest *dest; 633 634 rcu_read_lock(); 635 636 /* This function is only invoked by the synchronization code. We do 637 * not currently support heterogeneous pools with synchronization, 638 * so we can make the assumption that the svc_af is the same as the 639 * dest_af 640 */ 641 dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, 642 cp->dport, &cp->vaddr, cp->vport, 643 cp->protocol, cp->fwmark, cp->flags); 644 if (dest) { 645 struct ip_vs_proto_data *pd; 646 647 spin_lock_bh(&cp->lock); 648 if (cp->dest) { 649 spin_unlock_bh(&cp->lock); 650 rcu_read_unlock(); 651 return; 652 } 653 654 /* Applications work depending on the forwarding method 655 * but better to reassign them always when binding dest */ 656 if (cp->app) 657 ip_vs_unbind_app(cp); 658 659 ip_vs_bind_dest(cp, dest); 660 spin_unlock_bh(&cp->lock); 661 662 /* Update its packet transmitter */ 663 cp->packet_xmit = NULL; 664#ifdef CONFIG_IP_VS_IPV6 665 if (cp->af == AF_INET6) 666 ip_vs_bind_xmit_v6(cp); 667 else 668#endif 669 ip_vs_bind_xmit(cp); 670 671 pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); 672 if (pd && atomic_read(&pd->appcnt)) 673 ip_vs_bind_app(cp, pd->pp); 674 } 675 rcu_read_unlock(); 676} 677 678 679/* 680 * Unbind a connection entry with its VS destination 681 * Called by the ip_vs_conn_expire function. 682 */ 683static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 684{ 685 struct ip_vs_dest *dest = cp->dest; 686 687 if (!dest) 688 return; 689 690 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " 691 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 692 "dest->refcnt:%d\n", 693 ip_vs_proto_name(cp->protocol), 694 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 695 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 696 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 697 ip_vs_fwd_tag(cp), cp->state, 698 cp->flags, atomic_read(&cp->refcnt), 699 atomic_read(&dest->refcnt)); 700 701 /* Update the connection counters */ 702 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 703 /* It is a normal connection, so decrease the inactconns 704 or activeconns counter */ 705 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 706 atomic_dec(&dest->inactconns); 707 } else { 708 atomic_dec(&dest->activeconns); 709 } 710 } else { 711 /* It is a persistent connection/template, so decrease 712 the persistent connection counter */ 713 atomic_dec(&dest->persistconns); 714 } 715 716 if (dest->l_threshold != 0) { 717 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 718 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 719 } else if (dest->u_threshold != 0) { 720 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 721 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 722 } else { 723 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 724 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 725 } 726 727 ip_vs_dest_put(dest); 728} 729 730static int expire_quiescent_template(struct netns_ipvs *ipvs, 731 struct ip_vs_dest *dest) 732{ 733#ifdef CONFIG_SYSCTL 734 return ipvs->sysctl_expire_quiescent_template && 735 (atomic_read(&dest->weight) == 0); 736#else 737 return 0; 738#endif 739} 740 741/* 742 * Checking if the destination of a connection template is available. 743 * If available, return 1, otherwise invalidate this connection 744 * template and return 0. 745 */ 746int ip_vs_check_template(struct ip_vs_conn *ct) 747{ 748 struct ip_vs_dest *dest = ct->dest; 749 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); 750 751 /* 752 * Checking the dest server status. 753 */ 754 if ((dest == NULL) || 755 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 756 expire_quiescent_template(ipvs, dest)) { 757 IP_VS_DBG_BUF(9, "check_template: dest not available for " 758 "protocol %s s:%s:%d v:%s:%d " 759 "-> d:%s:%d\n", 760 ip_vs_proto_name(ct->protocol), 761 IP_VS_DBG_ADDR(ct->af, &ct->caddr), 762 ntohs(ct->cport), 763 IP_VS_DBG_ADDR(ct->af, &ct->vaddr), 764 ntohs(ct->vport), 765 IP_VS_DBG_ADDR(ct->daf, &ct->daddr), 766 ntohs(ct->dport)); 767 768 /* 769 * Invalidate the connection template 770 */ 771 if (ct->vport != htons(0xffff)) { 772 if (ip_vs_conn_unhash(ct)) { 773 ct->dport = htons(0xffff); 774 ct->vport = htons(0xffff); 775 ct->cport = 0; 776 ip_vs_conn_hash(ct); 777 } 778 } 779 780 /* 781 * Simply decrease the refcnt of the template, 782 * don't restart its timer. 783 */ 784 __ip_vs_conn_put(ct); 785 return 0; 786 } 787 return 1; 788} 789 790static void ip_vs_conn_rcu_free(struct rcu_head *head) 791{ 792 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, 793 rcu_head); 794 795 ip_vs_pe_put(cp->pe); 796 kfree(cp->pe_data); 797 kmem_cache_free(ip_vs_conn_cachep, cp); 798} 799 800static void ip_vs_conn_expire(unsigned long data) 801{ 802 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 803 struct net *net = ip_vs_conn_net(cp); 804 struct netns_ipvs *ipvs = net_ipvs(net); 805 806 /* 807 * do I control anybody? 808 */ 809 if (atomic_read(&cp->n_control)) 810 goto expire_later; 811 812 /* Unlink conn if not referenced anymore */ 813 if (likely(ip_vs_conn_unlink(cp))) { 814 /* delete the timer if it is activated by other users */ 815 del_timer(&cp->timer); 816 817 /* does anybody control me? */ 818 if (cp->control) 819 ip_vs_control_del(cp); 820 821 if (cp->flags & IP_VS_CONN_F_NFCT) { 822 /* Do not access conntracks during subsys cleanup 823 * because nf_conntrack_find_get can not be used after 824 * conntrack cleanup for the net. 825 */ 826 smp_rmb(); 827 if (ipvs->enable) 828 ip_vs_conn_drop_conntrack(cp); 829 } 830 831 if (unlikely(cp->app != NULL)) 832 ip_vs_unbind_app(cp); 833 ip_vs_unbind_dest(cp); 834 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 835 atomic_dec(&ip_vs_conn_no_cport_cnt); 836 call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); 837 atomic_dec(&ipvs->conn_count); 838 return; 839 } 840 841 expire_later: 842 IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", 843 atomic_read(&cp->refcnt), 844 atomic_read(&cp->n_control)); 845 846 atomic_inc(&cp->refcnt); 847 cp->timeout = 60*HZ; 848 849 if (ipvs->sync_state & IP_VS_STATE_MASTER) 850 ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); 851 852 ip_vs_conn_put(cp); 853} 854 855/* Modify timer, so that it expires as soon as possible. 856 * Can be called without reference only if under RCU lock. 857 */ 858void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 859{ 860 /* Using mod_timer_pending will ensure the timer is not 861 * modified after the final del_timer in ip_vs_conn_expire. 862 */ 863 if (timer_pending(&cp->timer) && 864 time_after(cp->timer.expires, jiffies)) 865 mod_timer_pending(&cp->timer, jiffies); 866} 867 868 869/* 870 * Create a new connection entry and hash it into the ip_vs_conn_tab 871 */ 872struct ip_vs_conn * 873ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, 874 const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, 875 struct ip_vs_dest *dest, __u32 fwmark) 876{ 877 struct ip_vs_conn *cp; 878 struct netns_ipvs *ipvs = net_ipvs(p->net); 879 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, 880 p->protocol); 881 882 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 883 if (cp == NULL) { 884 IP_VS_ERR_RL("%s(): no memory\n", __func__); 885 return NULL; 886 } 887 888 INIT_HLIST_NODE(&cp->c_list); 889 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 890 ip_vs_conn_net_set(cp, p->net); 891 cp->af = p->af; 892 cp->daf = dest_af; 893 cp->protocol = p->protocol; 894 ip_vs_addr_set(p->af, &cp->caddr, p->caddr); 895 cp->cport = p->cport; 896 /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ 897 ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, 898 &cp->vaddr, p->vaddr); 899 cp->vport = p->vport; 900 ip_vs_addr_set(cp->daf, &cp->daddr, daddr); 901 cp->dport = dport; 902 cp->flags = flags; 903 cp->fwmark = fwmark; 904 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { 905 ip_vs_pe_get(p->pe); 906 cp->pe = p->pe; 907 cp->pe_data = p->pe_data; 908 cp->pe_data_len = p->pe_data_len; 909 } else { 910 cp->pe = NULL; 911 cp->pe_data = NULL; 912 cp->pe_data_len = 0; 913 } 914 spin_lock_init(&cp->lock); 915 916 /* 917 * Set the entry is referenced by the current thread before hashing 918 * it in the table, so that other thread run ip_vs_random_dropentry 919 * but cannot drop this entry. 920 */ 921 atomic_set(&cp->refcnt, 1); 922 923 cp->control = NULL; 924 atomic_set(&cp->n_control, 0); 925 atomic_set(&cp->in_pkts, 0); 926 927 cp->packet_xmit = NULL; 928 cp->app = NULL; 929 cp->app_data = NULL; 930 /* reset struct ip_vs_seq */ 931 cp->in_seq.delta = 0; 932 cp->out_seq.delta = 0; 933 934 atomic_inc(&ipvs->conn_count); 935 if (flags & IP_VS_CONN_F_NO_CPORT) 936 atomic_inc(&ip_vs_conn_no_cport_cnt); 937 938 /* Bind the connection with a destination server */ 939 cp->dest = NULL; 940 ip_vs_bind_dest(cp, dest); 941 942 /* Set its state and timeout */ 943 cp->state = 0; 944 cp->old_state = 0; 945 cp->timeout = 3*HZ; 946 cp->sync_endtime = jiffies & ~3UL; 947 948 /* Bind its packet transmitter */ 949#ifdef CONFIG_IP_VS_IPV6 950 if (p->af == AF_INET6) 951 ip_vs_bind_xmit_v6(cp); 952 else 953#endif 954 ip_vs_bind_xmit(cp); 955 956 if (unlikely(pd && atomic_read(&pd->appcnt))) 957 ip_vs_bind_app(cp, pd->pp); 958 959 /* 960 * Allow conntrack to be preserved. By default, conntrack 961 * is created and destroyed for every packet. 962 * Sometimes keeping conntrack can be useful for 963 * IP_VS_CONN_F_ONE_PACKET too. 964 */ 965 966 if (ip_vs_conntrack_enabled(ipvs)) 967 cp->flags |= IP_VS_CONN_F_NFCT; 968 969 /* Hash it in the ip_vs_conn_tab finally */ 970 ip_vs_conn_hash(cp); 971 972 return cp; 973} 974 975/* 976 * /proc/net/ip_vs_conn entries 977 */ 978#ifdef CONFIG_PROC_FS 979struct ip_vs_iter_state { 980 struct seq_net_private p; 981 struct hlist_head *l; 982}; 983 984static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 985{ 986 int idx; 987 struct ip_vs_conn *cp; 988 struct ip_vs_iter_state *iter = seq->private; 989 990 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 991 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 992 /* __ip_vs_conn_get() is not needed by 993 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show 994 */ 995 if (pos-- == 0) { 996 iter->l = &ip_vs_conn_tab[idx]; 997 return cp; 998 } 999 } 1000 cond_resched_rcu(); 1001 } 1002 1003 return NULL; 1004} 1005 1006static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 1007 __acquires(RCU) 1008{ 1009 struct ip_vs_iter_state *iter = seq->private; 1010 1011 iter->l = NULL; 1012 rcu_read_lock(); 1013 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 1014} 1015 1016static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1017{ 1018 struct ip_vs_conn *cp = v; 1019 struct ip_vs_iter_state *iter = seq->private; 1020 struct hlist_node *e; 1021 struct hlist_head *l = iter->l; 1022 int idx; 1023 1024 ++*pos; 1025 if (v == SEQ_START_TOKEN) 1026 return ip_vs_conn_array(seq, 0); 1027 1028 /* more on same hash chain? */ 1029 e = rcu_dereference(hlist_next_rcu(&cp->c_list)); 1030 if (e) 1031 return hlist_entry(e, struct ip_vs_conn, c_list); 1032 1033 idx = l - ip_vs_conn_tab; 1034 while (++idx < ip_vs_conn_tab_size) { 1035 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1036 iter->l = &ip_vs_conn_tab[idx]; 1037 return cp; 1038 } 1039 cond_resched_rcu(); 1040 } 1041 iter->l = NULL; 1042 return NULL; 1043} 1044 1045static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 1046 __releases(RCU) 1047{ 1048 rcu_read_unlock(); 1049} 1050 1051static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 1052{ 1053 1054 if (v == SEQ_START_TOKEN) 1055 seq_puts(seq, 1056 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 1057 else { 1058 const struct ip_vs_conn *cp = v; 1059 struct net *net = seq_file_net(seq); 1060 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 1061 size_t len = 0; 1062 char dbuf[IP_VS_ADDRSTRLEN]; 1063 1064 if (!ip_vs_conn_net_eq(cp, net)) 1065 return 0; 1066 if (cp->pe_data) { 1067 pe_data[0] = ' '; 1068 len = strlen(cp->pe->name); 1069 memcpy(pe_data + 1, cp->pe->name, len); 1070 pe_data[len + 1] = ' '; 1071 len += 2; 1072 len += cp->pe->show_pe_data(cp, pe_data + len); 1073 } 1074 pe_data[len] = '\0'; 1075 1076#ifdef CONFIG_IP_VS_IPV6 1077 if (cp->daf == AF_INET6) 1078 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1079 else 1080#endif 1081 snprintf(dbuf, sizeof(dbuf), "%08X", 1082 ntohl(cp->daddr.ip)); 1083 1084#ifdef CONFIG_IP_VS_IPV6 1085 if (cp->af == AF_INET6) 1086 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1087 "%s %04X %-11s %7lu%s\n", 1088 ip_vs_proto_name(cp->protocol), 1089 &cp->caddr.in6, ntohs(cp->cport), 1090 &cp->vaddr.in6, ntohs(cp->vport), 1091 dbuf, ntohs(cp->dport), 1092 ip_vs_state_name(cp->protocol, cp->state), 1093 (cp->timer.expires-jiffies)/HZ, pe_data); 1094 else 1095#endif 1096 seq_printf(seq, 1097 "%-3s %08X %04X %08X %04X" 1098 " %s %04X %-11s %7lu%s\n", 1099 ip_vs_proto_name(cp->protocol), 1100 ntohl(cp->caddr.ip), ntohs(cp->cport), 1101 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1102 dbuf, ntohs(cp->dport), 1103 ip_vs_state_name(cp->protocol, cp->state), 1104 (cp->timer.expires-jiffies)/HZ, pe_data); 1105 } 1106 return 0; 1107} 1108 1109static const struct seq_operations ip_vs_conn_seq_ops = { 1110 .start = ip_vs_conn_seq_start, 1111 .next = ip_vs_conn_seq_next, 1112 .stop = ip_vs_conn_seq_stop, 1113 .show = ip_vs_conn_seq_show, 1114}; 1115 1116static int ip_vs_conn_open(struct inode *inode, struct file *file) 1117{ 1118 return seq_open_net(inode, file, &ip_vs_conn_seq_ops, 1119 sizeof(struct ip_vs_iter_state)); 1120} 1121 1122static const struct file_operations ip_vs_conn_fops = { 1123 .owner = THIS_MODULE, 1124 .open = ip_vs_conn_open, 1125 .read = seq_read, 1126 .llseek = seq_lseek, 1127 .release = seq_release_net, 1128}; 1129 1130static const char *ip_vs_origin_name(unsigned int flags) 1131{ 1132 if (flags & IP_VS_CONN_F_SYNC) 1133 return "SYNC"; 1134 else 1135 return "LOCAL"; 1136} 1137 1138static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 1139{ 1140 char dbuf[IP_VS_ADDRSTRLEN]; 1141 1142 if (v == SEQ_START_TOKEN) 1143 seq_puts(seq, 1144 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1145 else { 1146 const struct ip_vs_conn *cp = v; 1147 struct net *net = seq_file_net(seq); 1148 1149 if (!ip_vs_conn_net_eq(cp, net)) 1150 return 0; 1151 1152#ifdef CONFIG_IP_VS_IPV6 1153 if (cp->daf == AF_INET6) 1154 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1155 else 1156#endif 1157 snprintf(dbuf, sizeof(dbuf), "%08X", 1158 ntohl(cp->daddr.ip)); 1159 1160#ifdef CONFIG_IP_VS_IPV6 1161 if (cp->af == AF_INET6) 1162 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1163 "%s %04X %-11s %-6s %7lu\n", 1164 ip_vs_proto_name(cp->protocol), 1165 &cp->caddr.in6, ntohs(cp->cport), 1166 &cp->vaddr.in6, ntohs(cp->vport), 1167 dbuf, ntohs(cp->dport), 1168 ip_vs_state_name(cp->protocol, cp->state), 1169 ip_vs_origin_name(cp->flags), 1170 (cp->timer.expires-jiffies)/HZ); 1171 else 1172#endif 1173 seq_printf(seq, 1174 "%-3s %08X %04X %08X %04X " 1175 "%s %04X %-11s %-6s %7lu\n", 1176 ip_vs_proto_name(cp->protocol), 1177 ntohl(cp->caddr.ip), ntohs(cp->cport), 1178 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1179 dbuf, ntohs(cp->dport), 1180 ip_vs_state_name(cp->protocol, cp->state), 1181 ip_vs_origin_name(cp->flags), 1182 (cp->timer.expires-jiffies)/HZ); 1183 } 1184 return 0; 1185} 1186 1187static const struct seq_operations ip_vs_conn_sync_seq_ops = { 1188 .start = ip_vs_conn_seq_start, 1189 .next = ip_vs_conn_seq_next, 1190 .stop = ip_vs_conn_seq_stop, 1191 .show = ip_vs_conn_sync_seq_show, 1192}; 1193 1194static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 1195{ 1196 return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, 1197 sizeof(struct ip_vs_iter_state)); 1198} 1199 1200static const struct file_operations ip_vs_conn_sync_fops = { 1201 .owner = THIS_MODULE, 1202 .open = ip_vs_conn_sync_open, 1203 .read = seq_read, 1204 .llseek = seq_lseek, 1205 .release = seq_release_net, 1206}; 1207 1208#endif 1209 1210 1211/* 1212 * Randomly drop connection entries before running out of memory 1213 */ 1214static inline int todrop_entry(struct ip_vs_conn *cp) 1215{ 1216 /* 1217 * The drop rate array needs tuning for real environments. 1218 * Called from timer bh only => no locking 1219 */ 1220 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 1221 static char todrop_counter[9] = {0}; 1222 int i; 1223 1224 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 1225 This will leave enough time for normal connection to get 1226 through. */ 1227 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 1228 return 0; 1229 1230 /* Don't drop the entry if its number of incoming packets is not 1231 located in [0, 8] */ 1232 i = atomic_read(&cp->in_pkts); 1233 if (i > 8 || i < 0) return 0; 1234 1235 if (!todrop_rate[i]) return 0; 1236 if (--todrop_counter[i] > 0) return 0; 1237 1238 todrop_counter[i] = todrop_rate[i]; 1239 return 1; 1240} 1241 1242/* Called from keventd and must protect itself from softirqs */ 1243void ip_vs_random_dropentry(struct net *net) 1244{ 1245 int idx; 1246 struct ip_vs_conn *cp, *cp_c; 1247 1248 rcu_read_lock(); 1249 /* 1250 * Randomly scan 1/32 of the whole table every second 1251 */ 1252 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1253 unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; 1254 1255 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1256 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 1257 /* connection template */ 1258 continue; 1259 if (!ip_vs_conn_net_eq(cp, net)) 1260 continue; 1261 if (cp->protocol == IPPROTO_TCP) { 1262 switch(cp->state) { 1263 case IP_VS_TCP_S_SYN_RECV: 1264 case IP_VS_TCP_S_SYNACK: 1265 break; 1266 1267 case IP_VS_TCP_S_ESTABLISHED: 1268 if (todrop_entry(cp)) 1269 break; 1270 continue; 1271 1272 default: 1273 continue; 1274 } 1275 } else if (cp->protocol == IPPROTO_SCTP) { 1276 switch (cp->state) { 1277 case IP_VS_SCTP_S_INIT1: 1278 case IP_VS_SCTP_S_INIT: 1279 break; 1280 case IP_VS_SCTP_S_ESTABLISHED: 1281 if (todrop_entry(cp)) 1282 break; 1283 continue; 1284 default: 1285 continue; 1286 } 1287 } else { 1288 if (!todrop_entry(cp)) 1289 continue; 1290 } 1291 1292 IP_VS_DBG(4, "del connection\n"); 1293 ip_vs_conn_expire_now(cp); 1294 cp_c = cp->control; 1295 /* cp->control is valid only with reference to cp */ 1296 if (cp_c && __ip_vs_conn_get(cp)) { 1297 IP_VS_DBG(4, "del conn template\n"); 1298 ip_vs_conn_expire_now(cp_c); 1299 __ip_vs_conn_put(cp); 1300 } 1301 } 1302 cond_resched_rcu(); 1303 } 1304 rcu_read_unlock(); 1305} 1306 1307 1308/* 1309 * Flush all the connection entries in the ip_vs_conn_tab 1310 */ 1311static void ip_vs_conn_flush(struct net *net) 1312{ 1313 int idx; 1314 struct ip_vs_conn *cp, *cp_c; 1315 struct netns_ipvs *ipvs = net_ipvs(net); 1316 1317flush_again: 1318 rcu_read_lock(); 1319 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1320 1321 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1322 if (!ip_vs_conn_net_eq(cp, net)) 1323 continue; 1324 IP_VS_DBG(4, "del connection\n"); 1325 ip_vs_conn_expire_now(cp); 1326 cp_c = cp->control; 1327 /* cp->control is valid only with reference to cp */ 1328 if (cp_c && __ip_vs_conn_get(cp)) { 1329 IP_VS_DBG(4, "del conn template\n"); 1330 ip_vs_conn_expire_now(cp_c); 1331 __ip_vs_conn_put(cp); 1332 } 1333 } 1334 cond_resched_rcu(); 1335 } 1336 rcu_read_unlock(); 1337 1338 /* the counter may be not NULL, because maybe some conn entries 1339 are run by slow timer handler or unhashed but still referred */ 1340 if (atomic_read(&ipvs->conn_count) != 0) { 1341 schedule(); 1342 goto flush_again; 1343 } 1344} 1345/* 1346 * per netns init and exit 1347 */ 1348int __net_init ip_vs_conn_net_init(struct net *net) 1349{ 1350 struct netns_ipvs *ipvs = net_ipvs(net); 1351 1352 atomic_set(&ipvs->conn_count, 0); 1353 1354 proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); 1355 proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); 1356 return 0; 1357} 1358 1359void __net_exit ip_vs_conn_net_cleanup(struct net *net) 1360{ 1361 /* flush all the connection entries first */ 1362 ip_vs_conn_flush(net); 1363 remove_proc_entry("ip_vs_conn", net->proc_net); 1364 remove_proc_entry("ip_vs_conn_sync", net->proc_net); 1365} 1366 1367int __init ip_vs_conn_init(void) 1368{ 1369 int idx; 1370 1371 /* Compute size and mask */ 1372 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1373 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1374 1375 /* 1376 * Allocate the connection hash table and initialize its list heads 1377 */ 1378 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); 1379 if (!ip_vs_conn_tab) 1380 return -ENOMEM; 1381 1382 /* Allocate ip_vs_conn slab cache */ 1383 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 1384 sizeof(struct ip_vs_conn), 0, 1385 SLAB_HWCACHE_ALIGN, NULL); 1386 if (!ip_vs_conn_cachep) { 1387 vfree(ip_vs_conn_tab); 1388 return -ENOMEM; 1389 } 1390 1391 pr_info("Connection hash table configured " 1392 "(size=%d, memory=%ldKbytes)\n", 1393 ip_vs_conn_tab_size, 1394 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1395 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1396 sizeof(struct ip_vs_conn)); 1397 1398 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1399 INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); 1400 1401 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1402 spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); 1403 } 1404 1405 /* calculate the random value for connection hash */ 1406 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1407 1408 return 0; 1409} 1410 1411void ip_vs_conn_cleanup(void) 1412{ 1413 /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ 1414 rcu_barrier(); 1415 /* Release the empty cache */ 1416 kmem_cache_destroy(ip_vs_conn_cachep); 1417 vfree(ip_vs_conn_tab); 1418} 1419