1/* 2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS 3 * 4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 5 * Julian Anastasov <ja@ssi.bg> 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation; either version 10 * 2 of the License, or (at your option) any later version. 11 * 12 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> 13 * 14 * Network name space (netns) aware. 15 * Global data moved to netns i.e struct netns_ipvs 16 * tcp_timeouts table has copy per netns in a hash table per 17 * protocol ip_vs_proto_data and is handled by netns 18 */ 19 20#define KMSG_COMPONENT "IPVS" 21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 22 23#include <linux/kernel.h> 24#include <linux/ip.h> 25#include <linux/tcp.h> /* for tcphdr */ 26#include <net/ip.h> 27#include <net/tcp.h> /* for csum_tcpudp_magic */ 28#include <net/ip6_checksum.h> 29#include <linux/netfilter.h> 30#include <linux/netfilter_ipv4.h> 31 32#include <net/ip_vs.h> 33 34static int 35tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 36 int *verdict, struct ip_vs_conn **cpp, 37 struct ip_vs_iphdr *iph) 38{ 39 struct net *net; 40 struct ip_vs_service *svc; 41 struct tcphdr _tcph, *th; 42 struct netns_ipvs *ipvs; 43 44 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 45 if (th == NULL) { 46 *verdict = NF_DROP; 47 return 0; 48 } 49 net = skb_net(skb); 50 ipvs = net_ipvs(net); 51 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ 52 rcu_read_lock(); 53 if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && 54 (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, 55 &iph->daddr, th->dest))) { 56 int ignored; 57 58 if (ip_vs_todrop(ipvs)) { 59 /* 60 * It seems that we are very loaded. 61 * We have to drop this packet :( 62 */ 63 rcu_read_unlock(); 64 *verdict = NF_DROP; 65 return 0; 66 } 67 68 /* 69 * Let the virtual server select a real server for the 70 * incoming connection, and create a connection entry. 71 */ 72 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); 73 if (!*cpp && ignored <= 0) { 74 if (!ignored) 75 *verdict = ip_vs_leave(svc, skb, pd, iph); 76 else 77 *verdict = NF_DROP; 78 rcu_read_unlock(); 79 return 0; 80 } 81 } 82 rcu_read_unlock(); 83 /* NF_ACCEPT */ 84 return 1; 85} 86 87 88static inline void 89tcp_fast_csum_update(int af, struct tcphdr *tcph, 90 const union nf_inet_addr *oldip, 91 const union nf_inet_addr *newip, 92 __be16 oldport, __be16 newport) 93{ 94#ifdef CONFIG_IP_VS_IPV6 95 if (af == AF_INET6) 96 tcph->check = 97 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 98 ip_vs_check_diff2(oldport, newport, 99 ~csum_unfold(tcph->check)))); 100 else 101#endif 102 tcph->check = 103 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 104 ip_vs_check_diff2(oldport, newport, 105 ~csum_unfold(tcph->check)))); 106} 107 108 109static inline void 110tcp_partial_csum_update(int af, struct tcphdr *tcph, 111 const union nf_inet_addr *oldip, 112 const union nf_inet_addr *newip, 113 __be16 oldlen, __be16 newlen) 114{ 115#ifdef CONFIG_IP_VS_IPV6 116 if (af == AF_INET6) 117 tcph->check = 118 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 119 ip_vs_check_diff2(oldlen, newlen, 120 csum_unfold(tcph->check)))); 121 else 122#endif 123 tcph->check = 124 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 125 ip_vs_check_diff2(oldlen, newlen, 126 csum_unfold(tcph->check)))); 127} 128 129 130static int 131tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 132 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) 133{ 134 struct tcphdr *tcph; 135 unsigned int tcphoff = iph->len; 136 int oldlen; 137 int payload_csum = 0; 138 139#ifdef CONFIG_IP_VS_IPV6 140 if (cp->af == AF_INET6 && iph->fragoffs) 141 return 1; 142#endif 143 oldlen = skb->len - tcphoff; 144 145 /* csum_check requires unshared skb */ 146 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) 147 return 0; 148 149 if (unlikely(cp->app != NULL)) { 150 int ret; 151 152 /* Some checks before mangling */ 153 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 154 return 0; 155 156 /* Call application helper if needed */ 157 if (!(ret = ip_vs_app_pkt_out(cp, skb))) 158 return 0; 159 /* ret=2: csum update is needed after payload mangling */ 160 if (ret == 1) 161 oldlen = skb->len - tcphoff; 162 else 163 payload_csum = 1; 164 } 165 166 tcph = (void *)skb_network_header(skb) + tcphoff; 167 tcph->source = cp->vport; 168 169 /* Adjust TCP checksums */ 170 if (skb->ip_summed == CHECKSUM_PARTIAL) { 171 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 172 htons(oldlen), 173 htons(skb->len - tcphoff)); 174 } else if (!payload_csum) { 175 /* Only port and addr are changed, do fast csum update */ 176 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 177 cp->dport, cp->vport); 178 if (skb->ip_summed == CHECKSUM_COMPLETE) 179 skb->ip_summed = (cp->app && pp->csum_check) ? 180 CHECKSUM_UNNECESSARY : CHECKSUM_NONE; 181 } else { 182 /* full checksum calculation */ 183 tcph->check = 0; 184 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 185#ifdef CONFIG_IP_VS_IPV6 186 if (cp->af == AF_INET6) 187 tcph->check = csum_ipv6_magic(&cp->vaddr.in6, 188 &cp->caddr.in6, 189 skb->len - tcphoff, 190 cp->protocol, skb->csum); 191 else 192#endif 193 tcph->check = csum_tcpudp_magic(cp->vaddr.ip, 194 cp->caddr.ip, 195 skb->len - tcphoff, 196 cp->protocol, 197 skb->csum); 198 skb->ip_summed = CHECKSUM_UNNECESSARY; 199 200 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 201 pp->name, tcph->check, 202 (char*)&(tcph->check) - (char*)tcph); 203 } 204 return 1; 205} 206 207 208static int 209tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 210 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) 211{ 212 struct tcphdr *tcph; 213 unsigned int tcphoff = iph->len; 214 int oldlen; 215 int payload_csum = 0; 216 217#ifdef CONFIG_IP_VS_IPV6 218 if (cp->af == AF_INET6 && iph->fragoffs) 219 return 1; 220#endif 221 oldlen = skb->len - tcphoff; 222 223 /* csum_check requires unshared skb */ 224 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) 225 return 0; 226 227 if (unlikely(cp->app != NULL)) { 228 int ret; 229 230 /* Some checks before mangling */ 231 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 232 return 0; 233 234 /* 235 * Attempt ip_vs_app call. 236 * It will fix ip_vs_conn and iph ack_seq stuff 237 */ 238 if (!(ret = ip_vs_app_pkt_in(cp, skb))) 239 return 0; 240 /* ret=2: csum update is needed after payload mangling */ 241 if (ret == 1) 242 oldlen = skb->len - tcphoff; 243 else 244 payload_csum = 1; 245 } 246 247 tcph = (void *)skb_network_header(skb) + tcphoff; 248 tcph->dest = cp->dport; 249 250 /* 251 * Adjust TCP checksums 252 */ 253 if (skb->ip_summed == CHECKSUM_PARTIAL) { 254 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, 255 htons(oldlen), 256 htons(skb->len - tcphoff)); 257 } else if (!payload_csum) { 258 /* Only port and addr are changed, do fast csum update */ 259 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, 260 cp->vport, cp->dport); 261 if (skb->ip_summed == CHECKSUM_COMPLETE) 262 skb->ip_summed = (cp->app && pp->csum_check) ? 263 CHECKSUM_UNNECESSARY : CHECKSUM_NONE; 264 } else { 265 /* full checksum calculation */ 266 tcph->check = 0; 267 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 268#ifdef CONFIG_IP_VS_IPV6 269 if (cp->af == AF_INET6) 270 tcph->check = csum_ipv6_magic(&cp->caddr.in6, 271 &cp->daddr.in6, 272 skb->len - tcphoff, 273 cp->protocol, skb->csum); 274 else 275#endif 276 tcph->check = csum_tcpudp_magic(cp->caddr.ip, 277 cp->daddr.ip, 278 skb->len - tcphoff, 279 cp->protocol, 280 skb->csum); 281 skb->ip_summed = CHECKSUM_UNNECESSARY; 282 } 283 return 1; 284} 285 286 287static int 288tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) 289{ 290 unsigned int tcphoff; 291 292#ifdef CONFIG_IP_VS_IPV6 293 if (af == AF_INET6) 294 tcphoff = sizeof(struct ipv6hdr); 295 else 296#endif 297 tcphoff = ip_hdrlen(skb); 298 299 switch (skb->ip_summed) { 300 case CHECKSUM_NONE: 301 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 302 case CHECKSUM_COMPLETE: 303#ifdef CONFIG_IP_VS_IPV6 304 if (af == AF_INET6) { 305 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 306 &ipv6_hdr(skb)->daddr, 307 skb->len - tcphoff, 308 ipv6_hdr(skb)->nexthdr, 309 skb->csum)) { 310 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, 311 "Failed checksum for"); 312 return 0; 313 } 314 } else 315#endif 316 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, 317 ip_hdr(skb)->daddr, 318 skb->len - tcphoff, 319 ip_hdr(skb)->protocol, 320 skb->csum)) { 321 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, 322 "Failed checksum for"); 323 return 0; 324 } 325 break; 326 default: 327 /* No need to checksum. */ 328 break; 329 } 330 331 return 1; 332} 333 334 335#define TCP_DIR_INPUT 0 336#define TCP_DIR_OUTPUT 4 337#define TCP_DIR_INPUT_ONLY 8 338 339static const int tcp_state_off[IP_VS_DIR_LAST] = { 340 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, 341 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, 342 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, 343}; 344 345/* 346 * Timeout table[state] 347 */ 348static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { 349 [IP_VS_TCP_S_NONE] = 2*HZ, 350 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, 351 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, 352 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, 353 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, 354 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, 355 [IP_VS_TCP_S_CLOSE] = 10*HZ, 356 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, 357 [IP_VS_TCP_S_LAST_ACK] = 30*HZ, 358 [IP_VS_TCP_S_LISTEN] = 2*60*HZ, 359 [IP_VS_TCP_S_SYNACK] = 120*HZ, 360 [IP_VS_TCP_S_LAST] = 2*HZ, 361}; 362 363static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { 364 [IP_VS_TCP_S_NONE] = "NONE", 365 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", 366 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", 367 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", 368 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", 369 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", 370 [IP_VS_TCP_S_CLOSE] = "CLOSE", 371 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", 372 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", 373 [IP_VS_TCP_S_LISTEN] = "LISTEN", 374 [IP_VS_TCP_S_SYNACK] = "SYNACK", 375 [IP_VS_TCP_S_LAST] = "BUG!", 376}; 377 378#define sNO IP_VS_TCP_S_NONE 379#define sES IP_VS_TCP_S_ESTABLISHED 380#define sSS IP_VS_TCP_S_SYN_SENT 381#define sSR IP_VS_TCP_S_SYN_RECV 382#define sFW IP_VS_TCP_S_FIN_WAIT 383#define sTW IP_VS_TCP_S_TIME_WAIT 384#define sCL IP_VS_TCP_S_CLOSE 385#define sCW IP_VS_TCP_S_CLOSE_WAIT 386#define sLA IP_VS_TCP_S_LAST_ACK 387#define sLI IP_VS_TCP_S_LISTEN 388#define sSA IP_VS_TCP_S_SYNACK 389 390struct tcp_states_t { 391 int next_state[IP_VS_TCP_S_LAST]; 392}; 393 394static const char * tcp_state_name(int state) 395{ 396 if (state >= IP_VS_TCP_S_LAST) 397 return "ERR!"; 398 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; 399} 400 401static struct tcp_states_t tcp_states [] = { 402/* INPUT */ 403/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 404/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, 405/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, 406/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, 407/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, 408 409/* OUTPUT */ 410/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 411/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, 412/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, 413/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, 414/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, 415 416/* INPUT-ONLY */ 417/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 418/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, 419/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, 420/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, 421/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, 422}; 423 424static struct tcp_states_t tcp_states_dos [] = { 425/* INPUT */ 426/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 427/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, 428/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, 429/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, 430/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, 431 432/* OUTPUT */ 433/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 434/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, 435/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, 436/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, 437/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, 438 439/* INPUT-ONLY */ 440/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 441/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, 442/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, 443/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, 444/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, 445}; 446 447static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) 448{ 449 int on = (flags & 1); /* secure_tcp */ 450 451 /* 452 ** FIXME: change secure_tcp to independent sysctl var 453 ** or make it per-service or per-app because it is valid 454 ** for most if not for all of the applications. Something 455 ** like "capabilities" (flags) for each object. 456 */ 457 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); 458} 459 460static inline int tcp_state_idx(struct tcphdr *th) 461{ 462 if (th->rst) 463 return 3; 464 if (th->syn) 465 return 0; 466 if (th->fin) 467 return 1; 468 if (th->ack) 469 return 2; 470 return -1; 471} 472 473static inline void 474set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, 475 int direction, struct tcphdr *th) 476{ 477 int state_idx; 478 int new_state = IP_VS_TCP_S_CLOSE; 479 int state_off = tcp_state_off[direction]; 480 481 /* 482 * Update state offset to INPUT_ONLY if necessary 483 * or delete NO_OUTPUT flag if output packet detected 484 */ 485 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { 486 if (state_off == TCP_DIR_OUTPUT) 487 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; 488 else 489 state_off = TCP_DIR_INPUT_ONLY; 490 } 491 492 if ((state_idx = tcp_state_idx(th)) < 0) { 493 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); 494 goto tcp_state_out; 495 } 496 497 new_state = 498 pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; 499 500 tcp_state_out: 501 if (new_state != cp->state) { 502 struct ip_vs_dest *dest = cp->dest; 503 504 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" 505 "%s:%d state: %s->%s conn->refcnt:%d\n", 506 pd->pp->name, 507 ((state_off == TCP_DIR_OUTPUT) ? 508 "output " : "input "), 509 th->syn ? 'S' : '.', 510 th->fin ? 'F' : '.', 511 th->ack ? 'A' : '.', 512 th->rst ? 'R' : '.', 513 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), 514 ntohs(cp->dport), 515 IP_VS_DBG_ADDR(cp->af, &cp->caddr), 516 ntohs(cp->cport), 517 tcp_state_name(cp->state), 518 tcp_state_name(new_state), 519 atomic_read(&cp->refcnt)); 520 521 if (dest) { 522 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 523 (new_state != IP_VS_TCP_S_ESTABLISHED)) { 524 atomic_dec(&dest->activeconns); 525 atomic_inc(&dest->inactconns); 526 cp->flags |= IP_VS_CONN_F_INACTIVE; 527 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && 528 (new_state == IP_VS_TCP_S_ESTABLISHED)) { 529 atomic_inc(&dest->activeconns); 530 atomic_dec(&dest->inactconns); 531 cp->flags &= ~IP_VS_CONN_F_INACTIVE; 532 } 533 } 534 } 535 536 if (likely(pd)) 537 cp->timeout = pd->timeout_table[cp->state = new_state]; 538 else /* What to do ? */ 539 cp->timeout = tcp_timeouts[cp->state = new_state]; 540} 541 542/* 543 * Handle state transitions 544 */ 545static void 546tcp_state_transition(struct ip_vs_conn *cp, int direction, 547 const struct sk_buff *skb, 548 struct ip_vs_proto_data *pd) 549{ 550 struct tcphdr _tcph, *th; 551 552#ifdef CONFIG_IP_VS_IPV6 553 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); 554#else 555 int ihl = ip_hdrlen(skb); 556#endif 557 558 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); 559 if (th == NULL) 560 return; 561 562 spin_lock_bh(&cp->lock); 563 set_tcp_state(pd, cp, direction, th); 564 spin_unlock_bh(&cp->lock); 565} 566 567static inline __u16 tcp_app_hashkey(__be16 port) 568{ 569 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) 570 & TCP_APP_TAB_MASK; 571} 572 573 574static int tcp_register_app(struct net *net, struct ip_vs_app *inc) 575{ 576 struct ip_vs_app *i; 577 __u16 hash; 578 __be16 port = inc->port; 579 int ret = 0; 580 struct netns_ipvs *ipvs = net_ipvs(net); 581 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); 582 583 hash = tcp_app_hashkey(port); 584 585 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { 586 if (i->port == port) { 587 ret = -EEXIST; 588 goto out; 589 } 590 } 591 list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); 592 atomic_inc(&pd->appcnt); 593 594 out: 595 return ret; 596} 597 598 599static void 600tcp_unregister_app(struct net *net, struct ip_vs_app *inc) 601{ 602 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); 603 604 atomic_dec(&pd->appcnt); 605 list_del_rcu(&inc->p_list); 606} 607 608 609static int 610tcp_app_conn_bind(struct ip_vs_conn *cp) 611{ 612 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); 613 int hash; 614 struct ip_vs_app *inc; 615 int result = 0; 616 617 /* Default binding: bind app only for NAT */ 618 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 619 return 0; 620 621 /* Lookup application incarnations and bind the right one */ 622 hash = tcp_app_hashkey(cp->vport); 623 624 rcu_read_lock(); 625 list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { 626 if (inc->port == cp->vport) { 627 if (unlikely(!ip_vs_app_inc_get(inc))) 628 break; 629 rcu_read_unlock(); 630 631 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" 632 "%s:%u to app %s on port %u\n", 633 __func__, 634 IP_VS_DBG_ADDR(cp->af, &cp->caddr), 635 ntohs(cp->cport), 636 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), 637 ntohs(cp->vport), 638 inc->name, ntohs(inc->port)); 639 640 cp->app = inc; 641 if (inc->init_conn) 642 result = inc->init_conn(inc, cp); 643 goto out; 644 } 645 } 646 rcu_read_unlock(); 647 648 out: 649 return result; 650} 651 652 653/* 654 * Set LISTEN timeout. (ip_vs_conn_put will setup timer) 655 */ 656void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) 657{ 658 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); 659 660 spin_lock_bh(&cp->lock); 661 cp->state = IP_VS_TCP_S_LISTEN; 662 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] 663 : tcp_timeouts[IP_VS_TCP_S_LISTEN]); 664 spin_unlock_bh(&cp->lock); 665} 666 667/* --------------------------------------------- 668 * timeouts is netns related now. 669 * --------------------------------------------- 670 */ 671static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) 672{ 673 struct netns_ipvs *ipvs = net_ipvs(net); 674 675 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); 676 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, 677 sizeof(tcp_timeouts)); 678 if (!pd->timeout_table) 679 return -ENOMEM; 680 pd->tcp_state_table = tcp_states; 681 return 0; 682} 683 684static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) 685{ 686 kfree(pd->timeout_table); 687} 688 689 690struct ip_vs_protocol ip_vs_protocol_tcp = { 691 .name = "TCP", 692 .protocol = IPPROTO_TCP, 693 .num_states = IP_VS_TCP_S_LAST, 694 .dont_defrag = 0, 695 .init = NULL, 696 .exit = NULL, 697 .init_netns = __ip_vs_tcp_init, 698 .exit_netns = __ip_vs_tcp_exit, 699 .register_app = tcp_register_app, 700 .unregister_app = tcp_unregister_app, 701 .conn_schedule = tcp_conn_schedule, 702 .conn_in_get = ip_vs_conn_in_get_proto, 703 .conn_out_get = ip_vs_conn_out_get_proto, 704 .snat_handler = tcp_snat_handler, 705 .dnat_handler = tcp_dnat_handler, 706 .csum_check = tcp_csum_check, 707 .state_name = tcp_state_name, 708 .state_transition = tcp_state_transition, 709 .app_conn_bind = tcp_app_conn_bind, 710 .debug_packet = ip_vs_tcpudp_debug_packet, 711 .timeout_change = tcp_timeout_change, 712}; 713