1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. 20 * 21 * Changes: 22 * Paul `Rusty' Russell properly handle non-linear skbs 23 * Harald Welte don't use nfcache 24 * 25 */ 26 27#define KMSG_COMPONENT "IPVS" 28#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 29 30#include <linux/module.h> 31#include <linux/kernel.h> 32#include <linux/ip.h> 33#include <linux/tcp.h> 34#include <linux/sctp.h> 35#include <linux/icmp.h> 36#include <linux/slab.h> 37 38#include <net/ip.h> 39#include <net/tcp.h> 40#include <net/udp.h> 41#include <net/icmp.h> /* for icmp_send */ 42#include <net/route.h> 43#include <net/ip6_checksum.h> 44#include <net/netns/generic.h> /* net_generic() */ 45 46#include <linux/netfilter.h> 47#include <linux/netfilter_ipv4.h> 48 49#ifdef CONFIG_IP_VS_IPV6 50#include <net/ipv6.h> 51#include <linux/netfilter_ipv6.h> 52#include <net/ip6_route.h> 53#endif 54 55#include <net/ip_vs.h> 56 57 58EXPORT_SYMBOL(register_ip_vs_scheduler); 59EXPORT_SYMBOL(unregister_ip_vs_scheduler); 60EXPORT_SYMBOL(ip_vs_proto_name); 61EXPORT_SYMBOL(ip_vs_conn_new); 62EXPORT_SYMBOL(ip_vs_conn_in_get); 63EXPORT_SYMBOL(ip_vs_conn_out_get); 64#ifdef CONFIG_IP_VS_PROTO_TCP 65EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 
66#endif 67EXPORT_SYMBOL(ip_vs_conn_put); 68#ifdef CONFIG_IP_VS_DEBUG 69EXPORT_SYMBOL(ip_vs_get_debug_level); 70#endif 71 72static int ip_vs_net_id __read_mostly; 73/* netns cnt used for uniqueness */ 74static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 75 76/* ID used in ICMP lookups */ 77#define icmp_id(icmph) (((icmph)->un).echo.id) 78#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 79 80const char *ip_vs_proto_name(unsigned int proto) 81{ 82 static char buf[20]; 83 84 switch (proto) { 85 case IPPROTO_IP: 86 return "IP"; 87 case IPPROTO_UDP: 88 return "UDP"; 89 case IPPROTO_TCP: 90 return "TCP"; 91 case IPPROTO_SCTP: 92 return "SCTP"; 93 case IPPROTO_ICMP: 94 return "ICMP"; 95#ifdef CONFIG_IP_VS_IPV6 96 case IPPROTO_ICMPV6: 97 return "ICMPv6"; 98#endif 99 default: 100 sprintf(buf, "IP_%u", proto); 101 return buf; 102 } 103} 104 105void ip_vs_init_hash_table(struct list_head *table, int rows) 106{ 107 while (--rows >= 0) 108 INIT_LIST_HEAD(&table[rows]); 109} 110 111static inline void 112ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 113{ 114 struct ip_vs_dest *dest = cp->dest; 115 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 116 117 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 118 struct ip_vs_cpu_stats *s; 119 struct ip_vs_service *svc; 120 121 s = this_cpu_ptr(dest->stats.cpustats); 122 s->ustats.inpkts++; 123 u64_stats_update_begin(&s->syncp); 124 s->ustats.inbytes += skb->len; 125 u64_stats_update_end(&s->syncp); 126 127 rcu_read_lock(); 128 svc = rcu_dereference(dest->svc); 129 s = this_cpu_ptr(svc->stats.cpustats); 130 s->ustats.inpkts++; 131 u64_stats_update_begin(&s->syncp); 132 s->ustats.inbytes += skb->len; 133 u64_stats_update_end(&s->syncp); 134 rcu_read_unlock(); 135 136 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 137 s->ustats.inpkts++; 138 u64_stats_update_begin(&s->syncp); 139 s->ustats.inbytes += skb->len; 140 u64_stats_update_end(&s->syncp); 141 } 142} 143 144 145static inline void 
146ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 147{ 148 struct ip_vs_dest *dest = cp->dest; 149 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 150 151 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 152 struct ip_vs_cpu_stats *s; 153 struct ip_vs_service *svc; 154 155 s = this_cpu_ptr(dest->stats.cpustats); 156 s->ustats.outpkts++; 157 u64_stats_update_begin(&s->syncp); 158 s->ustats.outbytes += skb->len; 159 u64_stats_update_end(&s->syncp); 160 161 rcu_read_lock(); 162 svc = rcu_dereference(dest->svc); 163 s = this_cpu_ptr(svc->stats.cpustats); 164 s->ustats.outpkts++; 165 u64_stats_update_begin(&s->syncp); 166 s->ustats.outbytes += skb->len; 167 u64_stats_update_end(&s->syncp); 168 rcu_read_unlock(); 169 170 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 171 s->ustats.outpkts++; 172 u64_stats_update_begin(&s->syncp); 173 s->ustats.outbytes += skb->len; 174 u64_stats_update_end(&s->syncp); 175 } 176} 177 178 179static inline void 180ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 181{ 182 struct netns_ipvs *ipvs = net_ipvs(svc->net); 183 struct ip_vs_cpu_stats *s; 184 185 s = this_cpu_ptr(cp->dest->stats.cpustats); 186 s->ustats.conns++; 187 188 s = this_cpu_ptr(svc->stats.cpustats); 189 s->ustats.conns++; 190 191 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 192 s->ustats.conns++; 193} 194 195 196static inline void 197ip_vs_set_state(struct ip_vs_conn *cp, int direction, 198 const struct sk_buff *skb, 199 struct ip_vs_proto_data *pd) 200{ 201 if (likely(pd->pp->state_transition)) 202 pd->pp->state_transition(cp, direction, skb, pd); 203} 204 205static inline int 206ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 207 struct sk_buff *skb, int protocol, 208 const union nf_inet_addr *caddr, __be16 cport, 209 const union nf_inet_addr *vaddr, __be16 vport, 210 struct ip_vs_conn_param *p) 211{ 212 ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, 213 vport, p); 214 p->pe = 
rcu_dereference(svc->pe); 215 if (p->pe && p->pe->fill_param) 216 return p->pe->fill_param(p, skb); 217 218 return 0; 219} 220 221/* 222 * IPVS persistent scheduling function 223 * It creates a connection entry according to its template if exists, 224 * or selects a server and creates a connection entry plus a template. 225 * Locking: we are svc user (svc->refcnt), so we hold all dests too 226 * Protocols supported: TCP, UDP 227 */ 228static struct ip_vs_conn * 229ip_vs_sched_persist(struct ip_vs_service *svc, 230 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 231 int *ignored, struct ip_vs_iphdr *iph) 232{ 233 struct ip_vs_conn *cp = NULL; 234 struct ip_vs_dest *dest; 235 struct ip_vs_conn *ct; 236 __be16 dport = 0; /* destination port to forward */ 237 unsigned int flags; 238 struct ip_vs_conn_param param; 239 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 240 union nf_inet_addr snet; /* source network of the client, 241 after masking */ 242 243 /* Mask saddr with the netmask to adjust template granularity */ 244#ifdef CONFIG_IP_VS_IPV6 245 if (svc->af == AF_INET6) 246 ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, 247 (__force __u32) svc->netmask); 248 else 249#endif 250 snet.ip = iph->saddr.ip & svc->netmask; 251 252 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 253 "mnet %s\n", 254 IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), 255 IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), 256 IP_VS_DBG_ADDR(svc->af, &snet)); 257 258 /* 259 * As far as we know, FTP is a very complicated network protocol, and 260 * it uses control connection and data connections. For active FTP, 261 * FTP server initialize data connection to the client, its source port 262 * is often 20. For passive FTP, FTP server tells the clients the port 263 * that it passively listens to, and the client issues the data 264 * connection. 
In the tunneling or direct routing mode, the load 265 * balancer is on the client-to-server half of connection, the port 266 * number is unknown to the load balancer. So, a conn template like 267 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 268 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 269 * is created for other persistent services. 270 */ 271 { 272 int protocol = iph->protocol; 273 const union nf_inet_addr *vaddr = &iph->daddr; 274 __be16 vport = 0; 275 276 if (dst_port == svc->port) { 277 /* non-FTP template: 278 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 279 * FTP template: 280 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 281 */ 282 if (svc->port != FTPPORT) 283 vport = dst_port; 284 } else { 285 /* Note: persistent fwmark-based services and 286 * persistent port zero service are handled here. 287 * fwmark template: 288 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 289 * port zero template: 290 * <protocol,caddr,0,vaddr,0,daddr,0> 291 */ 292 if (svc->fwmark) { 293 protocol = IPPROTO_IP; 294 vaddr = &fwmark; 295 } 296 } 297 /* return *ignored = -1 so NF_DROP can be used */ 298 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 299 vaddr, vport, ¶m) < 0) { 300 *ignored = -1; 301 return NULL; 302 } 303 } 304 305 /* Check if a template already exists */ 306 ct = ip_vs_ct_in_get(¶m); 307 if (!ct || !ip_vs_check_template(ct)) { 308 struct ip_vs_scheduler *sched; 309 310 /* 311 * No template found or the dest of the connection 312 * template is not available. 313 * return *ignored=0 i.e. 
ICMP and NF_DROP 314 */ 315 sched = rcu_dereference(svc->scheduler); 316 dest = sched->schedule(svc, skb, iph); 317 if (!dest) { 318 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 319 kfree(param.pe_data); 320 *ignored = 0; 321 return NULL; 322 } 323 324 if (dst_port == svc->port && svc->port != FTPPORT) 325 dport = dest->port; 326 327 /* Create a template 328 * This adds param.pe_data to the template, 329 * and thus param.pe_data will be destroyed 330 * when the template expires */ 331 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 332 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 333 if (ct == NULL) { 334 kfree(param.pe_data); 335 *ignored = -1; 336 return NULL; 337 } 338 339 ct->timeout = svc->timeout; 340 } else { 341 /* set destination with the found template */ 342 dest = ct->dest; 343 kfree(param.pe_data); 344 } 345 346 dport = dst_port; 347 if (dport == svc->port && dest->port) 348 dport = dest->port; 349 350 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 351 && iph->protocol == IPPROTO_UDP) ? 352 IP_VS_CONN_F_ONE_PACKET : 0; 353 354 /* 355 * Create a new connection according to the template 356 */ 357 ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, 358 src_port, &iph->daddr, dst_port, ¶m); 359 360 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 361 skb->mark); 362 if (cp == NULL) { 363 ip_vs_conn_put(ct); 364 *ignored = -1; 365 return NULL; 366 } 367 368 /* 369 * Add its control 370 */ 371 ip_vs_control_add(cp, ct); 372 ip_vs_conn_put(ct); 373 374 ip_vs_conn_stats(cp, svc); 375 return cp; 376} 377 378 379/* 380 * IPVS main scheduling function 381 * It selects a server according to the virtual service, and 382 * creates a connection entry. 383 * Protocols supported: TCP, UDP 384 * 385 * Usage of *ignored 386 * 387 * 1 : protocol tried to schedule (eg. on SYN), found svc but the 388 * svc/scheduler decides that this packet should be accepted with 389 * NF_ACCEPT because it must not be scheduled. 
390 * 391 * 0 : scheduler can not find destination, so try bypass or 392 * return ICMP and then NF_DROP (ip_vs_leave). 393 * 394 * -1 : scheduler tried to schedule but fatal error occurred, eg. 395 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param 396 * failure such as missing Call-ID, ENOMEM on skb_linearize 397 * or pe_data. In this case we should return NF_DROP without 398 * any attempts to send ICMP with ip_vs_leave. 399 */ 400struct ip_vs_conn * 401ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, 402 struct ip_vs_proto_data *pd, int *ignored, 403 struct ip_vs_iphdr *iph) 404{ 405 struct ip_vs_protocol *pp = pd->pp; 406 struct ip_vs_conn *cp = NULL; 407 struct ip_vs_scheduler *sched; 408 struct ip_vs_dest *dest; 409 __be16 _ports[2], *pptr; 410 unsigned int flags; 411 412 *ignored = 1; 413 /* 414 * IPv6 frags, only the first hit here. 415 */ 416 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 417 if (pptr == NULL) 418 return NULL; 419 420 /* 421 * FTPDATA needs this check when using local real server. 422 * Never schedule Active FTPDATA connections from real server. 423 * For LVS-NAT they must be already created. For other methods 424 * with persistence the connection is created on SYN+ACK. 425 */ 426 if (pptr[0] == FTPDATA) { 427 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, 428 "Not scheduling FTPDATA"); 429 return NULL; 430 } 431 432 /* 433 * Do not schedule replies from local real server. 
434 */ 435 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && 436 (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { 437 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, 438 "Not scheduling reply for existing connection"); 439 __ip_vs_conn_put(cp); 440 return NULL; 441 } 442 443 /* 444 * Persistent service 445 */ 446 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 447 return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, 448 iph); 449 450 *ignored = 0; 451 452 /* 453 * Non-persistent service 454 */ 455 if (!svc->fwmark && pptr[1] != svc->port) { 456 if (!svc->port) 457 pr_err("Schedule: port zero only supported " 458 "in persistent services, " 459 "check your ipvs configuration\n"); 460 return NULL; 461 } 462 463 sched = rcu_dereference(svc->scheduler); 464 dest = sched->schedule(svc, skb, iph); 465 if (dest == NULL) { 466 IP_VS_DBG(1, "Schedule: no dest found.\n"); 467 return NULL; 468 } 469 470 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 471 && iph->protocol == IPPROTO_UDP) ? 472 IP_VS_CONN_F_ONE_PACKET : 0; 473 474 /* 475 * Create a connection entry. 476 */ 477 { 478 struct ip_vs_conn_param p; 479 480 ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, 481 &iph->saddr, pptr[0], &iph->daddr, 482 pptr[1], &p); 483 cp = ip_vs_conn_new(&p, dest->af, &dest->addr, 484 dest->port ? dest->port : pptr[1], 485 flags, dest, skb->mark); 486 if (!cp) { 487 *ignored = -1; 488 return NULL; 489 } 490 } 491 492 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 493 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 494 ip_vs_fwd_tag(cp), 495 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 496 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 497 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 498 cp->flags, atomic_read(&cp->refcnt)); 499 500 ip_vs_conn_stats(cp, svc); 501 return cp; 502} 503 504 505/* 506 * Pass or drop the packet. 507 * Called by ip_vs_in, when the virtual service is available but 508 * no destination is available for a new connection. 
509 */ 510int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 511 struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) 512{ 513 __be16 _ports[2], *pptr; 514#ifdef CONFIG_SYSCTL 515 struct net *net; 516 struct netns_ipvs *ipvs; 517 int unicast; 518#endif 519 520 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 521 if (pptr == NULL) { 522 return NF_DROP; 523 } 524 525#ifdef CONFIG_SYSCTL 526 net = skb_net(skb); 527 528#ifdef CONFIG_IP_VS_IPV6 529 if (svc->af == AF_INET6) 530 unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; 531 else 532#endif 533 unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); 534 535 /* if it is fwmark-based service, the cache_bypass sysctl is up 536 and the destination is a non-local unicast, then create 537 a cache_bypass connection entry */ 538 ipvs = net_ipvs(net); 539 if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { 540 int ret; 541 struct ip_vs_conn *cp; 542 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 543 iph->protocol == IPPROTO_UDP) ? 
544 IP_VS_CONN_F_ONE_PACKET : 0; 545 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 546 547 /* create a new connection entry */ 548 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 549 { 550 struct ip_vs_conn_param p; 551 ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, 552 &iph->saddr, pptr[0], 553 &iph->daddr, pptr[1], &p); 554 cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, 555 IP_VS_CONN_F_BYPASS | flags, 556 NULL, skb->mark); 557 if (!cp) 558 return NF_DROP; 559 } 560 561 /* statistics */ 562 ip_vs_in_stats(cp, skb); 563 564 /* set state */ 565 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 566 567 /* transmit the first SYN packet */ 568 ret = cp->packet_xmit(skb, cp, pd->pp, iph); 569 /* do not touch skb anymore */ 570 571 atomic_inc(&cp->in_pkts); 572 ip_vs_conn_put(cp); 573 return ret; 574 } 575#endif 576 577 /* 578 * When the virtual ftp service is presented, packets destined 579 * for other services on the VIP may get here (except services 580 * listed in the ipvs table), pass the packets, because it is 581 * not ipvs job to decide to drop the packets. 582 */ 583 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) 584 return NF_ACCEPT; 585 586 /* 587 * Notify the client that the destination is unreachable, and 588 * release the socket buffer. 589 * Since it is in IP layer, the TCP socket is not actually 590 * created, the TCP RST packet cannot be sent, instead that 591 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. 
--WZ 592 */ 593#ifdef CONFIG_IP_VS_IPV6 594 if (svc->af == AF_INET6) { 595 if (!skb->dev) { 596 struct net *net_ = dev_net(skb_dst(skb)->dev); 597 598 skb->dev = net_->loopback_dev; 599 } 600 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 601 } else 602#endif 603 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 604 605 return NF_DROP; 606} 607 608#ifdef CONFIG_SYSCTL 609 610static int sysctl_snat_reroute(struct sk_buff *skb) 611{ 612 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 613 return ipvs->sysctl_snat_reroute; 614} 615 616static int sysctl_nat_icmp_send(struct net *net) 617{ 618 struct netns_ipvs *ipvs = net_ipvs(net); 619 return ipvs->sysctl_nat_icmp_send; 620} 621 622static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) 623{ 624 return ipvs->sysctl_expire_nodest_conn; 625} 626 627#else 628 629static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } 630static int sysctl_nat_icmp_send(struct net *net) { return 0; } 631static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } 632 633#endif 634 635__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 636{ 637 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 638} 639 640static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) 641{ 642 if (NF_INET_LOCAL_IN == hooknum) 643 return IP_DEFRAG_VS_IN; 644 if (NF_INET_FORWARD == hooknum) 645 return IP_DEFRAG_VS_FWD; 646 return IP_DEFRAG_VS_OUT; 647} 648 649static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) 650{ 651 int err; 652 653 local_bh_disable(); 654 err = ip_defrag(skb, user); 655 local_bh_enable(); 656 if (!err) 657 ip_send_check(ip_hdr(skb)); 658 659 return err; 660} 661 662static int ip_vs_route_me_harder(int af, struct sk_buff *skb) 663{ 664#ifdef CONFIG_IP_VS_IPV6 665 if (af == AF_INET6) { 666 if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) 667 return 1; 668 } else 669#endif 670 if ((sysctl_snat_reroute(skb) || 
671 skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 672 ip_route_me_harder(skb, RTN_LOCAL) != 0) 673 return 1; 674 675 return 0; 676} 677 678/* 679 * Packet has been made sufficiently writable in caller 680 * - inout: 1=in->out, 0=out->in 681 */ 682void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 683 struct ip_vs_conn *cp, int inout) 684{ 685 struct iphdr *iph = ip_hdr(skb); 686 unsigned int icmp_offset = iph->ihl*4; 687 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + 688 icmp_offset); 689 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 690 691 if (inout) { 692 iph->saddr = cp->vaddr.ip; 693 ip_send_check(iph); 694 ciph->daddr = cp->vaddr.ip; 695 ip_send_check(ciph); 696 } else { 697 iph->daddr = cp->daddr.ip; 698 ip_send_check(iph); 699 ciph->saddr = cp->daddr.ip; 700 ip_send_check(ciph); 701 } 702 703 /* the TCP/UDP/SCTP port */ 704 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || 705 IPPROTO_SCTP == ciph->protocol) { 706 __be16 *ports = (void *)ciph + ciph->ihl*4; 707 708 if (inout) 709 ports[1] = cp->vport; 710 else 711 ports[0] = cp->dport; 712 } 713 714 /* And finally the ICMP checksum */ 715 icmph->checksum = 0; 716 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); 717 skb->ip_summed = CHECKSUM_UNNECESSARY; 718 719 if (inout) 720 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 721 "Forwarding altered outgoing ICMP"); 722 else 723 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 724 "Forwarding altered incoming ICMP"); 725} 726 727#ifdef CONFIG_IP_VS_IPV6 728void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, 729 struct ip_vs_conn *cp, int inout) 730{ 731 struct ipv6hdr *iph = ipv6_hdr(skb); 732 unsigned int icmp_offset = 0; 733 unsigned int offs = 0; /* header offset*/ 734 int protocol; 735 struct icmp6hdr *icmph; 736 struct ipv6hdr *ciph; 737 unsigned short fragoffs; 738 739 ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); 
740 icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); 741 offs = icmp_offset + sizeof(struct icmp6hdr); 742 ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); 743 744 protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); 745 746 if (inout) { 747 iph->saddr = cp->vaddr.in6; 748 ciph->daddr = cp->vaddr.in6; 749 } else { 750 iph->daddr = cp->daddr.in6; 751 ciph->saddr = cp->daddr.in6; 752 } 753 754 /* the TCP/UDP/SCTP port */ 755 if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 756 IPPROTO_SCTP == protocol)) { 757 __be16 *ports = (void *)(skb_network_header(skb) + offs); 758 759 IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, 760 ntohs(inout ? ports[1] : ports[0]), 761 ntohs(inout ? cp->vport : cp->dport)); 762 if (inout) 763 ports[1] = cp->vport; 764 else 765 ports[0] = cp->dport; 766 } 767 768 /* And finally the ICMP checksum */ 769 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 770 skb->len - icmp_offset, 771 IPPROTO_ICMPV6, 0); 772 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; 773 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); 774 skb->ip_summed = CHECKSUM_PARTIAL; 775 776 if (inout) 777 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 778 (void *)ciph - (void *)iph, 779 "Forwarding altered outgoing ICMPv6"); 780 else 781 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 782 (void *)ciph - (void *)iph, 783 "Forwarding altered incoming ICMPv6"); 784} 785#endif 786 787/* Handle relevant response ICMP messages - forward to the right 788 * destination host. 
789 */ 790static int handle_response_icmp(int af, struct sk_buff *skb, 791 union nf_inet_addr *snet, 792 __u8 protocol, struct ip_vs_conn *cp, 793 struct ip_vs_protocol *pp, 794 unsigned int offset, unsigned int ihl) 795{ 796 unsigned int verdict = NF_DROP; 797 798 if (IP_VS_FWD_METHOD(cp) != 0) { 799 pr_err("shouldn't reach here, because the box is on the " 800 "half connection in the tun/dr module.\n"); 801 } 802 803 /* Ensure the checksum is correct */ 804 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 805 /* Failed checksum! */ 806 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 807 IP_VS_DBG_ADDR(af, snet)); 808 goto out; 809 } 810 811 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 812 IPPROTO_SCTP == protocol) 813 offset += 2 * sizeof(__u16); 814 if (!skb_make_writable(skb, offset)) 815 goto out; 816 817#ifdef CONFIG_IP_VS_IPV6 818 if (af == AF_INET6) 819 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 820 else 821#endif 822 ip_vs_nat_icmp(skb, pp, cp, 1); 823 824 if (ip_vs_route_me_harder(af, skb)) 825 goto out; 826 827 /* do the statistics and put it back */ 828 ip_vs_out_stats(cp, skb); 829 830 skb->ipvs_property = 1; 831 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 832 ip_vs_notrack(skb); 833 else 834 ip_vs_update_conntrack(skb, cp, 0); 835 verdict = NF_ACCEPT; 836 837out: 838 __ip_vs_conn_put(cp); 839 840 return verdict; 841} 842 843/* 844 * Handle ICMP messages in the inside-to-outside direction (outgoing). 845 * Find any that might be relevant, check against existing connections. 846 * Currently handles error types - unreachable, quench, ttl exceeded. 
847 */ 848static int ip_vs_out_icmp(struct sk_buff *skb, int *related, 849 unsigned int hooknum) 850{ 851 struct iphdr *iph; 852 struct icmphdr _icmph, *ic; 853 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 854 struct ip_vs_iphdr ciph; 855 struct ip_vs_conn *cp; 856 struct ip_vs_protocol *pp; 857 unsigned int offset, ihl; 858 union nf_inet_addr snet; 859 860 *related = 1; 861 862 /* reassemble IP fragments */ 863 if (ip_is_fragment(ip_hdr(skb))) { 864 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) 865 return NF_STOLEN; 866 } 867 868 iph = ip_hdr(skb); 869 offset = ihl = iph->ihl * 4; 870 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 871 if (ic == NULL) 872 return NF_DROP; 873 874 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 875 ic->type, ntohs(icmp_id(ic)), 876 &iph->saddr, &iph->daddr); 877 878 /* 879 * Work through seeing if this is for us. 880 * These checks are supposed to be in an order that means easy 881 * things are checked first to speed up processing.... however 882 * this means that some packets will manage to get a long way 883 * down this stack and then be rejected, but that's life. 884 */ 885 if ((ic->type != ICMP_DEST_UNREACH) && 886 (ic->type != ICMP_SOURCE_QUENCH) && 887 (ic->type != ICMP_TIME_EXCEEDED)) { 888 *related = 0; 889 return NF_ACCEPT; 890 } 891 892 /* Now find the contained IP header */ 893 offset += sizeof(_icmph); 894 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 895 if (cih == NULL) 896 return NF_ACCEPT; /* The packet looks wrong, ignore */ 897 898 pp = ip_vs_proto_get(cih->protocol); 899 if (!pp) 900 return NF_ACCEPT; 901 902 /* Is the embedded protocol header present? 
*/ 903 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 904 pp->dont_defrag)) 905 return NF_ACCEPT; 906 907 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 908 "Checking outgoing ICMP for"); 909 910 ip_vs_fill_ip4hdr(cih, &ciph); 911 ciph.len += offset; 912 /* The embedded headers contain source and dest in reverse order */ 913 cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); 914 if (!cp) 915 return NF_ACCEPT; 916 917 snet.ip = iph->saddr; 918 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 919 pp, ciph.len, ihl); 920} 921 922#ifdef CONFIG_IP_VS_IPV6 923static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, 924 unsigned int hooknum, struct ip_vs_iphdr *ipvsh) 925{ 926 struct icmp6hdr _icmph, *ic; 927 struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ 928 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 929 struct ip_vs_conn *cp; 930 struct ip_vs_protocol *pp; 931 union nf_inet_addr snet; 932 unsigned int writable; 933 934 *related = 1; 935 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); 936 if (ic == NULL) 937 return NF_DROP; 938 939 /* 940 * Work through seeing if this is for us. 941 * These checks are supposed to be in an order that means easy 942 * things are checked first to speed up processing.... however 943 * this means that some packets will manage to get a long way 944 * down this stack and then be rejected, but that's life. 945 */ 946 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 947 *related = 0; 948 return NF_ACCEPT; 949 } 950 /* Fragment header that is before ICMP header tells us that: 951 * it's not an error message since they can't be fragmented. 
952 */ 953 if (ipvsh->flags & IP6_FH_F_FRAG) 954 return NF_DROP; 955 956 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 957 ic->icmp6_type, ntohs(icmpv6_id(ic)), 958 &ipvsh->saddr, &ipvsh->daddr); 959 960 /* Now find the contained IP header */ 961 ciph.len = ipvsh->len + sizeof(_icmph); 962 ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); 963 if (ip6h == NULL) 964 return NF_ACCEPT; /* The packet looks wrong, ignore */ 965 ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ 966 ciph.daddr.in6 = ip6h->daddr; 967 /* skip possible IPv6 exthdrs of contained IPv6 packet */ 968 ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); 969 if (ciph.protocol < 0) 970 return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ 971 972 pp = ip_vs_proto_get(ciph.protocol); 973 if (!pp) 974 return NF_ACCEPT; 975 976 /* The embedded headers contain source and dest in reverse order */ 977 cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); 978 if (!cp) 979 return NF_ACCEPT; 980 981 snet.in6 = ciph.saddr.in6; 982 writable = ciph.len; 983 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 984 pp, writable, sizeof(struct ipv6hdr)); 985} 986#endif 987 988/* 989 * Check if sctp chunc is ABORT chunk 990 */ 991static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 992{ 993 sctp_chunkhdr_t *sch, schunk; 994 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t), 995 sizeof(schunk), &schunk); 996 if (sch == NULL) 997 return 0; 998 if (sch->type == SCTP_CID_ABORT) 999 return 1; 1000 return 0; 1001} 1002 1003static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1004{ 1005 struct tcphdr _tcph, *th; 1006 1007 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1008 if (th == NULL) 1009 return 0; 1010 return th->rst; 1011} 1012 1013static inline bool is_new_conn(const struct sk_buff *skb, 1014 struct ip_vs_iphdr *iph) 1015{ 1016 switch (iph->protocol) { 1017 case 
IPPROTO_TCP: {
		struct tcphdr _tcph, *th;

		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
		if (th == NULL)
			return false;
		return th->syn;
	}
	case IPPROTO_SCTP: {
		sctp_chunkhdr_t *sch, schunk;

		sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
					 sizeof(schunk), &schunk);
		if (sch == NULL)
			return false;
		return sch->type == SCTP_CID_INIT;
	}
	default:
		return false;
	}
}

/* Handle response packets: rewrite addresses and send away...
 *
 * @af:   address family of the packet (AF_INET or AF_INET6)
 * @skb:  the reply packet; it is freed here on the drop path
 * @pd:   per-netns protocol data; pd->pp supplies the optional snat_handler
 * @cp:   the connection the reply belongs to; the reference is released
 *        with ip_vs_conn_put() on both the accept and the drop path
 * @iph:  parsed IP header info; iph->len is the length made writable
 *
 * Returns NF_ACCEPT once the source address has been rewritten to the
 * virtual address (cp->vaddr) and the packet re-routed, or NF_STOLEN
 * after the skb was freed because one of the steps failed.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct ip_vs_protocol *pp = pd->pp;

	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");

	if (!skb_make_writable(skb, iph->len))
		goto drop;

	/* mangle the packet */
	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
		goto drop;

	/* Replace the source address with the virtual address; only the
	 * IPv4 branch recomputes a header checksum (ip_send_check).
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
	else
#endif
	{
		ip_hdr(skb)->saddr = cp->vaddr.ip;
		ip_send_check(ip_hdr(skb));
	}

	/*
	 * nf_iterate does not expect change in the skb->dst->dev.
	 * It looks like it is not fatal to enable this code for hooks
	 * where our handlers are at the end of the chain list and
	 * when all next handlers use skb->dst->dev and not outdev.
	 * It will definitely route properly the inout NAT traffic
	 * when multiple paths are used.
	 */

	/* For policy routing, packets originating from this
	 * machine itself may be routed differently to packets
	 * passing through.  We want this packet to be routed as
	 * if it came from this machine itself.  So re-compute
	 * the routing information.
	 */
	if (ip_vs_route_me_harder(af, skb))
		goto drop;

	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");

	ip_vs_out_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
	/* Mark the skb so that ip_vs_in()/ip_vs_out() skip it from now on */
	skb->ipvs_property = 1;
	if (!(cp->flags & IP_VS_CONN_F_NFCT))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 0);
	ip_vs_conn_put(cp);

	LeaveFunction(11);
	return NF_ACCEPT;

drop:
	/* The skb is freed here, so report NF_STOLEN rather than NF_DROP */
	ip_vs_conn_put(cp);
	kfree_skb(skb);
	LeaveFunction(11);
	return NF_STOLEN;
}

/*
 *	Check if outgoing packet belongs to the established ip_vs_conn.
 *
 *	@hooknum: netfilter hook the packet is traversing
 *	@skb:     the packet
 *	@af:      AF_INET or AF_INET6
 *
 *	Returns NF_ACCEPT for everything that is not handled here, the
 *	verdict of the ICMP handler when it reports the packet as related,
 *	NF_STOLEN when ip_vs_gather_frags() returns non-zero, the result of
 *	handle_response() when a connection entry is found, and NF_DROP
 *	after a port-unreachable error was sent for a packet that comes from
 *	a known real service but matches no connection.
 */
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
	struct net *net = NULL;
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;

	EnterFunction(11);

	/* Already marked as IPVS request or reply? */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	/* Bad... Do not break raw sockets */
	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {
		struct sock *sk = skb->sk;
		struct inet_sock *inet = inet_sk(skb->sk);

		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
			return NF_ACCEPT;
	}

	if (unlikely(!skb_dst(skb)))
		return NF_ACCEPT;

	/* Nothing to do while IPVS is not enabled in this netns */
	net = skb_net(skb);
	if (!net_ipvs(net)->enable)
		return NF_ACCEPT;

	ip_vs_fill_iph_skb(af, skb, &iph);
	/* ICMP/ICMPv6 goes to the dedicated handlers first; when they report
	 * the packet as not related, processing continues below.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_out_icmp_v6(skb, &related,
							hooknum, &iph);

			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_out_icmp(skb, &related, hooknum);

			if (related)
				return verdict;
		}

	/* Protocols without registered IPVS support are left alone */
	pd = ip_vs_proto_data_get(net, iph.protocol);
	if (unlikely(!pd))
		return NF_ACCEPT;
	pp = pd->pp;

	/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET)
#endif
		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
			if (ip_vs_gather_frags(skb,
					       ip_vs_defrag_user(hooknum)))
				return NF_STOLEN;

			/* Re-read the header info after reassembly */
			ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
		}

	/*
	 * Check if the packet belongs to an existing entry
	 */
	cp = pp->conn_out_get(af, skb, &iph, 0);

	if (likely(cp))
		return handle_response(af, skb, pd, cp, &iph);
	/* No connection entry: optionally answer with an ICMP error when the
	 * packet originates from one of our real services.
	 */
	if (sysctl_nat_icmp_send(net) &&
	    (pp->protocol == IPPROTO_TCP ||
	     pp->protocol == IPPROTO_UDP ||
	     pp->protocol == IPPROTO_SCTP)) {
		__be16 _ports[2], *pptr;

		pptr = frag_safe_skb_hp(skb, iph.len,
					 sizeof(_ports), _ports, &iph);
		if (pptr == NULL)
			return NF_ACCEPT;	/* Not for me */
		if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
					   pptr[0])) {
			/*
			 * Notify the real server: there is no
			 * existing entry if it is not RST
			 * packet or not TCP packet.
			 */
			if ((iph.protocol != IPPROTO_TCP &&
			     iph.protocol != IPPROTO_SCTP)
			    || ((iph.protocol == IPPROTO_TCP
				 && !is_tcp_reset(skb, iph.len))
				|| (iph.protocol == IPPROTO_SCTP
					&& !is_sctp_abort(skb,
						iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
				if (af == AF_INET6) {
					/* Fall back to the loopback device
					 * when the skb has no device set.
					 */
					if (!skb->dev)
						skb->dev = net->loopback_dev;
					icmpv6_send(skb,
						    ICMPV6_DEST_UNREACH,
						    ICMPV6_PORT_UNREACH,
						    0);
				} else
#endif
					icmp_send(skb,
						  ICMP_DEST_UNREACH,
						  ICMP_PORT_UNREACH, 0);
				return NF_DROP;
			}
		}
	}
	IP_VS_DBG_PKT(12, af, pp, skb, 0,
		      "ip_vs_out: packet continues traversal as normal");
	return NF_ACCEPT;
}

/*
 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
 *	used only for VS/NAT.
 *	Check if packet is reply for established ip_vs_conn.
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET to
 *	ip_vs_out(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
	     const struct net_device *in, const struct net_device *out,
	     int (*okfn)(struct sk_buff *))
{
	return ip_vs_out(ops->hooknum, skb, AF_INET);
}

/*
 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
 *	Check if packet is reply for established ip_vs_conn.
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET to
 *	ip_vs_out(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
		   const struct net_device *in, const struct net_device *out,
		   int (*okfn)(struct sk_buff *))
{
	return ip_vs_out(ops->hooknum, skb, AF_INET);
}

#ifdef CONFIG_IP_VS_IPV6

/*
 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
 *	used only for VS/NAT.
 *	Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
	     const struct net_device *in, const struct net_device *out,
	     int (*okfn)(struct sk_buff *))
{
	/* Same as the IPv4 entry point, but with AF_INET6 */
	return ip_vs_out(ops->hooknum, skb, AF_INET6);
}

/*
 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
 *	Check if packet is reply for established ip_vs_conn.
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET6 to
 *	ip_vs_out(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
		   const struct net_device *in, const struct net_device *out,
		   int (*okfn)(struct sk_buff *))
{
	return ip_vs_out(ops->hooknum, skb, AF_INET6);
}

#endif

/*
 *	Handle ICMP messages in the outside-to-inside direction (incoming).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
 *
 *	@skb:     the ICMP packet
 *	@related: output flag; set to 1 on entry and cleared to 0 only when
 *	          the ICMP type is not one of the handled error types
 *	@hooknum: netfilter hook the packet is traversing
 *
 *	Returns a netfilter verdict. The reference on the connection found
 *	via conn_in_get() is released with __ip_vs_conn_put() before
 *	returning.
 */
static int
ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
	struct net *net = NULL;
	struct iphdr *iph;
	struct icmphdr	_icmph, *ic;
	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
	struct ip_vs_iphdr ciph;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	unsigned int offset, offset2, ihl, verdict;
	bool ipip;

	*related = 1;

	/* reassemble IP fragments */
	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
			return NF_STOLEN;
	}

	/* offset tracks the header currently being examined; ihl keeps the
	 * outer IP header length for the checksum and pull steps below.
	 */
	iph = ip_hdr(skb);
	offset = ihl = iph->ihl * 4;
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
		  ic->type, ntohs(icmp_id(ic)),
		  &iph->saddr, &iph->daddr);

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((ic->type != ICMP_DEST_UNREACH) &&
	    (ic->type != ICMP_SOURCE_QUENCH) &&
	    (ic->type != ICMP_TIME_EXCEEDED)) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	net = skb_net(skb);

	/* Special case for errors for IPIP packets */
	ipip = false;
	if (cih->protocol == IPPROTO_IPIP) {
		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
			return NF_ACCEPT;
		/* Error for our IPIP must arrive at LOCAL_IN */
		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
			return NF_ACCEPT;
		/* Step over the tunnel header to the encapsulated IP header */
		offset += cih->ihl * 4;
		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
		if (cih == NULL)
			return NF_ACCEPT; /* The packet looks wrong, ignore */
		ipip = true;
	}

	pd = ip_vs_proto_data_get(net, cih->protocol);
	if (!pd)
		return NF_ACCEPT;
	pp = pd->pp;

	/* Is the embedded protocol header present? */
	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
		     pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
		      "Checking incoming ICMP for");

	/* offset2 remembers where the embedded IP header starts; offset
	 * moves on to ciph.len, i.e. past that header.
	 */
	offset2 = offset;
	ip_vs_fill_ip4hdr(cih, &ciph);
	ciph.len += offset;
	offset = ciph.len;
	/* The embedded headers contain source and dest in reverse order.
	 * For IPIP this is error for request, not for reply.
	 */
	cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
	if (!cp)
		return NF_ACCEPT;

	verdict = NF_DROP;

	/* Ensure the checksum is correct */
	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
			  &iph->saddr);
		goto out;
	}

	if (ipip) {
		/* Copy what is needed from the ICMP header up front.
		 * NOTE(review): presumably because ic/cih may point into skb
		 * data that the pskb_pull() calls below can move - confirm.
		 */
		__be32 info = ic->un.gateway;
		__u8 type = ic->type;
		__u8 code = ic->code;

		/* Update the MTU */
		if (ic->type == ICMP_DEST_UNREACH &&
		    ic->code == ICMP_FRAG_NEEDED) {
			struct ip_vs_dest *dest = cp->dest;
			u32 mtu = ntohs(ic->un.frag.mtu);
			__be16 frag_off = cih->frag_off;

			/* Strip outer IP and ICMP, go to IPIP header */
			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
				goto ignore_ipip;
			/* Keep offset2 relative to the new start of data */
			offset2 -= ihl + sizeof(_icmph);
			skb_reset_network_header(skb);
			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
			ipv4_update_pmtu(skb, dev_net(skb->dev),
					 mtu, 0, 0, 0, 0);
			/* Client uses PMTUD? */
			if (!(frag_off & htons(IP_DF)))
				goto ignore_ipip;
			/* Prefer the resulting PMTU */
			if (dest) {
				struct ip_vs_dest_dst *dest_dst;

				rcu_read_lock();
				dest_dst = rcu_dereference(dest->dest_dst);
				if (dest_dst)
					mtu = dst_mtu(dest_dst->dst_cache);
				rcu_read_unlock();
			}
			/* Report the MTU minus one IP header (the tunnel
			 * header), unless that would go below 68.
			 */
			if (mtu > 68 + sizeof(struct iphdr))
				mtu -= sizeof(struct iphdr);
			info = htonl(mtu);
		}
		/* Strip outer IP, ICMP and IPIP, go to IP header of
		 * original request.
		 */
		if (pskb_pull(skb, offset2) == NULL)
			goto ignore_ipip;
		skb_reset_network_header(skb);
		IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
			&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
			type, code, ntohl(info));
		icmp_send(skb, type, code, info);
		/* ICMP can be shorter but anyways, account it */
		ip_vs_out_stats(cp, skb);

ignore_ipip:
		/* The skb is released here, hence the NF_STOLEN verdict */
		consume_skb(skb);
		verdict = NF_STOLEN;
		goto out;
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(cp, skb);
	/* For TCP/UDP/SCTP extend the offset over the two 16-bit ports */
	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
	    IPPROTO_SCTP == cih->protocol)
		offset += 2 * sizeof(__u16);
	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);

out:
	__ip_vs_conn_put(cp);

	return verdict;
}

#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_in_icmp(): handle incoming ICMPv6 error
 *	messages that refer to an existing connection.
 *
 *	@skb:     the ICMPv6 packet
 *	@related: output flag; set to 1 on entry and cleared to 0 only for
 *	          informational (non-error) ICMPv6 messages
 *	@hooknum: netfilter hook the packet is traversing
 *	@iph:     parsed header info of the outer IPv6 packet
 *
 *	Returns a netfilter verdict; the connection reference is released
 *	with __ip_vs_conn_put() before returning.
 */
static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
			    unsigned int hooknum, struct ip_vs_iphdr *iph)
{
	struct net *net = NULL;
	struct ipv6hdr _ip6h, *ip6h;
	struct icmp6hdr	_icmph, *ic;
	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	unsigned int offs_ciph, writable, verdict;

	*related = 1;

	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
	if (ic == NULL)
		return NF_DROP;

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		*related = 0;
		return NF_ACCEPT;
	}
	/* Fragment header that is before ICMP header tells us that:
	 * it's not an error message since they can't be fragmented.
	 */
	if (iph->flags & IP6_FH_F_FRAG)
		return NF_DROP;

	IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &iph->saddr, &iph->daddr);

	/* Now find the contained IP header */
	ciph.len = iph->len + sizeof(_icmph);
	offs_ciph = ciph.len; /* Save ip header offset */
	ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
	if (ip6h == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */
	ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
	ciph.daddr.in6 = ip6h->daddr;
	/* skip possible IPv6 exthdrs of contained IPv6 packet */
	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
	if (ciph.protocol < 0)
		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */

	net = skb_net(skb);
	pd = ip_vs_proto_data_get(net, ciph.protocol);
	if (!pd)
		return NF_ACCEPT;
	pp = pd->pp;

	/* Cannot handle fragmented embedded protocol */
	if (ciph.fragoffs)
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
		      "Checking incoming ICMPv6 for");

	/* The embedded headers contain source and dest in reverse order
	 * if not from localhost
	 */
	cp = pp->conn_in_get(AF_INET6, skb, &ciph,
			     (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);

	if (!cp)
		return NF_ACCEPT;
	/* VS/TUN, VS/DR and LOCALNODE just let it go */
	if ((hooknum == NF_INET_LOCAL_OUT) &&
	    (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
		__ip_vs_conn_put(cp);
		return NF_ACCEPT;
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(cp, skb);

	/* Need to mangle contained IPv6 header in ICMPv6 packet */
	writable = ciph.len;
	if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
	    IPPROTO_SCTP == ciph.protocol)
		writable += 2 * sizeof(__u16); /* Also mangle ports */

	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);

	__ip_vs_conn_put(cp);

	return verdict;
}
#endif


/*
 *	Check if it's for virtual services, look it up,
 *	and send it on its way...
 *
 *	@hooknum: netfilter hook the packet is traversing
 *	@skb:     the packet
 *	@af:      AF_INET or AF_INET6
 *
 *	Returns NF_ACCEPT for packets that are not handled, the ICMP
 *	handler's verdict for related ICMP, the verdict stored by
 *	conn_schedule() when it returns 0, NF_DROP when the connection's
 *	destination is not available, and otherwise the result of the
 *	connection's packet_xmit() method.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
	struct net *net;
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	int ret, pkts;
	struct netns_ipvs *ipvs;

	/* Already marked as IPVS request or reply? */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	/*
	 *	Big tappo:
	 *	- remote client: only PACKET_HOST
	 *	- route: used for struct net when skb->dev is unset
	 */
	if (unlikely((skb->pkt_type != PACKET_HOST &&
		      hooknum != NF_INET_LOCAL_OUT) ||
		     !skb_dst(skb))) {
		/* The header is parsed here only for the debug message */
		ip_vs_fill_iph_skb(af, skb, &iph);
		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
			      " ignored in hook %u\n",
			      skb->pkt_type, iph.protocol,
			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
		return NF_ACCEPT;
	}
	/* ipvs enabled in this netns ? */
	net = skb_net(skb);
	ipvs = net_ipvs(net);
	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
		return NF_ACCEPT;

	ip_vs_fill_iph_skb(af, skb, &iph);

	/* Bad... Do not break raw sockets */
	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {
		struct sock *sk = skb->sk;
		struct inet_sock *inet = inet_sk(skb->sk);

		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
			return NF_ACCEPT;
	}

	/* ICMP/ICMPv6 goes to the dedicated handlers first; when they report
	 * the packet as not related, processing continues below.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
						       &iph);

			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_in_icmp(skb, &related, hooknum);

			if (related)
				return verdict;
		}

	/* Protocol supported? */
	pd = ip_vs_proto_data_get(net, iph.protocol);
	if (unlikely(!pd))
		return NF_ACCEPT;
	pp = pd->pp;
	/*
	 * Check if the packet belongs to an existing connection entry
	 */
	cp = pp->conn_in_get(af, skb, &iph, 0);

	/* With expire_nodest_conn set, a packet that is_new_conn() accepts
	 * as a new connection and that hits an entry whose destination has
	 * weight 0 expires that entry, so it gets rescheduled below.
	 */
	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
	    is_new_conn(skb, &iph)) {
		ip_vs_conn_expire_now(cp);
		__ip_vs_conn_put(cp);
		cp = NULL;
	}

	if (unlikely(!cp) && !iph.fragoffs) {
		/* No (second) fragments need to enter here, as nf_defrag_ipv6
		 * replayed fragment zero will already have created the cp
		 */
		int v;

		/* Schedule and create new connection entry into &cp */
		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
			return v;
	}

	if (unlikely(!cp)) {
		/* sorry, all this trouble for a no-hit :) */
		IP_VS_DBG_PKT(12, af, pp, skb, 0,
			      "ip_vs_in: packet continues traversal as normal");
		if (iph.fragoffs) {
			/* Fragment that couldn't be mapped to a conn entry
			 * is missing module nf_defrag_ipv6
			 */
			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
			IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
		}
		return NF_ACCEPT;
	}

	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
	/* Check the server status */
	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */

		if (sysctl_expire_nodest_conn(ipvs)) {
			/* try to expire the connection immediately */
			ip_vs_conn_expire_now(cp);
		}
		/* don't restart its timer, and silently
		   drop the packet. */
		__ip_vs_conn_put(cp);
		return NF_DROP;
	}

	ip_vs_in_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
	if (cp->packet_xmit)
		ret = cp->packet_xmit(skb, cp, pp, &iph);
		/* do not touch skb anymore */
	else {
		IP_VS_DBG_RL("warning: packet_xmit is null");
		ret = NF_ACCEPT;
	}

	/* Increase its packet counter and check if it is needed
	 * to be synchronized
	 *
	 * Sync connection if it is about to close to
	 * encourage the standby servers to update the connections timeout
	 *
	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
	 */

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		pkts = sysctl_sync_threshold(ipvs);
	else
		pkts = atomic_add_return(1, &cp->in_pkts);

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(net, cp, pkts);

	ip_vs_conn_put(cp);
	return ret;
}

/*
 *	AF_INET handler in NF_INET_LOCAL_IN chain
 *	Schedule and forward packets from remote clients
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET to
 *	ip_vs_in(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
		      const struct net_device *in,
		      const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	return ip_vs_in(ops->hooknum, skb, AF_INET);
}

/*
 *	AF_INET handler in NF_INET_LOCAL_OUT chain
 *	Schedule and forward packets from local clients
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET to
 *	ip_vs_in(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
		     const struct net_device *in, const struct net_device *out,
		     int (*okfn)(struct sk_buff *))
{
	return ip_vs_in(ops->hooknum, skb, AF_INET);
}

#ifdef CONFIG_IP_VS_IPV6

/*
 *	AF_INET6 handler in NF_INET_LOCAL_IN chain
 *	Schedule and forward packets from remote clients
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET6 to
 *	ip_vs_in(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
		      const struct net_device *in,
		      const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	return ip_vs_in(ops->hooknum, skb, AF_INET6);
}

/*
 *	AF_INET6 handler in NF_INET_LOCAL_OUT chain
 *	Schedule and forward packets from local clients
 *
 *	Thin netfilter entry point: forwards ops->hooknum and AF_INET6 to
 *	ip_vs_in(); the in, out and okfn arguments are unused.
 */
static unsigned int
ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
		     const struct net_device *in, const struct net_device *out,
		     int (*okfn)(struct sk_buff *))
{
	return ip_vs_in(ops->hooknum, skb, AF_INET6);
}

#endif


/*
 *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
 *      related packets destined for 0.0.0.0/0.
 *      When fwmark-based virtual service is used, such as transparent
 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
 *      but ICMP destined for 0.0.0.0/0 cannot be easily marked and
 *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
 *      and send them to ip_vs_in_icmp.
 *
 *	Non-ICMP packets, and all packets while IPVS is disabled or in
 *	backup-only mode for this netns, are accepted untouched.
 */
static unsigned int
ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
		   const struct net_device *in, const struct net_device *out,
		   int (*okfn)(struct sk_buff *))
{
	int r;	/* receives the "related" flag; not examined here */
	struct net *net;
	struct netns_ipvs *ipvs;

	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
		return NF_ACCEPT;

	/* ipvs enabled in this netns ? */
	net = skb_net(skb);
	ipvs = net_ipvs(net);
	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
		return NF_ACCEPT;

	return ip_vs_in_icmp(skb, &r, ops->hooknum);
}

#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_forward_icmp(): pass forwarded ICMPv6
 *	packets to ip_vs_in_icmp_v6(). Everything else, and all packets
 *	while IPVS is disabled or in backup-only mode for this netns, is
 *	accepted untouched.
 */
static unsigned int
ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
		      const struct net_device *in, const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	int r;	/* receives the "related" flag; not examined here */
	struct net *net;
	struct netns_ipvs *ipvs;
	struct ip_vs_iphdr iphdr;

	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
	if (iphdr.protocol != IPPROTO_ICMPV6)
		return NF_ACCEPT;

	/* ipvs enabled in this netns ? */
	net = skb_net(skb);
	ipvs = net_ipvs(net);
	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
		return NF_ACCEPT;

	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
}
#endif


/*
 *	Netfilter hook table. The whole array is registered with
 *	nf_register_hooks() in ip_vs_init() and removed with
 *	nf_unregister_hooks() in ip_vs_cleanup(). The IPv6 entries mirror
 *	the IPv4 ones and exist only with CONFIG_IP_VS_IPV6.
 */
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC - 2,
	},
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_remote_request4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC - 1,
	},
	/* Before ip_vs_in, change source only for VS/NAT */
	{
		.hook		= ip_vs_local_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST + 1,
	},
	/* After mangle, schedule and forward local requests */
	{
		.hook		= ip_vs_local_request4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST + 2,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
#ifdef CONFIG_IP_VS_IPV6
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 2,
	},
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_remote_request6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 1,
	},
	/* Before ip_vs_in, change source only for VS/NAT */
	{
		.hook		= ip_vs_local_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 1,
	},
	/* After mangle, schedule and forward local requests */
	{
		.hook		= ip_vs_local_request6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 2,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp_v6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
#endif
};
/*
 *	Initialize IP Virtual Server netns mem.
*/
static int __net_init __ip_vs_init(struct net *net)
{
	/* Per-netns constructor (the .init of ipvs_core_ops). Brings up the
	 * estimator, control, protocol, app, conn and sync parts in that
	 * order; on failure the parts already set up are torn down in
	 * reverse order. Returns 0 on success and -ENOMEM on any failure,
	 * whatever the failing step returned.
	 */
	struct netns_ipvs *ipvs;

	ipvs = net_generic(net, ip_vs_net_id);
	if (ipvs == NULL)
		return -ENOMEM;

	/* Hold the beast until a service is registered */
	ipvs->enable = 0;
	ipvs->net = net;
	/* Counters used for creating unique names */
	ipvs->gen = atomic_read(&ipvs_netns_cnt);
	atomic_inc(&ipvs_netns_cnt);
	net->ipvs = ipvs;

	if (ip_vs_estimator_net_init(net) < 0)
		goto estimator_fail;

	if (ip_vs_control_net_init(net) < 0)
		goto control_fail;

	if (ip_vs_protocol_net_init(net) < 0)
		goto protocol_fail;

	if (ip_vs_app_net_init(net) < 0)
		goto app_fail;

	if (ip_vs_conn_net_init(net) < 0)
		goto conn_fail;

	if (ip_vs_sync_net_init(net) < 0)
		goto sync_fail;

	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
			 sizeof(struct netns_ipvs), ipvs->gen);
	return 0;
/*
 * Error handling
 */

	/* Each label undoes the steps that succeeded before the failing one
	 * and then falls through to the labels below it.
	 */
sync_fail:
	ip_vs_conn_net_cleanup(net);
conn_fail:
	ip_vs_app_net_cleanup(net);
app_fail:
	ip_vs_protocol_net_cleanup(net);
protocol_fail:
	ip_vs_control_net_cleanup(net);
control_fail:
	ip_vs_estimator_net_cleanup(net);
estimator_fail:
	net->ipvs = NULL;
	return -ENOMEM;
}

/*
 *	Per-netns destructor (the .exit of ipvs_core_ops): flush the
 *	services, then tear down conn, app, protocol, control and estimator
 *	state, and finally clear net->ipvs.
 */
static void __net_exit __ip_vs_cleanup(struct net *net)
{
	ip_vs_service_net_cleanup(net);	/* ip_vs_flush() with locks */
	ip_vs_conn_net_cleanup(net);
	ip_vs_app_net_cleanup(net);
	ip_vs_protocol_net_cleanup(net);
	ip_vs_control_net_cleanup(net);
	ip_vs_estimator_net_cleanup(net);
	/* net_ipvs(net) is read here, before net->ipvs is cleared */
	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
	net->ipvs = NULL;
}

/*
 *	The .exit of ipvs_core_dev_ops: clear the enable flag so that the
 *	hooks stop handling packets for this netns, then clean up the sync
 *	state.
 */
static void __net_exit __ip_vs_dev_cleanup(struct net *net)
{
	EnterFunction(2);
	net_ipvs(net)->enable = 0;	/* Disable packet reception */
	/* Write barrier between clearing the flag and the sync cleanup */
	smp_wmb();
	ip_vs_sync_net_cleanup(net);
	LeaveFunction(2);
}

/* Per-netns operations; .id and .size describe the struct netns_ipvs
 * storage that __ip_vs_init() fetches with net_generic().
 */
static struct pernet_operations ipvs_core_ops = {
	.init = __ip_vs_init,
	.exit = __ip_vs_cleanup,
	.id   = &ip_vs_net_id,
	.size = sizeof(struct netns_ipvs),
};

/* Exit-only operations, registered as a pernet device in ip_vs_init() */
static struct pernet_operations ipvs_core_dev_ops = {
	.exit = __ip_vs_dev_cleanup,
};

/*
 *	Initialize IP Virtual Server
 *
 *	Module init: set up control, protocols and the connection table,
 *	register the pernet subsystem and device operations, the netfilter
 *	hooks and finally the netlink/ioctl interface. On failure the steps
 *	already completed are undone in reverse order and the error code of
 *	the failing step is returned.
 */
static int __init ip_vs_init(void)
{
	int ret;

	ret = ip_vs_control_init();
	if (ret < 0) {
		pr_err("can't setup control.\n");
		goto exit;
	}

	/* The return value, if any, is not checked here */
	ip_vs_protocol_init();

	ret = ip_vs_conn_init();
	if (ret < 0) {
		pr_err("can't setup connection table.\n");
		goto cleanup_protocol;
	}

	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
	if (ret < 0)
		goto cleanup_conn;

	ret = register_pernet_device(&ipvs_core_dev_ops);
	if (ret < 0)
		goto cleanup_sub;

	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
	if (ret < 0) {
		pr_err("can't register hooks.\n");
		goto cleanup_dev;
	}

	ret = ip_vs_register_nl_ioctl();
	if (ret < 0) {
		pr_err("can't register netlink/ioctl.\n");
		goto cleanup_hooks;
	}

	pr_info("ipvs loaded.\n");

	return ret;

cleanup_hooks:
	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
cleanup_dev:
	unregister_pernet_device(&ipvs_core_dev_ops);
cleanup_sub:
	unregister_pernet_subsys(&ipvs_core_ops);
cleanup_conn:
	ip_vs_conn_cleanup();
cleanup_protocol:
	/* Also undoes ip_vs_control_init(), which ran before the protocols */
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
exit:
	return ret;
}

/*
 *	Module exit: undo everything ip_vs_init() set up, in reverse order.
 */
static void __exit ip_vs_cleanup(void)
{
	ip_vs_unregister_nl_ioctl();
	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
	unregister_pernet_device(&ipvs_core_dev_ops);
	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
	ip_vs_conn_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
	pr_info("ipvs unloaded.\n");
}

module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");