1/*
2 * IPVS         An implementation of the IP virtual server support for the
3 *              LINUX operating system.  IPVS is now implemented as a module
4 *              over the Netfilter framework. IPVS can be used to build a
5 *              high-performance and highly available server based on a
6 *              cluster of servers.
7 *
8 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9 *              Peter Kese <peter.kese@ijs.si>
10 *              Julian Anastasov <ja@ssi.bg>
11 *
12 *              This program is free software; you can redistribute it and/or
13 *              modify it under the terms of the GNU General Public License
14 *              as published by the Free Software Foundation; either version
15 *              2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 *	Paul `Rusty' Russell		properly handle non-linear skbs
23 *	Harald Welte			don't use nfcache
24 *
25 */
26
27#define KMSG_COMPONENT "IPVS"
28#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30#include <linux/module.h>
31#include <linux/kernel.h>
32#include <linux/ip.h>
33#include <linux/tcp.h>
34#include <linux/sctp.h>
35#include <linux/icmp.h>
36#include <linux/slab.h>
37
38#include <net/ip.h>
39#include <net/tcp.h>
40#include <net/udp.h>
41#include <net/icmp.h>                   /* for icmp_send */
42#include <net/route.h>
43#include <net/ip6_checksum.h>
44#include <net/netns/generic.h>		/* net_generic() */
45
46#include <linux/netfilter.h>
47#include <linux/netfilter_ipv4.h>
48
49#ifdef CONFIG_IP_VS_IPV6
50#include <net/ipv6.h>
51#include <linux/netfilter_ipv6.h>
52#include <net/ip6_route.h>
53#endif
54
55#include <net/ip_vs.h>
56
57
58EXPORT_SYMBOL(register_ip_vs_scheduler);
59EXPORT_SYMBOL(unregister_ip_vs_scheduler);
60EXPORT_SYMBOL(ip_vs_proto_name);
61EXPORT_SYMBOL(ip_vs_conn_new);
62EXPORT_SYMBOL(ip_vs_conn_in_get);
63EXPORT_SYMBOL(ip_vs_conn_out_get);
64#ifdef CONFIG_IP_VS_PROTO_TCP
65EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
66#endif
67EXPORT_SYMBOL(ip_vs_conn_put);
68#ifdef CONFIG_IP_VS_DEBUG
69EXPORT_SYMBOL(ip_vs_get_debug_level);
70#endif
71
72static int ip_vs_net_id __read_mostly;
73/* netns cnt used for uniqueness */
74static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
75
76/* ID used in ICMP lookups */
77#define icmp_id(icmph)          (((icmph)->un).echo.id)
78#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
79
/*
 * ip_vs_proto_name - map an IP protocol number to a printable name
 * @proto: IP protocol number (IPPROTO_*)
 *
 * Returns a string literal for the protocols known here.  Any other value
 * is formatted as "IP_<number>" into a function-local static buffer, so
 * that result is overwritten by the next call that takes the default
 * branch; the function is not reentrant for unknown protocols and callers
 * must not keep the pointer across calls.
 */
const char *ip_vs_proto_name(unsigned int proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* Bounded write: never trust the format to fit by itself */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
104
105void ip_vs_init_hash_table(struct list_head *table, int rows)
106{
107	while (--rows >= 0)
108		INIT_LIST_HEAD(&table[rows]);
109}
110
111static inline void
112ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
113{
114	struct ip_vs_dest *dest = cp->dest;
115	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
116
117	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
118		struct ip_vs_cpu_stats *s;
119		struct ip_vs_service *svc;
120
121		s = this_cpu_ptr(dest->stats.cpustats);
122		s->ustats.inpkts++;
123		u64_stats_update_begin(&s->syncp);
124		s->ustats.inbytes += skb->len;
125		u64_stats_update_end(&s->syncp);
126
127		rcu_read_lock();
128		svc = rcu_dereference(dest->svc);
129		s = this_cpu_ptr(svc->stats.cpustats);
130		s->ustats.inpkts++;
131		u64_stats_update_begin(&s->syncp);
132		s->ustats.inbytes += skb->len;
133		u64_stats_update_end(&s->syncp);
134		rcu_read_unlock();
135
136		s = this_cpu_ptr(ipvs->tot_stats.cpustats);
137		s->ustats.inpkts++;
138		u64_stats_update_begin(&s->syncp);
139		s->ustats.inbytes += skb->len;
140		u64_stats_update_end(&s->syncp);
141	}
142}
143
144
145static inline void
146ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
147{
148	struct ip_vs_dest *dest = cp->dest;
149	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
150
151	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
152		struct ip_vs_cpu_stats *s;
153		struct ip_vs_service *svc;
154
155		s = this_cpu_ptr(dest->stats.cpustats);
156		s->ustats.outpkts++;
157		u64_stats_update_begin(&s->syncp);
158		s->ustats.outbytes += skb->len;
159		u64_stats_update_end(&s->syncp);
160
161		rcu_read_lock();
162		svc = rcu_dereference(dest->svc);
163		s = this_cpu_ptr(svc->stats.cpustats);
164		s->ustats.outpkts++;
165		u64_stats_update_begin(&s->syncp);
166		s->ustats.outbytes += skb->len;
167		u64_stats_update_end(&s->syncp);
168		rcu_read_unlock();
169
170		s = this_cpu_ptr(ipvs->tot_stats.cpustats);
171		s->ustats.outpkts++;
172		u64_stats_update_begin(&s->syncp);
173		s->ustats.outbytes += skb->len;
174		u64_stats_update_end(&s->syncp);
175	}
176}
177
178
179static inline void
180ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
181{
182	struct netns_ipvs *ipvs = net_ipvs(svc->net);
183	struct ip_vs_cpu_stats *s;
184
185	s = this_cpu_ptr(cp->dest->stats.cpustats);
186	s->ustats.conns++;
187
188	s = this_cpu_ptr(svc->stats.cpustats);
189	s->ustats.conns++;
190
191	s = this_cpu_ptr(ipvs->tot_stats.cpustats);
192	s->ustats.conns++;
193}
194
195
196static inline void
197ip_vs_set_state(struct ip_vs_conn *cp, int direction,
198		const struct sk_buff *skb,
199		struct ip_vs_proto_data *pd)
200{
201	if (likely(pd->pp->state_transition))
202		pd->pp->state_transition(cp, direction, skb, pd);
203}
204
205static inline int
206ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
207			      struct sk_buff *skb, int protocol,
208			      const union nf_inet_addr *caddr, __be16 cport,
209			      const union nf_inet_addr *vaddr, __be16 vport,
210			      struct ip_vs_conn_param *p)
211{
212	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
213			      vport, p);
214	p->pe = rcu_dereference(svc->pe);
215	if (p->pe && p->pe->fill_param)
216		return p->pe->fill_param(p, skb);
217
218	return 0;
219}
220
221/*
222 *  IPVS persistent scheduling function
223 *  It creates a connection entry according to its template if exists,
224 *  or selects a server and creates a connection entry plus a template.
225 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
226 *  Protocols supported: TCP, UDP
227 */
228static struct ip_vs_conn *
229ip_vs_sched_persist(struct ip_vs_service *svc,
230		    struct sk_buff *skb, __be16 src_port, __be16 dst_port,
231		    int *ignored, struct ip_vs_iphdr *iph)
232{
233	struct ip_vs_conn *cp = NULL;
234	struct ip_vs_dest *dest;
235	struct ip_vs_conn *ct;
236	__be16 dport = 0;		/* destination port to forward */
237	unsigned int flags;
238	struct ip_vs_conn_param param;
239	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
240	union nf_inet_addr snet;	/* source network of the client,
241					   after masking */
242
243	/* Mask saddr with the netmask to adjust template granularity */
244#ifdef CONFIG_IP_VS_IPV6
245	if (svc->af == AF_INET6)
246		ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
247				 (__force __u32) svc->netmask);
248	else
249#endif
250		snet.ip = iph->saddr.ip & svc->netmask;
251
252	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
253		      "mnet %s\n",
254		      IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
255		      IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
256		      IP_VS_DBG_ADDR(svc->af, &snet));
257
258	/*
259	 * As far as we know, FTP is a very complicated network protocol, and
260	 * it uses control connection and data connections. For active FTP,
261	 * FTP server initialize data connection to the client, its source port
262	 * is often 20. For passive FTP, FTP server tells the clients the port
263	 * that it passively listens to,  and the client issues the data
264	 * connection. In the tunneling or direct routing mode, the load
265	 * balancer is on the client-to-server half of connection, the port
266	 * number is unknown to the load balancer. So, a conn template like
267	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
268	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
269	 * is created for other persistent services.
270	 */
271	{
272		int protocol = iph->protocol;
273		const union nf_inet_addr *vaddr = &iph->daddr;
274		__be16 vport = 0;
275
276		if (dst_port == svc->port) {
277			/* non-FTP template:
278			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
279			 * FTP template:
280			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
281			 */
282			if (svc->port != FTPPORT)
283				vport = dst_port;
284		} else {
285			/* Note: persistent fwmark-based services and
286			 * persistent port zero service are handled here.
287			 * fwmark template:
288			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
289			 * port zero template:
290			 * <protocol,caddr,0,vaddr,0,daddr,0>
291			 */
292			if (svc->fwmark) {
293				protocol = IPPROTO_IP;
294				vaddr = &fwmark;
295			}
296		}
297		/* return *ignored = -1 so NF_DROP can be used */
298		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
299						  vaddr, vport, &param) < 0) {
300			*ignored = -1;
301			return NULL;
302		}
303	}
304
305	/* Check if a template already exists */
306	ct = ip_vs_ct_in_get(&param);
307	if (!ct || !ip_vs_check_template(ct)) {
308		struct ip_vs_scheduler *sched;
309
310		/*
311		 * No template found or the dest of the connection
312		 * template is not available.
313		 * return *ignored=0 i.e. ICMP and NF_DROP
314		 */
315		sched = rcu_dereference(svc->scheduler);
316		dest = sched->schedule(svc, skb, iph);
317		if (!dest) {
318			IP_VS_DBG(1, "p-schedule: no dest found.\n");
319			kfree(param.pe_data);
320			*ignored = 0;
321			return NULL;
322		}
323
324		if (dst_port == svc->port && svc->port != FTPPORT)
325			dport = dest->port;
326
327		/* Create a template
328		 * This adds param.pe_data to the template,
329		 * and thus param.pe_data will be destroyed
330		 * when the template expires */
331		ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
332				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
333		if (ct == NULL) {
334			kfree(param.pe_data);
335			*ignored = -1;
336			return NULL;
337		}
338
339		ct->timeout = svc->timeout;
340	} else {
341		/* set destination with the found template */
342		dest = ct->dest;
343		kfree(param.pe_data);
344	}
345
346	dport = dst_port;
347	if (dport == svc->port && dest->port)
348		dport = dest->port;
349
350	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
351		 && iph->protocol == IPPROTO_UDP) ?
352		IP_VS_CONN_F_ONE_PACKET : 0;
353
354	/*
355	 *    Create a new connection according to the template
356	 */
357	ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
358			      src_port, &iph->daddr, dst_port, &param);
359
360	cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
361			    skb->mark);
362	if (cp == NULL) {
363		ip_vs_conn_put(ct);
364		*ignored = -1;
365		return NULL;
366	}
367
368	/*
369	 *    Add its control
370	 */
371	ip_vs_control_add(cp, ct);
372	ip_vs_conn_put(ct);
373
374	ip_vs_conn_stats(cp, svc);
375	return cp;
376}
377
378
379/*
380 *  IPVS main scheduling function
381 *  It selects a server according to the virtual service, and
382 *  creates a connection entry.
383 *  Protocols supported: TCP, UDP
384 *
385 *  Usage of *ignored
386 *
387 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
388 *       svc/scheduler decides that this packet should be accepted with
389 *       NF_ACCEPT because it must not be scheduled.
390 *
391 * 0 :   scheduler can not find destination, so try bypass or
392 *       return ICMP and then NF_DROP (ip_vs_leave).
393 *
394 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
395 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
396 *       failure such as missing Call-ID, ENOMEM on skb_linearize
397 *       or pe_data. In this case we should return NF_DROP without
398 *       any attempts to send ICMP with ip_vs_leave.
399 */
400struct ip_vs_conn *
401ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
402	       struct ip_vs_proto_data *pd, int *ignored,
403	       struct ip_vs_iphdr *iph)
404{
405	struct ip_vs_protocol *pp = pd->pp;
406	struct ip_vs_conn *cp = NULL;
407	struct ip_vs_scheduler *sched;
408	struct ip_vs_dest *dest;
409	__be16 _ports[2], *pptr;
410	unsigned int flags;
411
412	*ignored = 1;
413	/*
414	 * IPv6 frags, only the first hit here.
415	 */
416	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
417	if (pptr == NULL)
418		return NULL;
419
420	/*
421	 * FTPDATA needs this check when using local real server.
422	 * Never schedule Active FTPDATA connections from real server.
423	 * For LVS-NAT they must be already created. For other methods
424	 * with persistence the connection is created on SYN+ACK.
425	 */
426	if (pptr[0] == FTPDATA) {
427		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
428			      "Not scheduling FTPDATA");
429		return NULL;
430	}
431
432	/*
433	 *    Do not schedule replies from local real server.
434	 */
435	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
436	    (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
437		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
438			      "Not scheduling reply for existing connection");
439		__ip_vs_conn_put(cp);
440		return NULL;
441	}
442
443	/*
444	 *    Persistent service
445	 */
446	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
447		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
448					   iph);
449
450	*ignored = 0;
451
452	/*
453	 *    Non-persistent service
454	 */
455	if (!svc->fwmark && pptr[1] != svc->port) {
456		if (!svc->port)
457			pr_err("Schedule: port zero only supported "
458			       "in persistent services, "
459			       "check your ipvs configuration\n");
460		return NULL;
461	}
462
463	sched = rcu_dereference(svc->scheduler);
464	dest = sched->schedule(svc, skb, iph);
465	if (dest == NULL) {
466		IP_VS_DBG(1, "Schedule: no dest found.\n");
467		return NULL;
468	}
469
470	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
471		 && iph->protocol == IPPROTO_UDP) ?
472		IP_VS_CONN_F_ONE_PACKET : 0;
473
474	/*
475	 *    Create a connection entry.
476	 */
477	{
478		struct ip_vs_conn_param p;
479
480		ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
481				      &iph->saddr, pptr[0], &iph->daddr,
482				      pptr[1], &p);
483		cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
484				    dest->port ? dest->port : pptr[1],
485				    flags, dest, skb->mark);
486		if (!cp) {
487			*ignored = -1;
488			return NULL;
489		}
490	}
491
492	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
493		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
494		      ip_vs_fwd_tag(cp),
495		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
496		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
497		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
498		      cp->flags, atomic_read(&cp->refcnt));
499
500	ip_vs_conn_stats(cp, svc);
501	return cp;
502}
503
504
505/*
506 *  Pass or drop the packet.
507 *  Called by ip_vs_in, when the virtual service is available but
508 *  no destination is available for a new connection.
509 */
510int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
511		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
512{
513	__be16 _ports[2], *pptr;
514#ifdef CONFIG_SYSCTL
515	struct net *net;
516	struct netns_ipvs *ipvs;
517	int unicast;
518#endif
519
520	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
521	if (pptr == NULL) {
522		return NF_DROP;
523	}
524
525#ifdef CONFIG_SYSCTL
526	net = skb_net(skb);
527
528#ifdef CONFIG_IP_VS_IPV6
529	if (svc->af == AF_INET6)
530		unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
531	else
532#endif
533		unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
534
535	/* if it is fwmark-based service, the cache_bypass sysctl is up
536	   and the destination is a non-local unicast, then create
537	   a cache_bypass connection entry */
538	ipvs = net_ipvs(net);
539	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
540		int ret;
541		struct ip_vs_conn *cp;
542		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
543				      iph->protocol == IPPROTO_UDP) ?
544				      IP_VS_CONN_F_ONE_PACKET : 0;
545		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
546
547		/* create a new connection entry */
548		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
549		{
550			struct ip_vs_conn_param p;
551			ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
552					      &iph->saddr, pptr[0],
553					      &iph->daddr, pptr[1], &p);
554			cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
555					    IP_VS_CONN_F_BYPASS | flags,
556					    NULL, skb->mark);
557			if (!cp)
558				return NF_DROP;
559		}
560
561		/* statistics */
562		ip_vs_in_stats(cp, skb);
563
564		/* set state */
565		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
566
567		/* transmit the first SYN packet */
568		ret = cp->packet_xmit(skb, cp, pd->pp, iph);
569		/* do not touch skb anymore */
570
571		atomic_inc(&cp->in_pkts);
572		ip_vs_conn_put(cp);
573		return ret;
574	}
575#endif
576
577	/*
578	 * When the virtual ftp service is presented, packets destined
579	 * for other services on the VIP may get here (except services
580	 * listed in the ipvs table), pass the packets, because it is
581	 * not ipvs job to decide to drop the packets.
582	 */
583	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
584		return NF_ACCEPT;
585
586	/*
587	 * Notify the client that the destination is unreachable, and
588	 * release the socket buffer.
589	 * Since it is in IP layer, the TCP socket is not actually
590	 * created, the TCP RST packet cannot be sent, instead that
591	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
592	 */
593#ifdef CONFIG_IP_VS_IPV6
594	if (svc->af == AF_INET6) {
595		if (!skb->dev) {
596			struct net *net_ = dev_net(skb_dst(skb)->dev);
597
598			skb->dev = net_->loopback_dev;
599		}
600		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
601	} else
602#endif
603		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
604
605	return NF_DROP;
606}
607
/*
 * Accessors for the per-netns sysctl knobs used in this file.
 * Without CONFIG_SYSCTL every knob reads as 0.
 */
#ifdef CONFIG_SYSCTL

/* snat_reroute setting of the netns the skb belongs to */
static int sysctl_snat_reroute(struct sk_buff *skb)
{
	return net_ipvs(skb_net(skb))->sysctl_snat_reroute;
}

/* nat_icmp_send setting of @net */
static int sysctl_nat_icmp_send(struct net *net)
{
	return net_ipvs(net)->sysctl_nat_icmp_send;
}

/* expire_nodest_conn setting of @ipvs */
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_expire_nodest_conn;
}

#else

static int sysctl_snat_reroute(struct sk_buff *skb)
{
	return 0;
}

static int sysctl_nat_icmp_send(struct net *net)
{
	return 0;
}

static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return 0;
}

#endif
634
635__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
636{
637	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
638}
639
640static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
641{
642	if (NF_INET_LOCAL_IN == hooknum)
643		return IP_DEFRAG_VS_IN;
644	if (NF_INET_FORWARD == hooknum)
645		return IP_DEFRAG_VS_FWD;
646	return IP_DEFRAG_VS_OUT;
647}
648
649static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
650{
651	int err;
652
653	local_bh_disable();
654	err = ip_defrag(skb, user);
655	local_bh_enable();
656	if (!err)
657		ip_send_check(ip_hdr(skb));
658
659	return err;
660}
661
662static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
663{
664#ifdef CONFIG_IP_VS_IPV6
665	if (af == AF_INET6) {
666		if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
667			return 1;
668	} else
669#endif
670		if ((sysctl_snat_reroute(skb) ||
671		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
672		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
673			return 1;
674
675	return 0;
676}
677
678/*
679 * Packet has been made sufficiently writable in caller
680 * - inout: 1=in->out, 0=out->in
681 */
682void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
683		    struct ip_vs_conn *cp, int inout)
684{
685	struct iphdr *iph	 = ip_hdr(skb);
686	unsigned int icmp_offset = iph->ihl*4;
687	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
688						      icmp_offset);
689	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
690
691	if (inout) {
692		iph->saddr = cp->vaddr.ip;
693		ip_send_check(iph);
694		ciph->daddr = cp->vaddr.ip;
695		ip_send_check(ciph);
696	} else {
697		iph->daddr = cp->daddr.ip;
698		ip_send_check(iph);
699		ciph->saddr = cp->daddr.ip;
700		ip_send_check(ciph);
701	}
702
703	/* the TCP/UDP/SCTP port */
704	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
705	    IPPROTO_SCTP == ciph->protocol) {
706		__be16 *ports = (void *)ciph + ciph->ihl*4;
707
708		if (inout)
709			ports[1] = cp->vport;
710		else
711			ports[0] = cp->dport;
712	}
713
714	/* And finally the ICMP checksum */
715	icmph->checksum = 0;
716	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
717	skb->ip_summed = CHECKSUM_UNNECESSARY;
718
719	if (inout)
720		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
721			"Forwarding altered outgoing ICMP");
722	else
723		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
724			"Forwarding altered incoming ICMP");
725}
726
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_nat_icmp(): rewrite the outer IPv6 header,
 * the IPv6 header embedded in the ICMPv6 payload and, for unfragmented
 * TCP/UDP/SCTP, the matching embedded port.  The ICMPv6 checksum is left
 * as a pseudo-header sum with CHECKSUM_PARTIAL set on the skb.
 * - inout: 1=in->out, 0=out->in
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *outer = ipv6_hdr(skb);
	struct ipv6hdr *inner;
	struct icmp6hdr *icmph;
	unsigned int icmp_offset = 0;
	unsigned int l4_offset;		/* header offset */
	unsigned short fragoffs;
	int protocol;

	/* icmp_offset is used below as the offset of the ICMPv6 header */
	ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
	l4_offset = icmp_offset + sizeof(struct icmp6hdr);
	inner = (struct ipv6hdr *)(skb_network_header(skb) + l4_offset);

	/* walk the embedded packet; l4_offset is then used for the ports */
	protocol = ipv6_find_hdr(skb, &l4_offset, -1, &fragoffs, NULL);

	if (inout) {
		outer->saddr = cp->vaddr.in6;
		inner->daddr = cp->vaddr.in6;
	} else {
		outer->daddr = cp->daddr.in6;
		inner->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	if (!fragoffs && (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
			  protocol == IPPROTO_SCTP)) {
		__be16 *ports = (void *)(skb_network_header(skb) + l4_offset);

		IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
			      ntohs(inout ? ports[1] : ports[0]),
			      ntohs(inout ? cp->vport : cp->dport));
		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&outer->saddr, &outer->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)inner - (void *)outer,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)inner - (void *)outer,
			      "Forwarding altered incoming ICMPv6");
}
#endif
786
787/* Handle relevant response ICMP messages - forward to the right
788 * destination host.
789 */
790static int handle_response_icmp(int af, struct sk_buff *skb,
791				union nf_inet_addr *snet,
792				__u8 protocol, struct ip_vs_conn *cp,
793				struct ip_vs_protocol *pp,
794				unsigned int offset, unsigned int ihl)
795{
796	unsigned int verdict = NF_DROP;
797
798	if (IP_VS_FWD_METHOD(cp) != 0) {
799		pr_err("shouldn't reach here, because the box is on the "
800		       "half connection in the tun/dr module.\n");
801	}
802
803	/* Ensure the checksum is correct */
804	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
805		/* Failed checksum! */
806		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
807			      IP_VS_DBG_ADDR(af, snet));
808		goto out;
809	}
810
811	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
812	    IPPROTO_SCTP == protocol)
813		offset += 2 * sizeof(__u16);
814	if (!skb_make_writable(skb, offset))
815		goto out;
816
817#ifdef CONFIG_IP_VS_IPV6
818	if (af == AF_INET6)
819		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
820	else
821#endif
822		ip_vs_nat_icmp(skb, pp, cp, 1);
823
824	if (ip_vs_route_me_harder(af, skb))
825		goto out;
826
827	/* do the statistics and put it back */
828	ip_vs_out_stats(cp, skb);
829
830	skb->ipvs_property = 1;
831	if (!(cp->flags & IP_VS_CONN_F_NFCT))
832		ip_vs_notrack(skb);
833	else
834		ip_vs_update_conntrack(skb, cp, 0);
835	verdict = NF_ACCEPT;
836
837out:
838	__ip_vs_conn_put(cp);
839
840	return verdict;
841}
842
843/*
844 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
845 *	Find any that might be relevant, check against existing connections.
846 *	Currently handles error types - unreachable, quench, ttl exceeded.
847 */
848static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
849			  unsigned int hooknum)
850{
851	struct iphdr *iph;
852	struct icmphdr	_icmph, *ic;
853	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
854	struct ip_vs_iphdr ciph;
855	struct ip_vs_conn *cp;
856	struct ip_vs_protocol *pp;
857	unsigned int offset, ihl;
858	union nf_inet_addr snet;
859
860	*related = 1;
861
862	/* reassemble IP fragments */
863	if (ip_is_fragment(ip_hdr(skb))) {
864		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
865			return NF_STOLEN;
866	}
867
868	iph = ip_hdr(skb);
869	offset = ihl = iph->ihl * 4;
870	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
871	if (ic == NULL)
872		return NF_DROP;
873
874	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
875		  ic->type, ntohs(icmp_id(ic)),
876		  &iph->saddr, &iph->daddr);
877
878	/*
879	 * Work through seeing if this is for us.
880	 * These checks are supposed to be in an order that means easy
881	 * things are checked first to speed up processing.... however
882	 * this means that some packets will manage to get a long way
883	 * down this stack and then be rejected, but that's life.
884	 */
885	if ((ic->type != ICMP_DEST_UNREACH) &&
886	    (ic->type != ICMP_SOURCE_QUENCH) &&
887	    (ic->type != ICMP_TIME_EXCEEDED)) {
888		*related = 0;
889		return NF_ACCEPT;
890	}
891
892	/* Now find the contained IP header */
893	offset += sizeof(_icmph);
894	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
895	if (cih == NULL)
896		return NF_ACCEPT; /* The packet looks wrong, ignore */
897
898	pp = ip_vs_proto_get(cih->protocol);
899	if (!pp)
900		return NF_ACCEPT;
901
902	/* Is the embedded protocol header present? */
903	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
904		     pp->dont_defrag))
905		return NF_ACCEPT;
906
907	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
908		      "Checking outgoing ICMP for");
909
910	ip_vs_fill_ip4hdr(cih, &ciph);
911	ciph.len += offset;
912	/* The embedded headers contain source and dest in reverse order */
913	cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
914	if (!cp)
915		return NF_ACCEPT;
916
917	snet.ip = iph->saddr;
918	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
919				    pp, ciph.len, ihl);
920}
921
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_out_icmp(): handle outgoing ICMPv6 error
 * messages that relate to an existing connection.
 *
 * *related is set to 0 only for informational ICMPv6 messages; it stays
 * 1 on every other path.
 */
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
			     unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
{
	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
	struct icmp6hdr _icmph, *ic;
	struct ipv6hdr _inner, *inner;	/* IPv6 header carried inside ICMPv6 */
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	union nf_inet_addr snet;

	*related = 1;
	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
	if (!ic)
		return NF_DROP;

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Fragment header that is before ICMP header tells us that:
	 * it's not an error message since they can't be fragmented.
	 */
	if (ipvsh->flags & IP6_FH_F_FRAG)
		return NF_DROP;

	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &ipvsh->saddr, &ipvsh->daddr);

	/* Now find the contained IP header */
	ciph.len = ipvsh->len + sizeof(_icmph);
	inner = skb_header_pointer(skb, ciph.len, sizeof(_inner), &_inner);
	if (!inner)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	/* conn_out_get() handles reverse order */
	ciph.saddr.in6 = inner->saddr;
	ciph.daddr.in6 = inner->daddr;

	/* skip possible IPv6 exthdrs of contained IPv6 packet */
	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
	if (ciph.protocol < 0)
		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */

	pp = ip_vs_proto_get(ciph.protocol);
	if (!pp)
		return NF_ACCEPT;

	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
	if (!cp)
		return NF_ACCEPT;

	snet.in6 = ciph.saddr.in6;
	/* everything up to the embedded transport header must be writable */
	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
				    pp, ciph.len, sizeof(struct ipv6hdr));
}
#endif
987
988/*
989 * Check if sctp chunc is ABORT chunk
990 */
991static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
992{
993	sctp_chunkhdr_t *sch, schunk;
994	sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
995			sizeof(schunk), &schunk);
996	if (sch == NULL)
997		return 0;
998	if (sch->type == SCTP_CID_ABORT)
999		return 1;
1000	return 0;
1001}
1002
1003static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1004{
1005	struct tcphdr _tcph, *th;
1006
1007	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1008	if (th == NULL)
1009		return 0;
1010	return th->rst;
1011}
1012
1013static inline bool is_new_conn(const struct sk_buff *skb,
1014			       struct ip_vs_iphdr *iph)
1015{
1016	switch (iph->protocol) {
1017	case IPPROTO_TCP: {
1018		struct tcphdr _tcph, *th;
1019
1020		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
1021		if (th == NULL)
1022			return false;
1023		return th->syn;
1024	}
1025	case IPPROTO_SCTP: {
1026		sctp_chunkhdr_t *sch, schunk;
1027
1028		sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
1029					 sizeof(schunk), &schunk);
1030		if (sch == NULL)
1031			return false;
1032		return sch->type == SCTP_CID_INIT;
1033	}
1034	default:
1035		return false;
1036	}
1037}
1038
1039/* Handle response packets: rewrite addresses and send away...
1040 */
1041static unsigned int
1042handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
1043		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
1044{
1045	struct ip_vs_protocol *pp = pd->pp;
1046
1047	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
1048
1049	if (!skb_make_writable(skb, iph->len))
1050		goto drop;
1051
1052	/* mangle the packet */
1053	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
1054		goto drop;
1055
1056#ifdef CONFIG_IP_VS_IPV6
1057	if (af == AF_INET6)
1058		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1059	else
1060#endif
1061	{
1062		ip_hdr(skb)->saddr = cp->vaddr.ip;
1063		ip_send_check(ip_hdr(skb));
1064	}
1065
1066	/*
1067	 * nf_iterate does not expect change in the skb->dst->dev.
1068	 * It looks like it is not fatal to enable this code for hooks
1069	 * where our handlers are at the end of the chain list and
1070	 * when all next handlers use skb->dst->dev and not outdev.
1071	 * It will definitely route properly the inout NAT traffic
1072	 * when multiple paths are used.
1073	 */
1074
1075	/* For policy routing, packets originating from this
1076	 * machine itself may be routed differently to packets
1077	 * passing through.  We want this packet to be routed as
1078	 * if it came from this machine itself.  So re-compute
1079	 * the routing information.
1080	 */
1081	if (ip_vs_route_me_harder(af, skb))
1082		goto drop;
1083
1084	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
1085
1086	ip_vs_out_stats(cp, skb);
1087	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1088	skb->ipvs_property = 1;
1089	if (!(cp->flags & IP_VS_CONN_F_NFCT))
1090		ip_vs_notrack(skb);
1091	else
1092		ip_vs_update_conntrack(skb, cp, 0);
1093	ip_vs_conn_put(cp);
1094
1095	LeaveFunction(11);
1096	return NF_ACCEPT;
1097
1098drop:
1099	ip_vs_conn_put(cp);
1100	kfree_skb(skb);
1101	LeaveFunction(11);
1102	return NF_STOLEN;
1103}
1104
1105/*
1106 *	Check if outgoing packet belongs to the established ip_vs_conn.
1107 */
1108static unsigned int
1109ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1110{
1111	struct net *net = NULL;
1112	struct ip_vs_iphdr iph;
1113	struct ip_vs_protocol *pp;
1114	struct ip_vs_proto_data *pd;
1115	struct ip_vs_conn *cp;
1116
1117	EnterFunction(11);
1118
1119	/* Already marked as IPVS request or reply? */
1120	if (skb->ipvs_property)
1121		return NF_ACCEPT;
1122
1123	/* Bad... Do not break raw sockets */
1124	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1125		     af == AF_INET)) {
1126		struct sock *sk = skb->sk;
1127		struct inet_sock *inet = inet_sk(skb->sk);
1128
1129		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1130			return NF_ACCEPT;
1131	}
1132
1133	if (unlikely(!skb_dst(skb)))
1134		return NF_ACCEPT;
1135
1136	net = skb_net(skb);
1137	if (!net_ipvs(net)->enable)
1138		return NF_ACCEPT;
1139
1140	ip_vs_fill_iph_skb(af, skb, &iph);
1141#ifdef CONFIG_IP_VS_IPV6
1142	if (af == AF_INET6) {
1143		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1144			int related;
1145			int verdict = ip_vs_out_icmp_v6(skb, &related,
1146							hooknum, &iph);
1147
1148			if (related)
1149				return verdict;
1150		}
1151	} else
1152#endif
1153		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1154			int related;
1155			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
1156
1157			if (related)
1158				return verdict;
1159		}
1160
1161	pd = ip_vs_proto_data_get(net, iph.protocol);
1162	if (unlikely(!pd))
1163		return NF_ACCEPT;
1164	pp = pd->pp;
1165
1166	/* reassemble IP fragments */
1167#ifdef CONFIG_IP_VS_IPV6
1168	if (af == AF_INET)
1169#endif
1170		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1171			if (ip_vs_gather_frags(skb,
1172					       ip_vs_defrag_user(hooknum)))
1173				return NF_STOLEN;
1174
1175			ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
1176		}
1177
1178	/*
1179	 * Check if the packet belongs to an existing entry
1180	 */
1181	cp = pp->conn_out_get(af, skb, &iph, 0);
1182
1183	if (likely(cp))
1184		return handle_response(af, skb, pd, cp, &iph);
1185	if (sysctl_nat_icmp_send(net) &&
1186	    (pp->protocol == IPPROTO_TCP ||
1187	     pp->protocol == IPPROTO_UDP ||
1188	     pp->protocol == IPPROTO_SCTP)) {
1189		__be16 _ports[2], *pptr;
1190
1191		pptr = frag_safe_skb_hp(skb, iph.len,
1192					 sizeof(_ports), _ports, &iph);
1193		if (pptr == NULL)
1194			return NF_ACCEPT;	/* Not for me */
1195		if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
1196					   pptr[0])) {
1197			/*
1198			 * Notify the real server: there is no
1199			 * existing entry if it is not RST
1200			 * packet or not TCP packet.
1201			 */
1202			if ((iph.protocol != IPPROTO_TCP &&
1203			     iph.protocol != IPPROTO_SCTP)
1204			     || ((iph.protocol == IPPROTO_TCP
1205				  && !is_tcp_reset(skb, iph.len))
1206				 || (iph.protocol == IPPROTO_SCTP
1207					&& !is_sctp_abort(skb,
1208						iph.len)))) {
1209#ifdef CONFIG_IP_VS_IPV6
1210				if (af == AF_INET6) {
1211					if (!skb->dev)
1212						skb->dev = net->loopback_dev;
1213					icmpv6_send(skb,
1214						    ICMPV6_DEST_UNREACH,
1215						    ICMPV6_PORT_UNREACH,
1216						    0);
1217				} else
1218#endif
1219					icmp_send(skb,
1220						  ICMP_DEST_UNREACH,
1221						  ICMP_PORT_UNREACH, 0);
1222				return NF_DROP;
1223			}
1224		}
1225	}
1226	IP_VS_DBG_PKT(12, af, pp, skb, 0,
1227		      "ip_vs_out: packet continues traversal as normal");
1228	return NF_ACCEPT;
1229}
1230
1231/*
1232 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1233 *	used only for VS/NAT.
1234 *	Check if packet is reply for established ip_vs_conn.
1235 */
1236static unsigned int
1237ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
1238	     const struct net_device *in, const struct net_device *out,
1239	     int (*okfn)(struct sk_buff *))
1240{
1241	return ip_vs_out(ops->hooknum, skb, AF_INET);
1242}
1243
1244/*
1245 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1246 *	Check if packet is reply for established ip_vs_conn.
1247 */
1248static unsigned int
1249ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
1250		   const struct net_device *in, const struct net_device *out,
1251		   int (*okfn)(struct sk_buff *))
1252{
1253	return ip_vs_out(ops->hooknum, skb, AF_INET);
1254}
1255
1256#ifdef CONFIG_IP_VS_IPV6
1257
1258/*
1259 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1260 *	used only for VS/NAT.
1261 *	Check if packet is reply for established ip_vs_conn.
1262 */
1263static unsigned int
1264ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
1265	     const struct net_device *in, const struct net_device *out,
1266	     int (*okfn)(struct sk_buff *))
1267{
1268	return ip_vs_out(ops->hooknum, skb, AF_INET6);
1269}
1270
1271/*
1272 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1273 *	Check if packet is reply for established ip_vs_conn.
1274 */
1275static unsigned int
1276ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
1277		   const struct net_device *in, const struct net_device *out,
1278		   int (*okfn)(struct sk_buff *))
1279{
1280	return ip_vs_out(ops->hooknum, skb, AF_INET6);
1281}
1282
1283#endif
1284
1285/*
1286 *	Handle ICMP messages in the outside-to-inside direction (incoming).
1287 *	Find any that might be relevant, check against existing connections,
1288 *	forward to the right destination host if relevant.
1289 *	Currently handles error types - unreachable, quench, ttl exceeded.
1290 */
1291static int
1292ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1293{
1294	struct net *net = NULL;
1295	struct iphdr *iph;
1296	struct icmphdr	_icmph, *ic;
1297	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
1298	struct ip_vs_iphdr ciph;
1299	struct ip_vs_conn *cp;
1300	struct ip_vs_protocol *pp;
1301	struct ip_vs_proto_data *pd;
1302	unsigned int offset, offset2, ihl, verdict;
1303	bool ipip;
1304
1305	*related = 1;
1306
1307	/* reassemble IP fragments */
1308	if (ip_is_fragment(ip_hdr(skb))) {
1309		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1310			return NF_STOLEN;
1311	}
1312
1313	iph = ip_hdr(skb);
1314	offset = ihl = iph->ihl * 4;
1315	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1316	if (ic == NULL)
1317		return NF_DROP;
1318
1319	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1320		  ic->type, ntohs(icmp_id(ic)),
1321		  &iph->saddr, &iph->daddr);
1322
1323	/*
1324	 * Work through seeing if this is for us.
1325	 * These checks are supposed to be in an order that means easy
1326	 * things are checked first to speed up processing.... however
1327	 * this means that some packets will manage to get a long way
1328	 * down this stack and then be rejected, but that's life.
1329	 */
1330	if ((ic->type != ICMP_DEST_UNREACH) &&
1331	    (ic->type != ICMP_SOURCE_QUENCH) &&
1332	    (ic->type != ICMP_TIME_EXCEEDED)) {
1333		*related = 0;
1334		return NF_ACCEPT;
1335	}
1336
1337	/* Now find the contained IP header */
1338	offset += sizeof(_icmph);
1339	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1340	if (cih == NULL)
1341		return NF_ACCEPT; /* The packet looks wrong, ignore */
1342
1343	net = skb_net(skb);
1344
1345	/* Special case for errors for IPIP packets */
1346	ipip = false;
1347	if (cih->protocol == IPPROTO_IPIP) {
1348		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1349			return NF_ACCEPT;
1350		/* Error for our IPIP must arrive at LOCAL_IN */
1351		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1352			return NF_ACCEPT;
1353		offset += cih->ihl * 4;
1354		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1355		if (cih == NULL)
1356			return NF_ACCEPT; /* The packet looks wrong, ignore */
1357		ipip = true;
1358	}
1359
1360	pd = ip_vs_proto_data_get(net, cih->protocol);
1361	if (!pd)
1362		return NF_ACCEPT;
1363	pp = pd->pp;
1364
1365	/* Is the embedded protocol header present? */
1366	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1367		     pp->dont_defrag))
1368		return NF_ACCEPT;
1369
1370	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1371		      "Checking incoming ICMP for");
1372
1373	offset2 = offset;
1374	ip_vs_fill_ip4hdr(cih, &ciph);
1375	ciph.len += offset;
1376	offset = ciph.len;
1377	/* The embedded headers contain source and dest in reverse order.
1378	 * For IPIP this is error for request, not for reply.
1379	 */
1380	cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
1381	if (!cp)
1382		return NF_ACCEPT;
1383
1384	verdict = NF_DROP;
1385
1386	/* Ensure the checksum is correct */
1387	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1388		/* Failed checksum! */
1389		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1390			  &iph->saddr);
1391		goto out;
1392	}
1393
1394	if (ipip) {
1395		__be32 info = ic->un.gateway;
1396		__u8 type = ic->type;
1397		__u8 code = ic->code;
1398
1399		/* Update the MTU */
1400		if (ic->type == ICMP_DEST_UNREACH &&
1401		    ic->code == ICMP_FRAG_NEEDED) {
1402			struct ip_vs_dest *dest = cp->dest;
1403			u32 mtu = ntohs(ic->un.frag.mtu);
1404			__be16 frag_off = cih->frag_off;
1405
1406			/* Strip outer IP and ICMP, go to IPIP header */
1407			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
1408				goto ignore_ipip;
1409			offset2 -= ihl + sizeof(_icmph);
1410			skb_reset_network_header(skb);
1411			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
1412				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
1413			ipv4_update_pmtu(skb, dev_net(skb->dev),
1414					 mtu, 0, 0, 0, 0);
1415			/* Client uses PMTUD? */
1416			if (!(frag_off & htons(IP_DF)))
1417				goto ignore_ipip;
1418			/* Prefer the resulting PMTU */
1419			if (dest) {
1420				struct ip_vs_dest_dst *dest_dst;
1421
1422				rcu_read_lock();
1423				dest_dst = rcu_dereference(dest->dest_dst);
1424				if (dest_dst)
1425					mtu = dst_mtu(dest_dst->dst_cache);
1426				rcu_read_unlock();
1427			}
1428			if (mtu > 68 + sizeof(struct iphdr))
1429				mtu -= sizeof(struct iphdr);
1430			info = htonl(mtu);
1431		}
1432		/* Strip outer IP, ICMP and IPIP, go to IP header of
1433		 * original request.
1434		 */
1435		if (pskb_pull(skb, offset2) == NULL)
1436			goto ignore_ipip;
1437		skb_reset_network_header(skb);
1438		IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1439			&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1440			type, code, ntohl(info));
1441		icmp_send(skb, type, code, info);
1442		/* ICMP can be shorter but anyways, account it */
1443		ip_vs_out_stats(cp, skb);
1444
1445ignore_ipip:
1446		consume_skb(skb);
1447		verdict = NF_STOLEN;
1448		goto out;
1449	}
1450
1451	/* do the statistics and put it back */
1452	ip_vs_in_stats(cp, skb);
1453	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
1454	    IPPROTO_SCTP == cih->protocol)
1455		offset += 2 * sizeof(__u16);
1456	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
1457
1458out:
1459	__ip_vs_conn_put(cp);
1460
1461	return verdict;
1462}
1463
1464#ifdef CONFIG_IP_VS_IPV6
1465static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
1466			    unsigned int hooknum, struct ip_vs_iphdr *iph)
1467{
1468	struct net *net = NULL;
1469	struct ipv6hdr _ip6h, *ip6h;
1470	struct icmp6hdr	_icmph, *ic;
1471	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
1472	struct ip_vs_conn *cp;
1473	struct ip_vs_protocol *pp;
1474	struct ip_vs_proto_data *pd;
1475	unsigned int offs_ciph, writable, verdict;
1476
1477	*related = 1;
1478
1479	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
1480	if (ic == NULL)
1481		return NF_DROP;
1482
1483	/*
1484	 * Work through seeing if this is for us.
1485	 * These checks are supposed to be in an order that means easy
1486	 * things are checked first to speed up processing.... however
1487	 * this means that some packets will manage to get a long way
1488	 * down this stack and then be rejected, but that's life.
1489	 */
1490	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1491		*related = 0;
1492		return NF_ACCEPT;
1493	}
1494	/* Fragment header that is before ICMP header tells us that:
1495	 * it's not an error message since they can't be fragmented.
1496	 */
1497	if (iph->flags & IP6_FH_F_FRAG)
1498		return NF_DROP;
1499
1500	IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1501		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1502		  &iph->saddr, &iph->daddr);
1503
1504	/* Now find the contained IP header */
1505	ciph.len = iph->len + sizeof(_icmph);
1506	offs_ciph = ciph.len; /* Save ip header offset */
1507	ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
1508	if (ip6h == NULL)
1509		return NF_ACCEPT; /* The packet looks wrong, ignore */
1510	ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
1511	ciph.daddr.in6 = ip6h->daddr;
1512	/* skip possible IPv6 exthdrs of contained IPv6 packet */
1513	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
1514	if (ciph.protocol < 0)
1515		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
1516
1517	net = skb_net(skb);
1518	pd = ip_vs_proto_data_get(net, ciph.protocol);
1519	if (!pd)
1520		return NF_ACCEPT;
1521	pp = pd->pp;
1522
1523	/* Cannot handle fragmented embedded protocol */
1524	if (ciph.fragoffs)
1525		return NF_ACCEPT;
1526
1527	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
1528		      "Checking incoming ICMPv6 for");
1529
1530	/* The embedded headers contain source and dest in reverse order
1531	 * if not from localhost
1532	 */
1533	cp = pp->conn_in_get(AF_INET6, skb, &ciph,
1534			     (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
1535
1536	if (!cp)
1537		return NF_ACCEPT;
1538	/* VS/TUN, VS/DR and LOCALNODE just let it go */
1539	if ((hooknum == NF_INET_LOCAL_OUT) &&
1540	    (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
1541		__ip_vs_conn_put(cp);
1542		return NF_ACCEPT;
1543	}
1544
1545	/* do the statistics and put it back */
1546	ip_vs_in_stats(cp, skb);
1547
1548	/* Need to mangle contained IPv6 header in ICMPv6 packet */
1549	writable = ciph.len;
1550	if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
1551	    IPPROTO_SCTP == ciph.protocol)
1552		writable += 2 * sizeof(__u16); /* Also mangle ports */
1553
1554	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
1555
1556	__ip_vs_conn_put(cp);
1557
1558	return verdict;
1559}
1560#endif
1561
1562
1563/*
1564 *	Check if it's for virtual services, look it up,
1565 *	and send it on its way...
1566 */
1567static unsigned int
1568ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1569{
1570	struct net *net;
1571	struct ip_vs_iphdr iph;
1572	struct ip_vs_protocol *pp;
1573	struct ip_vs_proto_data *pd;
1574	struct ip_vs_conn *cp;
1575	int ret, pkts;
1576	struct netns_ipvs *ipvs;
1577
1578	/* Already marked as IPVS request or reply? */
1579	if (skb->ipvs_property)
1580		return NF_ACCEPT;
1581
1582	/*
1583	 *	Big tappo:
1584	 *	- remote client: only PACKET_HOST
1585	 *	- route: used for struct net when skb->dev is unset
1586	 */
1587	if (unlikely((skb->pkt_type != PACKET_HOST &&
1588		      hooknum != NF_INET_LOCAL_OUT) ||
1589		     !skb_dst(skb))) {
1590		ip_vs_fill_iph_skb(af, skb, &iph);
1591		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1592			      " ignored in hook %u\n",
1593			      skb->pkt_type, iph.protocol,
1594			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1595		return NF_ACCEPT;
1596	}
1597	/* ipvs enabled in this netns ? */
1598	net = skb_net(skb);
1599	ipvs = net_ipvs(net);
1600	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
1601		return NF_ACCEPT;
1602
1603	ip_vs_fill_iph_skb(af, skb, &iph);
1604
1605	/* Bad... Do not break raw sockets */
1606	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1607		     af == AF_INET)) {
1608		struct sock *sk = skb->sk;
1609		struct inet_sock *inet = inet_sk(skb->sk);
1610
1611		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1612			return NF_ACCEPT;
1613	}
1614
1615#ifdef CONFIG_IP_VS_IPV6
1616	if (af == AF_INET6) {
1617		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1618			int related;
1619			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
1620						       &iph);
1621
1622			if (related)
1623				return verdict;
1624		}
1625	} else
1626#endif
1627		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1628			int related;
1629			int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1630
1631			if (related)
1632				return verdict;
1633		}
1634
1635	/* Protocol supported? */
1636	pd = ip_vs_proto_data_get(net, iph.protocol);
1637	if (unlikely(!pd))
1638		return NF_ACCEPT;
1639	pp = pd->pp;
1640	/*
1641	 * Check if the packet belongs to an existing connection entry
1642	 */
1643	cp = pp->conn_in_get(af, skb, &iph, 0);
1644
1645	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
1646	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
1647	    is_new_conn(skb, &iph)) {
1648		ip_vs_conn_expire_now(cp);
1649		__ip_vs_conn_put(cp);
1650		cp = NULL;
1651	}
1652
1653	if (unlikely(!cp) && !iph.fragoffs) {
1654		/* No (second) fragments need to enter here, as nf_defrag_ipv6
1655		 * replayed fragment zero will already have created the cp
1656		 */
1657		int v;
1658
1659		/* Schedule and create new connection entry into &cp */
1660		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
1661			return v;
1662	}
1663
1664	if (unlikely(!cp)) {
1665		/* sorry, all this trouble for a no-hit :) */
1666		IP_VS_DBG_PKT(12, af, pp, skb, 0,
1667			      "ip_vs_in: packet continues traversal as normal");
1668		if (iph.fragoffs) {
1669			/* Fragment that couldn't be mapped to a conn entry
1670			 * is missing module nf_defrag_ipv6
1671			 */
1672			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
1673			IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
1674		}
1675		return NF_ACCEPT;
1676	}
1677
1678	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1679	/* Check the server status */
1680	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1681		/* the destination server is not available */
1682
1683		if (sysctl_expire_nodest_conn(ipvs)) {
1684			/* try to expire the connection immediately */
1685			ip_vs_conn_expire_now(cp);
1686		}
1687		/* don't restart its timer, and silently
1688		   drop the packet. */
1689		__ip_vs_conn_put(cp);
1690		return NF_DROP;
1691	}
1692
1693	ip_vs_in_stats(cp, skb);
1694	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
1695	if (cp->packet_xmit)
1696		ret = cp->packet_xmit(skb, cp, pp, &iph);
1697		/* do not touch skb anymore */
1698	else {
1699		IP_VS_DBG_RL("warning: packet_xmit is null");
1700		ret = NF_ACCEPT;
1701	}
1702
1703	/* Increase its packet counter and check if it is needed
1704	 * to be synchronized
1705	 *
1706	 * Sync connection if it is about to close to
1707	 * encorage the standby servers to update the connections timeout
1708	 *
1709	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
1710	 */
1711
1712	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1713		pkts = sysctl_sync_threshold(ipvs);
1714	else
1715		pkts = atomic_add_return(1, &cp->in_pkts);
1716
1717	if (ipvs->sync_state & IP_VS_STATE_MASTER)
1718		ip_vs_sync_conn(net, cp, pkts);
1719
1720	ip_vs_conn_put(cp);
1721	return ret;
1722}
1723
1724/*
1725 *	AF_INET handler in NF_INET_LOCAL_IN chain
1726 *	Schedule and forward packets from remote clients
1727 */
1728static unsigned int
1729ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
1730		      const struct net_device *in,
1731		      const struct net_device *out,
1732		      int (*okfn)(struct sk_buff *))
1733{
1734	return ip_vs_in(ops->hooknum, skb, AF_INET);
1735}
1736
1737/*
1738 *	AF_INET handler in NF_INET_LOCAL_OUT chain
1739 *	Schedule and forward packets from local clients
1740 */
1741static unsigned int
1742ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
1743		     const struct net_device *in, const struct net_device *out,
1744		     int (*okfn)(struct sk_buff *))
1745{
1746	return ip_vs_in(ops->hooknum, skb, AF_INET);
1747}
1748
1749#ifdef CONFIG_IP_VS_IPV6
1750
1751/*
1752 *	AF_INET6 handler in NF_INET_LOCAL_IN chain
1753 *	Schedule and forward packets from remote clients
1754 */
1755static unsigned int
1756ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
1757		      const struct net_device *in,
1758		      const struct net_device *out,
1759		      int (*okfn)(struct sk_buff *))
1760{
1761	return ip_vs_in(ops->hooknum, skb, AF_INET6);
1762}
1763
1764/*
1765 *	AF_INET6 handler in NF_INET_LOCAL_OUT chain
1766 *	Schedule and forward packets from local clients
1767 */
1768static unsigned int
1769ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
1770		     const struct net_device *in, const struct net_device *out,
1771		     int (*okfn)(struct sk_buff *))
1772{
1773	return ip_vs_in(ops->hooknum, skb, AF_INET6);
1774}
1775
1776#endif
1777
1778
1779/*
1780 *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1781 *      related packets destined for 0.0.0.0/0.
1782 *      When fwmark-based virtual service is used, such as transparent
1783 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1784 *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1785 *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1786 *      and send them to ip_vs_in_icmp.
1787 */
1788static unsigned int
1789ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
1790		   const struct net_device *in, const struct net_device *out,
1791		   int (*okfn)(struct sk_buff *))
1792{
1793	int r;
1794	struct net *net;
1795	struct netns_ipvs *ipvs;
1796
1797	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1798		return NF_ACCEPT;
1799
1800	/* ipvs enabled in this netns ? */
1801	net = skb_net(skb);
1802	ipvs = net_ipvs(net);
1803	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
1804		return NF_ACCEPT;
1805
1806	return ip_vs_in_icmp(skb, &r, ops->hooknum);
1807}
1808
1809#ifdef CONFIG_IP_VS_IPV6
1810static unsigned int
1811ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
1812		      const struct net_device *in, const struct net_device *out,
1813		      int (*okfn)(struct sk_buff *))
1814{
1815	int r;
1816	struct net *net;
1817	struct netns_ipvs *ipvs;
1818	struct ip_vs_iphdr iphdr;
1819
1820	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
1821	if (iphdr.protocol != IPPROTO_ICMPV6)
1822		return NF_ACCEPT;
1823
1824	/* ipvs enabled in this netns ? */
1825	net = skb_net(skb);
1826	ipvs = net_ipvs(net);
1827	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
1828		return NF_ACCEPT;
1829
1830	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
1831}
1832#endif
1833
1834
/*
 *	Netfilter hooks registered by IPVS (see ip_vs_init()).
 *	Each family gets six entries: a reply and a request hook at
 *	LOCAL_IN, a reply and a request hook at LOCAL_OUT, and an ICMP
 *	and a reply hook at FORWARD. On a given chain the reply/ICMP hook
 *	always has the numerically lower priority of the pair.
 *	The IPv6 entries exist only with CONFIG_IP_VS_IPV6.
 */
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC - 2,
	},
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_remote_request4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC - 1,
	},
	/* Before ip_vs_in, change source only for VS/NAT */
	{
		.hook		= ip_vs_local_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST + 1,
	},
	/* After mangle, schedule and forward local requests */
	{
		.hook		= ip_vs_local_request4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST + 2,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply4,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
#ifdef CONFIG_IP_VS_IPV6
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 2,
	},
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_remote_request6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 1,
	},
	/* Before ip_vs_in, change source only for VS/NAT */
	{
		.hook		= ip_vs_local_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 1,
	},
	/* After mangle, schedule and forward local requests */
	{
		.hook		= ip_vs_local_request6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 2,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for ::/0 (the original comment says 0.0.0.0/0), which
	 * is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp_v6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_reply6,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
#endif
};
1941/*
1942 *	Initialize IP Virtual Server netns mem.
1943 */
1944static int __net_init __ip_vs_init(struct net *net)
1945{
1946	struct netns_ipvs *ipvs;
1947
1948	ipvs = net_generic(net, ip_vs_net_id);
1949	if (ipvs == NULL)
1950		return -ENOMEM;
1951
1952	/* Hold the beast until a service is registerd */
1953	ipvs->enable = 0;
1954	ipvs->net = net;
1955	/* Counters used for creating unique names */
1956	ipvs->gen = atomic_read(&ipvs_netns_cnt);
1957	atomic_inc(&ipvs_netns_cnt);
1958	net->ipvs = ipvs;
1959
1960	if (ip_vs_estimator_net_init(net) < 0)
1961		goto estimator_fail;
1962
1963	if (ip_vs_control_net_init(net) < 0)
1964		goto control_fail;
1965
1966	if (ip_vs_protocol_net_init(net) < 0)
1967		goto protocol_fail;
1968
1969	if (ip_vs_app_net_init(net) < 0)
1970		goto app_fail;
1971
1972	if (ip_vs_conn_net_init(net) < 0)
1973		goto conn_fail;
1974
1975	if (ip_vs_sync_net_init(net) < 0)
1976		goto sync_fail;
1977
1978	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
1979			 sizeof(struct netns_ipvs), ipvs->gen);
1980	return 0;
1981/*
1982 * Error handling
1983 */
1984
1985sync_fail:
1986	ip_vs_conn_net_cleanup(net);
1987conn_fail:
1988	ip_vs_app_net_cleanup(net);
1989app_fail:
1990	ip_vs_protocol_net_cleanup(net);
1991protocol_fail:
1992	ip_vs_control_net_cleanup(net);
1993control_fail:
1994	ip_vs_estimator_net_cleanup(net);
1995estimator_fail:
1996	net->ipvs = NULL;
1997	return -ENOMEM;
1998}
1999
/*
 *	Release the per-netns IPVS state of @net.
 *	Services are cleaned up first, then the connection, app, protocol,
 *	control and estimator parts, and finally net->ipvs is reset.
 */
static void __net_exit __ip_vs_cleanup(struct net *net)
{
	ip_vs_service_net_cleanup(net);	/* ip_vs_flush() with locks */
	ip_vs_conn_net_cleanup(net);
	ip_vs_app_net_cleanup(net);
	ip_vs_protocol_net_cleanup(net);
	ip_vs_control_net_cleanup(net);
	ip_vs_estimator_net_cleanup(net);
	/* net->ipvs is still read here, so it is cleared only afterwards */
	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
	net->ipvs = NULL;
}
2011
2012static void __net_exit __ip_vs_dev_cleanup(struct net *net)
2013{
2014	EnterFunction(2);
2015	net_ipvs(net)->enable = 0;	/* Disable packet reception */
2016	smp_wmb();
2017	ip_vs_sync_net_cleanup(net);
2018	LeaveFunction(2);
2019}
2020
/*
 *	Per-netns subsystem operations, registered in ip_vs_init().
 *	.id/.size describe the per-netns struct netns_ipvs that
 *	__ip_vs_init() fetches with net_generic(net, ip_vs_net_id).
 */
static struct pernet_operations ipvs_core_ops = {
	.init = __ip_vs_init,
	.exit = __ip_vs_cleanup,
	.id   = &ip_vs_net_id,
	.size = sizeof(struct netns_ipvs),
};
2027
/*
 *	Per-netns device operations, registered in ip_vs_init().
 *	Only an exit handler is provided: it disables packet reception
 *	and cleans up the sync part (see __ip_vs_dev_cleanup()).
 */
static struct pernet_operations ipvs_core_dev_ops = {
	.exit = __ip_vs_dev_cleanup,
};
2031
2032/*
2033 *	Initialize IP Virtual Server
2034 */
2035static int __init ip_vs_init(void)
2036{
2037	int ret;
2038
2039	ret = ip_vs_control_init();
2040	if (ret < 0) {
2041		pr_err("can't setup control.\n");
2042		goto exit;
2043	}
2044
2045	ip_vs_protocol_init();
2046
2047	ret = ip_vs_conn_init();
2048	if (ret < 0) {
2049		pr_err("can't setup connection table.\n");
2050		goto cleanup_protocol;
2051	}
2052
2053	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
2054	if (ret < 0)
2055		goto cleanup_conn;
2056
2057	ret = register_pernet_device(&ipvs_core_dev_ops);
2058	if (ret < 0)
2059		goto cleanup_sub;
2060
2061	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2062	if (ret < 0) {
2063		pr_err("can't register hooks.\n");
2064		goto cleanup_dev;
2065	}
2066
2067	ret = ip_vs_register_nl_ioctl();
2068	if (ret < 0) {
2069		pr_err("can't register netlink/ioctl.\n");
2070		goto cleanup_hooks;
2071	}
2072
2073	pr_info("ipvs loaded.\n");
2074
2075	return ret;
2076
2077cleanup_hooks:
2078	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2079cleanup_dev:
2080	unregister_pernet_device(&ipvs_core_dev_ops);
2081cleanup_sub:
2082	unregister_pernet_subsys(&ipvs_core_ops);
2083cleanup_conn:
2084	ip_vs_conn_cleanup();
2085cleanup_protocol:
2086	ip_vs_protocol_cleanup();
2087	ip_vs_control_cleanup();
2088exit:
2089	return ret;
2090}
2091
/*
 *	Module exit: undo ip_vs_init() in reverse order of its setup steps.
 */
static void __exit ip_vs_cleanup(void)
{
	ip_vs_unregister_nl_ioctl();
	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
	unregister_pernet_device(&ipvs_core_dev_ops);
	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
	ip_vs_conn_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
	pr_info("ipvs unloaded.\n");
}
2103
/* Module entry and exit points, and the module license */
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
2107