1/*
2 * Copyright (c) 2013 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/capability.h>
22#include <linux/module.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/in6.h>
36#include <linux/inetdevice.h>
37#include <linux/igmp.h>
38#include <linux/netfilter_ipv4.h>
39#include <linux/etherdevice.h>
40#include <linux/if_ether.h>
41#include <linux/if_vlan.h>
42#include <linux/rculist.h>
43#include <linux/err.h>
44
45#include <net/sock.h>
46#include <net/ip.h>
47#include <net/icmp.h>
48#include <net/protocol.h>
49#include <net/ip_tunnels.h>
50#include <net/arp.h>
51#include <net/checksum.h>
52#include <net/dsfield.h>
53#include <net/inet_ecn.h>
54#include <net/xfrm.h>
55#include <net/net_namespace.h>
56#include <net/netns/generic.h>
57#include <net/rtnetlink.h>
58#include <net/udp.h>
59#include <net/gue.h>
60
61#if IS_ENABLED(CONFIG_IPV6)
62#include <net/ipv6.h>
63#include <net/ip6_fib.h>
64#include <net/ip6_route.h>
65#endif
66
67static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
68{
69	return hash_32((__force u32)key ^ (__force u32)remote,
70			 IP_TNL_HASH_BITS);
71}
72
73static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
74			     struct dst_entry *dst, __be32 saddr)
75{
76	struct dst_entry *old_dst;
77
78	dst_clone(dst);
79	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
80	dst_release(old_dst);
81	idst->saddr = saddr;
82}
83
84static noinline void tunnel_dst_set(struct ip_tunnel *t,
85			   struct dst_entry *dst, __be32 saddr)
86{
87	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
88}
89
/* Drop the cached output route for the current CPU only. */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}
94
95void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
96{
97	int i;
98
99	for_each_possible_cpu(i)
100		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
101}
102EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
103
/* Fetch this CPU's cached route for tunnel @t, returning it with a
 * reference held, or NULL if there is no usable cached entry.  On
 * success *saddr is set to the source address the route was cached
 * with.  A stale entry (obsolete and failing its ->check()) is purged
 * from the cache before returning NULL.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	/* The entry may be dying concurrently; only use it if we can
	 * take a reference (refcount not already zero).
	 */
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			/* Stale route: drop it from the cache and
			 * release the reference we just took.
			 */
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
127
128static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
129				__be16 flags, __be32 key)
130{
131	if (p->i_flags & TUNNEL_KEY) {
132		if (flags & TUNNEL_KEY)
133			return key == p->i_key;
134		else
135			/* key expected, none present */
136			return false;
137	} else
138		return !(flags & TUNNEL_KEY);
139}
140
141/* Fallback tunnel: no source, no destination, no key, no options
142
143   Tunnel hash table:
144   We require exact key match i.e. if a key is present in packet
145   it will match only tunnel with the same key; if it is not present,
146   it will match only keyless tunnel.
147
   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
151*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact match on both local and remote address.  An
	 * exact ifindex match wins outright; otherwise remember the
	 * tunnel as a candidate.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel has a wildcard local address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the wildcard-remote bucket. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnel bound to our local address (with wildcard
	 * remote), or a multicast tunnel whose group address matches
	 * the packet's destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only match, both addresses wildcarded. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* No configured tunnel matched: fall back to the per-netns
	 * fallback device if it is up.
	 */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
241
242static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
243				    struct ip_tunnel_parm *parms)
244{
245	unsigned int h;
246	__be32 remote;
247	__be32 i_key = parms->i_key;
248
249	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
250		remote = parms->iph.daddr;
251	else
252		remote = 0;
253
254	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
255		i_key = 0;
256
257	h = ip_tunnel_hash(i_key, remote);
258	return &itn->tunnels[h];
259}
260
261static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
262{
263	struct hlist_head *head = ip_bucket(itn, &t->parms);
264
265	hlist_add_head_rcu(&t->hash_node, head);
266}
267
/* Remove tunnel @t from its hash bucket (RCU-safe for readers). */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
272
/* Find a configured tunnel exactly matching @parms (addresses, link,
 * device type and key), or NULL if none exists.  Unlike
 * ip_tunnel_lookup() this does no wildcard/fallback matching; it is
 * used by the configuration paths (ioctl/netlink) to detect duplicates.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	/* t is NULL here if the loop ran to completion without a match. */
	return t;
}
295
296static struct net_device *__ip_tunnel_create(struct net *net,
297					     const struct rtnl_link_ops *ops,
298					     struct ip_tunnel_parm *parms)
299{
300	int err;
301	struct ip_tunnel *tunnel;
302	struct net_device *dev;
303	char name[IFNAMSIZ];
304
305	if (parms->name[0])
306		strlcpy(name, parms->name, IFNAMSIZ);
307	else {
308		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
309			err = -E2BIG;
310			goto failed;
311		}
312		strlcpy(name, ops->kind, IFNAMSIZ);
313		strncat(name, "%d", 2);
314	}
315
316	ASSERT_RTNL();
317	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
318	if (!dev) {
319		err = -ENOMEM;
320		goto failed;
321	}
322	dev_net_set(dev, net);
323
324	dev->rtnl_link_ops = ops;
325
326	tunnel = netdev_priv(dev);
327	tunnel->parms = *parms;
328	tunnel->net = net;
329
330	err = register_netdevice(dev);
331	if (err)
332		goto failed_free;
333
334	return dev;
335
336failed_free:
337	free_netdev(dev);
338failed:
339	return ERR_PTR(err);
340}
341
342static inline void init_tunnel_flow(struct flowi4 *fl4,
343				    int proto,
344				    __be32 daddr, __be32 saddr,
345				    __be32 key, __u8 tos, int oif)
346{
347	memset(fl4, 0, sizeof(*fl4));
348	fl4->flowi4_oif = oif;
349	fl4->daddr = daddr;
350	fl4->saddr = saddr;
351	fl4->flowi4_tos = tos;
352	fl4->flowi4_proto = proto;
353	fl4->fl4_gre_key = key;
354}
355
/* Bind the tunnel device to an underlying output device and derive a
 * reasonable MTU and needed_headroom from it.  For a tunnel with a
 * fixed destination the route to that destination decides the lower
 * device; otherwise the configured link ifindex is used.  Returns the
 * computed MTU (clamped to the IPv4 minimum of 68).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Prime the dst cache while we have the route. */
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found (or no daddr): fall back to the configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Reserve room for the tunnel header on top of the lower device. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
403
404static struct ip_tunnel *ip_tunnel_create(struct net *net,
405					  struct ip_tunnel_net *itn,
406					  struct ip_tunnel_parm *parms)
407{
408	struct ip_tunnel *nt;
409	struct net_device *dev;
410
411	BUG_ON(!itn->fb_tunnel_dev);
412	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
413	if (IS_ERR(dev))
414		return ERR_CAST(dev);
415
416	dev->mtu = ip_tunnel_bind_dev(dev);
417
418	nt = netdev_priv(dev);
419	ip_tunnel_add(itn, nt);
420	return nt;
421}
422
423int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
424		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
425{
426	struct pcpu_sw_netstats *tstats;
427	const struct iphdr *iph = ip_hdr(skb);
428	int err;
429
430#ifdef CONFIG_NET_IPGRE_BROADCAST
431	if (ipv4_is_multicast(iph->daddr)) {
432		tunnel->dev->stats.multicast++;
433		skb->pkt_type = PACKET_BROADCAST;
434	}
435#endif
436
437	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
438	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
439		tunnel->dev->stats.rx_crc_errors++;
440		tunnel->dev->stats.rx_errors++;
441		goto drop;
442	}
443
444	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
445		if (!(tpi->flags&TUNNEL_SEQ) ||
446		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
447			tunnel->dev->stats.rx_fifo_errors++;
448			tunnel->dev->stats.rx_errors++;
449			goto drop;
450		}
451		tunnel->i_seqno = ntohl(tpi->seq) + 1;
452	}
453
454	skb_reset_network_header(skb);
455
456	err = IP_ECN_decapsulate(iph, skb);
457	if (unlikely(err)) {
458		if (log_ecn_error)
459			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
460					&iph->saddr, iph->tos);
461		if (err > 1) {
462			++tunnel->dev->stats.rx_frame_errors;
463			++tunnel->dev->stats.rx_errors;
464			goto drop;
465		}
466	}
467
468	tstats = this_cpu_ptr(tunnel->dev->tstats);
469	u64_stats_update_begin(&tstats->syncp);
470	tstats->rx_packets++;
471	tstats->rx_bytes += skb->len;
472	u64_stats_update_end(&tstats->syncp);
473
474	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
475
476	if (tunnel->dev->type == ARPHRD_ETHER) {
477		skb->protocol = eth_type_trans(skb, tunnel->dev);
478		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
479	} else {
480		skb->dev = tunnel->dev;
481	}
482
483	gro_cells_receive(&tunnel->gro_cells, skb);
484	return 0;
485
486drop:
487	kfree_skb(skb);
488	return 0;
489}
490EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
491
492static int ip_encap_hlen(struct ip_tunnel_encap *e)
493{
494	switch (e->type) {
495	case TUNNEL_ENCAP_NONE:
496		return 0;
497	case TUNNEL_ENCAP_FOU:
498		return sizeof(struct udphdr);
499	case TUNNEL_ENCAP_GUE:
500		return sizeof(struct udphdr) + sizeof(struct guehdr);
501	default:
502		return -EINVAL;
503	}
504}
505
/* Configure secondary (UDP) encapsulation on tunnel @t from @ipencap.
 * Recomputes the tunnel's total header length.  Returns 0 on success
 * or a negative errno for an unsupported encapsulation type.
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
528
/* Prepend a FOU or GUE UDP encapsulation header of @hdr_len bytes to
 * @skb.  Sets up GSO offload handling, picks a source port (flow-hash
 * based unless one is configured), fills in the UDP header (and the GUE
 * header when applicable), and rewrites *protocol to IPPROTO_UDP so the
 * outer IP header carries UDP.  Returns 0 or a negative errno.
 */
static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
			    size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
{
	struct udphdr *uh;
	__be16 sport;
	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;

	skb = iptunnel_handle_offloads(skb, csum, type);

	if (IS_ERR(skb))
		return PTR_ERR(skb);

	/* Get length and hash before making space in skb */

	sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
					       skb, 0, 0, false);

	skb_push(skb, hdr_len);

	skb_reset_transport_header(skb);
	uh = udp_hdr(skb);

	if (e->type == TUNNEL_ENCAP_GUE) {
		/* The GUE header sits immediately after the UDP header. */
		struct guehdr *guehdr = (struct guehdr *)&uh[1];

		guehdr->version = 0;
		guehdr->hlen = 0;
		guehdr->flags = 0;
		guehdr->next_hdr = *protocol;
	}

	uh->dest = e->dport;
	uh->source = sport;
	uh->len = htons(skb->len);
	uh->check = 0;
	udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
		     fl4->saddr, fl4->daddr, skb->len);

	/* Outer IP header now carries UDP. */
	*protocol = IPPROTO_UDP;

	return 0;
}
572
573int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
574		    u8 *protocol, struct flowi4 *fl4)
575{
576	switch (t->encap.type) {
577	case TUNNEL_ENCAP_NONE:
578		return 0;
579	case TUNNEL_ENCAP_FOU:
580	case TUNNEL_ENCAP_GUE:
581		return fou_build_header(skb, &t->encap, t->encap_hlen,
582					protocol, fl4);
583	default:
584		return -EINVAL;
585	}
586}
587EXPORT_SYMBOL(ip_tunnel_encap);
588
/* Propagate path-MTU information for a packet about to be tunnelled
 * over route @rt.  Updates the inner dst's PMTU and, when the packet
 * would not fit and fragmentation is not allowed, sends the
 * appropriate ICMP(v6) "too big" error back to the sender.  Returns 0
 * if transmission may proceed, -E2BIG if the packet must be dropped.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* DF set: inner MTU is the outer route's MTU minus all
		 * encapsulation overhead.
		 */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on the IPv6 route, but only
		 * for point-to-point tunnels or host routes, and never
		 * below the IPv6 minimum MTU.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
636
/* Transmit path: encapsulate @skb in an outer IPv4 header described by
 * @tnl_params and send it out via @dev's tunnel.  Handles NBMA (no
 * fixed destination) tunnels, TOS/TTL inheritance, optional secondary
 * (UDP) encapsulation, per-cpu route caching, PMTU handling and
 * headroom expansion.  Consumes @skb in all cases.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels have a fixed peer, so routes can be cached. */
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the outer destination from the inner packet's
		 * next hop (IPv4) or an IPv4-compatible IPv6 neighbour.
		 */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				/* Low 32 bits of a v4-compatible address. */
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Per-packet destination: cannot use the cached route. */
		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit TOS from the inner packet". */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	/* A route back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Propagate link failures seen via ICMP errors for a while. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL 0 means "inherit from the inner packet". */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
801
/* Apply new parameters @p to existing tunnel @t.  The tunnel must be
 * removed from and re-added to the hash table because addresses and
 * keys determine its bucket.  When the underlying link changes, the
 * device is re-bound and (if @set_mtu) its MTU recomputed.  The dst
 * cache is always flushed since cached routes may no longer apply.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the
		 * device's hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
834
/* Legacy ioctl interface for tunnel management (SIOCGETTUNNEL,
 * SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL).  @p is the
 * caller-supplied parameter block, already copied from user space;
 * for GET it is also the output buffer.  Returns 0 or a negative
 * errno.  Add/change/delete require CAP_NET_ADMIN in the tunnel's
 * user namespace.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, GET looks up the tunnel that
		 * matches @p; on a real tunnel device it reports that
		 * device's own parameters.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A finite TTL implies path-MTU discovery, so force DF. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			/* Ignore key values when keying is not requested. */
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			/* Adding an already-existing tunnel fails. */
			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parms must not collide with a
				 * different existing tunnel.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Cannot change the broadcast/p2p nature
				 * of an existing device.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* On the fallback device, delete the tunnel
			 * matching @p — but never the fallback itself.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937
938int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
939{
940	struct ip_tunnel *tunnel = netdev_priv(dev);
941	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942
943	if (new_mtu < 68 ||
944	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
945		return -EINVAL;
946	dev->mtu = new_mtu;
947	return 0;
948}
949EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
950
/* Device destructor: release everything ip_tunnel_init() allocated, in
 * reverse order, and finally free the netdev (which backs the private
 * area, so it must go last).
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
960
961void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
962{
963	struct ip_tunnel *tunnel = netdev_priv(dev);
964	struct ip_tunnel_net *itn;
965
966	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
967
968	if (itn->fb_tunnel_dev != dev) {
969		ip_tunnel_del(netdev_priv(dev));
970		unregister_netdevice_queue(dev, head);
971	}
972}
973EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
974
/* Per-netns initialization: set up the hash table and, when @ops is
 * given, create the netns's fallback tunnel device named @devname.
 * The fallback device is pinned to its netns (NETIF_F_NETNS_LOCAL).
 * Returns 0 or a negative errno.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No ops means the caller wants the registry without a
	 * fallback device.
	 */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1009
/* Queue for unregistration every tunnel device belonging to registry
 * @itn: first all devices of the right link type in the fallback's
 * netns, then any hashed tunnels living in other netns (devices can
 * outlive a netns move of their link partner).  Caller holds RTNL and
 * later flushes @head via unregister_netdevice_many().
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1034
/* Tear down all tunnels of registry @itn in one batch under RTNL. */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1045
/* rtnl newlink handler: register @dev as a new tunnel with parameters
 * @p, rejecting duplicates.  Assigns a random MAC for Ethernet-type
 * tunnels without an explicit address, derives the MTU from the lower
 * device unless IFLA_MTU was given, and inserts the tunnel into the
 * hash table.  Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1080
/* rtnl changelink handler: apply new parameters @p to tunnel device
 * @dev.  Fails for the fallback device, when @p collides with a
 * different existing tunnel, or when the change would flip the
 * device's point-to-point/broadcast nature.  The MTU is recomputed
 * only if IFLA_MTU was not explicitly supplied.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Cannot change the broadcast/p2p nature of an
			 * existing device.
			 */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1118
1119int ip_tunnel_init(struct net_device *dev)
1120{
1121	struct ip_tunnel *tunnel = netdev_priv(dev);
1122	struct iphdr *iph = &tunnel->parms.iph;
1123	int err;
1124
1125	dev->destructor	= ip_tunnel_dev_free;
1126	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1127	if (!dev->tstats)
1128		return -ENOMEM;
1129
1130	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1131	if (!tunnel->dst_cache) {
1132		free_percpu(dev->tstats);
1133		return -ENOMEM;
1134	}
1135
1136	err = gro_cells_init(&tunnel->gro_cells, dev);
1137	if (err) {
1138		free_percpu(tunnel->dst_cache);
1139		free_percpu(dev->tstats);
1140		return err;
1141	}
1142
1143	tunnel->dev = dev;
1144	tunnel->net = dev_net(dev);
1145	strcpy(tunnel->parms.name, dev->name);
1146	iph->version		= 4;
1147	iph->ihl		= 5;
1148
1149	return 0;
1150}
1151EXPORT_SYMBOL_GPL(ip_tunnel_init);
1152
1153void ip_tunnel_uninit(struct net_device *dev)
1154{
1155	struct ip_tunnel *tunnel = netdev_priv(dev);
1156	struct net *net = tunnel->net;
1157	struct ip_tunnel_net *itn;
1158
1159	itn = net_generic(net, tunnel->ip_tnl_net_id);
1160	/* fb_tunnel_dev will be unregisted in net-exit call. */
1161	if (itn->fb_tunnel_dev != dev)
1162		ip_tunnel_del(netdev_priv(dev));
1163
1164	ip_tunnel_dst_reset_all(tunnel);
1165}
1166EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1167
1168/* Do least required initialization, rest of init is done in tunnel_init call */
1169void ip_tunnel_setup(struct net_device *dev, int net_id)
1170{
1171	struct ip_tunnel *tunnel = netdev_priv(dev);
1172	tunnel->ip_tnl_net_id = net_id;
1173}
1174EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1175
1176MODULE_LICENSE("GPL");
1177