ip_tunnel.c revision 8b7ed2d91d6afb0b55ba75f94b66e51f70783a46
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
				   __be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

/* Often-modified stats are per-CPU; others are shared (netdev->stats). */
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;

	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	tot->collisions  = dev->stats.collisions;

	return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input
   packet (a usage sketch follows the function below).
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(itn, key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(itn, key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
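
/* Usage sketch (illustrative, not part of this file): a protocol's
 * receive handler, e.g. a GRE demultiplexer, would typically resolve
 * the tunnel from the parsed packet and hand it to ip_tunnel_rcv():
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *			     iph->saddr, iph->daddr, tpi->key);
 *	if (t)
 *		return ip_tunnel_rcv(t, skb, tpi, log_ecn_error);
 *
 * where itn, tpi and log_ecn_error are assumed to come from the
 * caller's context.
 */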

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	h = ip_tunnel_hash(itn, parms->i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline struct rtable *ip_route_output_tunnel(struct net *net,
						    struct flowi4 *fl4,
						    int proto,
						    __be32 daddr, __be32 saddr,
						    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
	return ip_route_output_key(net, fl4);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess the output device to choose a reasonable MTU and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_tunnel(tunnel->net, &fl4,
					    tunnel->parms.iph.protocol,
					    iph->daddr, iph->saddr,
					    tunnel->parms.o_key,
					    RT_TOS(iph->tos),
					    tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
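
/* Worked example (assumed values, not taken from this file): for a
 * plain GRE tunnel (no key/csum/seq) over Ethernet, tunnel->hlen is 4,
 * so t_hlen = 4 + 20 = 24, and a 1500-byte underlay MTU yields the
 * familiar 1500 - 24 = 1476 tunnel MTU.
 */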

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt, *fbt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	fbt = netdev_priv(itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return NULL;

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_tstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags & TUNNEL_CSUM) &&  (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
	     ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
		if (!(tpi->flags & TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
		skb_scrub_packet(skb);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
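
/* Worked example (assumed values): with DF set, a 1500-byte route MTU,
 * no link-layer header on the tunnel device and a 4-byte tunnel header,
 * mtu = 1500 - 0 - 20 - 4 = 1476; a larger non-GSO IPv4 packet carrying
 * IP_DF then triggers ICMP_FRAG_NEEDED and the function returns -E2BIG.
 */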

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}

	rt = ip_route_output_tunnel(tunnel->net, &fl4,
				    protocol,
				    dst, tnl_params->saddr,
				    tunnel->parms.o_key,
				    RT_TOS(tos),
				    tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (!net_eq(tunnel->net, dev_net(dev)))
		skb_scrub_packet(skb);

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off & htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom) {
		dev->needed_headroom = max_headroom;
		if (skb_cow_head(skb, dev->needed_headroom)) {
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return;
		}
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
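
/* Usage sketch (modelled on in-tree callers such as ipip; checks and
 * details elided): a driver's ndo_start_xmit is typically a thin
 * wrapper around this helper:
 *
 *	static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 *					    struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
 *		return NETDEV_TX_OK;
 *	}
 */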

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags & TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
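
/* Usage sketch (hypothetical userspace caller): drivers expose this
 * through ndo_do_ioctl, so a configuration tool reaches it roughly as:
 *
 *	struct ip_tunnel_parm p = { ... };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(sockfd, SIOCADDTUNNEL, &ifr);
 *
 * where sockfd is assumed to be an AF_INET datagram socket.
 */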

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per
	 * netns. Allowing it to be moved to another netns is clearly
	 * unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev))
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
	rtnl_unlock();

	return PTR_RET(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
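
/* Usage sketch (modelled on in-tree protocol modules): a tunnel driver
 * typically calls this from its pernet init hook, e.g.:
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 */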

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
	if (itn->fb_tunnel_dev)
		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest of the init is
 * done in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
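
/* Usage sketch (illustrative): a driver's netdev setup callback stores
 * its pernet id here before the device is registered, e.g.:
 *
 *	static void ipip_tunnel_setup(struct net_device *dev)
 *	{
 *		...  (ndo, type and flag setup elided)
 *		ip_tunnel_setup(dev, ipip_net_id);
 *	}
 */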

MODULE_LICENSE("GPL");