ip_tunnel.c revision 22fb22eaebf4d16987f3fd9c3484c436ee0badf2
1/*
2 * Copyright (c) 2013 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/capability.h>
22#include <linux/module.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/in6.h>
36#include <linux/inetdevice.h>
37#include <linux/igmp.h>
38#include <linux/netfilter_ipv4.h>
39#include <linux/etherdevice.h>
40#include <linux/if_ether.h>
41#include <linux/if_vlan.h>
42#include <linux/rculist.h>
43#include <linux/err.h>
44
45#include <net/sock.h>
46#include <net/ip.h>
47#include <net/icmp.h>
48#include <net/protocol.h>
49#include <net/ip_tunnels.h>
50#include <net/arp.h>
51#include <net/checksum.h>
52#include <net/dsfield.h>
53#include <net/inet_ecn.h>
54#include <net/xfrm.h>
55#include <net/net_namespace.h>
56#include <net/netns/generic.h>
57#include <net/rtnetlink.h>
58
59#if IS_ENABLED(CONFIG_IPV6)
60#include <net/ipv6.h>
61#include <net/ip6_fib.h>
62#include <net/ip6_route.h>
63#endif
64
65static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66{
67	return hash_32((__force u32)key ^ (__force u32)remote,
68			 IP_TNL_HASH_BITS);
69}
70
/* Replace the dst cached in @idst with @dst, releasing the old entry.
 *
 * DST_NOCACHE entries must not be kept in the cache, so they are stored
 * as NULL instead; otherwise a reference is taken for the cache slot.
 * xchg() swaps the pointer atomically so concurrent setters never leak
 * or double-release an entry.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);	/* reference held by the cache slot */
	}
	/* Atomically install the new entry, then drop the previous one
	 * (dst_release() tolerates NULL).
	 */
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}
85
86static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
87{
88	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
89}
90
91static void tunnel_dst_reset(struct ip_tunnel *t)
92{
93	tunnel_dst_set(t, NULL);
94}
95
96void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
97{
98	int i;
99
100	for_each_possible_cpu(i)
101		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
102}
103EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104
/* Fetch this CPU's cached route, validating it first.
 *
 * Returns the cached route with a reference held, or NULL when nothing
 * usable is cached.  An obsolete entry that fails its ->check() callback
 * is purged from this CPU's slot so the caller falls back to a fresh
 * route lookup.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			/* Stale entry: drop it and report a cache miss. */
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
122
123static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124				__be16 flags, __be32 key)
125{
126	if (p->i_flags & TUNNEL_KEY) {
127		if (flags & TUNNEL_KEY)
128			return key == p->i_key;
129		else
130			/* key expected, none present */
131			return false;
132	} else
133		return !(flags & TUNNEL_KEY);
134}
135
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
/* Find the tunnel that should receive a packet with the given outer
 * (remote, local) addresses, key and ingress link.
 *
 * Four passes, from most to least specific; within each pass an exact
 * link match wins outright, otherwise the tunnel is remembered as a
 * candidate ("wildcard link" match).  Must run under RCU.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact source + destination + key match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: destination + key match (wildcard source). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 look in the wildcard-destination bucket. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: local address + key match; @local may also match a
	 * multicast tunnel destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only match (wildcard addresses). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* Last resort: the netns fallback device, if present and up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232
233static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234				    struct ip_tunnel_parm *parms)
235{
236	unsigned int h;
237	__be32 remote;
238	__be32 i_key = parms->i_key;
239
240	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
241		remote = parms->iph.daddr;
242	else
243		remote = 0;
244
245	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
246		i_key = 0;
247
248	h = ip_tunnel_hash(i_key, remote);
249	return &itn->tunnels[h];
250}
251
252static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
253{
254	struct hlist_head *head = ip_bucket(itn, &t->parms);
255
256	hlist_add_head_rcu(&t->hash_node, head);
257}
258
259static void ip_tunnel_del(struct ip_tunnel *t)
260{
261	hlist_del_init_rcu(&t->hash_node);
262}
263
264static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
265					struct ip_tunnel_parm *parms,
266					int type)
267{
268	__be32 remote = parms->iph.daddr;
269	__be32 local = parms->iph.saddr;
270	__be32 key = parms->i_key;
271	int link = parms->link;
272	struct ip_tunnel *t = NULL;
273	struct hlist_head *head = ip_bucket(itn, parms);
274
275	hlist_for_each_entry_rcu(t, head, hash_node) {
276		if (local == t->parms.iph.saddr &&
277		    remote == t->parms.iph.daddr &&
278		    key == t->parms.i_key &&
279		    link == t->parms.link &&
280		    type == t->dev->type)
281			break;
282	}
283	return t;
284}
285
/* Allocate and register a tunnel net_device for @parms.
 *
 * The device name comes from parms->name when set, otherwise from
 * ops->kind with a "%d" suffix for dev_alloc_name() expansion; the
 * IFNAMSIZ - 3 check leaves room for that suffix.  Returns the new
 * device or an ERR_PTR().  Caller must hold the RTNL lock.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	/* Seed the private area before register_netdevice() runs the
	 * device's init callbacks.
	 */
	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
331
332static inline void init_tunnel_flow(struct flowi4 *fl4,
333				    int proto,
334				    __be32 daddr, __be32 saddr,
335				    __be32 key, __u8 tos, int oif)
336{
337	memset(fl4, 0, sizeof(*fl4));
338	fl4->flowi4_oif = oif;
339	fl4->daddr = daddr;
340	fl4->saddr = saddr;
341	fl4->flowi4_tos = tos;
342	fl4->flowi4_proto = proto;
343	fl4->fl4_gre_key = key;
344}
345
/* Bind the tunnel to an underlying device and compute a suitable MTU.
 *
 * For a fixed destination, a route lookup identifies the egress device
 * and its dst is cached; otherwise parms.link is used as a fallback.
 * needed_headroom is sized for the outer headers plus the underlying
 * device's requirements.  Returns the resulting tunnel MTU (>= 68).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	/* Outer encapsulation overhead: tunnel header + outer IP header. */
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);	/* prime dst cache */
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route yet: fall back to the explicitly configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	/* Never report less than the IPv4 minimum MTU. */
	if (mtu < 68)
		mtu = 68;

	return mtu;
}
393
394static struct ip_tunnel *ip_tunnel_create(struct net *net,
395					  struct ip_tunnel_net *itn,
396					  struct ip_tunnel_parm *parms)
397{
398	struct ip_tunnel *nt, *fbt;
399	struct net_device *dev;
400
401	BUG_ON(!itn->fb_tunnel_dev);
402	fbt = netdev_priv(itn->fb_tunnel_dev);
403	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404	if (IS_ERR(dev))
405		return ERR_CAST(dev);
406
407	dev->mtu = ip_tunnel_bind_dev(dev);
408
409	nt = netdev_priv(dev);
410	ip_tunnel_add(itn, nt);
411	return nt;
412}
413
/**
 * ip_tunnel_rcv - common receive path for decapsulated tunnel packets
 * @tunnel: tunnel the packet was matched to
 * @skb: packet with the outer IP header still accessible via ip_hdr()
 * @tpi: parsed tunnel header info (flags, key, sequence number)
 * @log_ecn_error: ratelimit-log non-ECT outer packets on ECN decap error
 *
 * Validates checksum/sequence expectations against the tunnel config,
 * performs ECN decapsulation, updates rx stats and hands the packet to
 * GRO.  Always returns 0; packets failing validation are dropped and
 * accounted in the device error counters.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's checksum flag must agree with the tunnel config:
	 * either both expect TUNNEL_CSUM or neither does.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Enforce in-order delivery when sequence numbers are enabled;
	 * the signed difference tolerates sequence-number wraparound.
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* Propagate ECN from the outer header; err > 1 means the frame
	 * must be dropped, err == 1 is merely log-worthy.
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Full scrub only when the packet crosses a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
482
/* Path-MTU handling for a packet about to be tunnelled over @rt.
 *
 * Computes the effective tunnel MTU (route MTU minus encapsulation
 * overhead when DF is set, inner-dst/device MTU otherwise), propagates
 * it to the inner dst, and emits the appropriate "too big" ICMP(v6)
 * error when the packet does not fit.  Returns 0 to continue
 * transmission or -E2BIG when the packet must not be sent.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* Size of the inner packet, excluding our encapsulation. */
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* DF set and packet too big: tell the sender (GSO packets
		 * are segmented later and handled there).
		 */
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Lower the inner IPv6 route's MTU, but only for host
		 * routes or tunnels with a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
530
/**
 * ip_tunnel_xmit - common transmit path for IPv4-encapsulating tunnels
 * @skb: packet to encapsulate (inner network header already set)
 * @dev: tunnel device
 * @tnl_params: outer IP header template (tunnel->parms.iph, usually)
 * @protocol: outer IP protocol number to use
 *
 * Resolves the outer destination (including the NBMA case, where it is
 * derived from the inner packet's routing), computes TOS/TTL/DF
 * inheritance, finds (or reuses a cached) route, enforces PMTU, and
 * hands the packet to iptunnel_xmit().  Errors are accounted on the
 * device stats and the skb is consumed in all cases.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels (fixed daddr) may use the per-cpu dst cache. */
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the outer destination from the inner packet's
		 * next hop (IPv4) or neighbour entry (IPv6-compatible
		 * addresses only).
		 */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				/* v4-compatible: low 32 bits hold the IPv4 address. */
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Per-packet destination: the dst cache must not be used. */
		connected = false;
	}

	/* Low bit of the configured TOS means "inherit from inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	/* Routing back onto ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Back-off after ICMP errors from the path: signal link failure
	 * to the sender while err_count drains within the timeout.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	/* TTL 0 means "inherit from inner packet" (or route default). */
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* Propagate the inner DF bit to the outer header for IPv4. */
	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
691
/* Apply new parameters @p to an existing tunnel @t.
 *
 * The tunnel is unhashed and re-hashed because the address/key change
 * may move it to a different bucket.  If the underlying link changed,
 * the tunnel is re-bound and (optionally) gets the recomputed MTU.
 * Cached routes are invalidated and a netdev change event is emitted.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as their
		 * device and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
724
725int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
726{
727	int err = 0;
728	struct ip_tunnel *t = netdev_priv(dev);
729	struct net *net = t->net;
730	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
731
732	BUG_ON(!itn->fb_tunnel_dev);
733	switch (cmd) {
734	case SIOCGETTUNNEL:
735		if (dev == itn->fb_tunnel_dev) {
736			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
737			if (t == NULL)
738				t = netdev_priv(dev);
739		}
740		memcpy(p, &t->parms, sizeof(*p));
741		break;
742
743	case SIOCADDTUNNEL:
744	case SIOCCHGTUNNEL:
745		err = -EPERM;
746		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
747			goto done;
748		if (p->iph.ttl)
749			p->iph.frag_off |= htons(IP_DF);
750		if (!(p->i_flags&TUNNEL_KEY))
751			p->i_key = 0;
752		if (!(p->o_flags&TUNNEL_KEY))
753			p->o_key = 0;
754
755		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
756
757		if (!t && (cmd == SIOCADDTUNNEL)) {
758			t = ip_tunnel_create(net, itn, p);
759			if (IS_ERR(t)) {
760				err = PTR_ERR(t);
761				break;
762			}
763		}
764		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
765			if (t != NULL) {
766				if (t->dev != dev) {
767					err = -EEXIST;
768					break;
769				}
770			} else {
771				unsigned int nflags = 0;
772
773				if (ipv4_is_multicast(p->iph.daddr))
774					nflags = IFF_BROADCAST;
775				else if (p->iph.daddr)
776					nflags = IFF_POINTOPOINT;
777
778				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
779					err = -EINVAL;
780					break;
781				}
782
783				t = netdev_priv(dev);
784			}
785		}
786
787		if (t) {
788			err = 0;
789			ip_tunnel_update(itn, t, dev, p, true);
790		} else {
791			err = -ENOENT;
792		}
793		break;
794
795	case SIOCDELTUNNEL:
796		err = -EPERM;
797		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
798			goto done;
799
800		if (dev == itn->fb_tunnel_dev) {
801			err = -ENOENT;
802			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
803			if (t == NULL)
804				goto done;
805			err = -EPERM;
806			if (t == netdev_priv(itn->fb_tunnel_dev))
807				goto done;
808			dev = t->dev;
809		}
810		unregister_netdevice(dev);
811		err = 0;
812		break;
813
814	default:
815		err = -EINVAL;
816	}
817
818done:
819	return err;
820}
821EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
822
823int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
824{
825	struct ip_tunnel *tunnel = netdev_priv(dev);
826	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
827
828	if (new_mtu < 68 ||
829	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
830		return -EINVAL;
831	dev->mtu = new_mtu;
832	return 0;
833}
834EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
835
836static void ip_tunnel_dev_free(struct net_device *dev)
837{
838	struct ip_tunnel *tunnel = netdev_priv(dev);
839
840	gro_cells_destroy(&tunnel->gro_cells);
841	free_percpu(tunnel->dst_cache);
842	free_percpu(dev->tstats);
843	free_netdev(dev);
844}
845
846void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
847{
848	struct ip_tunnel *tunnel = netdev_priv(dev);
849	struct ip_tunnel_net *itn;
850
851	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
852
853	if (itn->fb_tunnel_dev != dev) {
854		ip_tunnel_del(netdev_priv(dev));
855		unregister_netdevice_queue(dev, head);
856	}
857}
858EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
859
/* Per-netns init for a tunnel type: initialise the hash table and,
 * when @ops is given, create the netns fallback device named @devname.
 * Returns 0 on success or the error from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No link ops means this tunnel type has no fallback device. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	/* Fallback parameters: all-zero except for the device name. */
	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
893
/* Queue every device belonging to this tunnel net for unregistration:
 * first all devices of @ops that live in the fallback device's netns,
 * then any hashed tunnels whose device was moved to another netns
 * (which the first loop cannot see).  Caller holds the RTNL lock and
 * later flushes @head via unregister_netdevice_many().
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
918
919void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
920{
921	LIST_HEAD(list);
922
923	rtnl_lock();
924	ip_tunnel_destroy(itn, &list, ops);
925	unregister_netdevice_many(&list);
926	rtnl_unlock();
927}
928EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
929
/* rtnl newlink handler: register a new tunnel device with parameters @p.
 *
 * Fails with -EEXIST when an identical tunnel is already configured.
 * A random MAC is assigned for Ethernet-type tunnels unless one was
 * supplied, and the bound MTU is applied unless IFLA_MTU was given.
 * Returns 0 or a negative errno from register_netdevice().
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Only hash the tunnel once it is fully set up. */
	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
964
/* rtnl changelink handler: re-configure an existing tunnel with @p.
 *
 * The fallback device cannot be changed.  If @p matches another
 * configured tunnel, -EEXIST is returned; otherwise the new addressing
 * must preserve the device's broadcast/point-to-point nature.  The MTU
 * is recomputed unless IFLA_MTU was supplied.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* @p matches a different device: refuse the duplicate. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The broadcast/point-to-point nature of the
			 * device must not change.
			 */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1002
/* ndo_init callback shared by ip tunnel drivers: allocate per-cpu
 * stats and dst cache, set up GRO cells, and initialise the outer IP
 * header template.  Returns 0 or -ENOMEM / the gro_cells_init() error,
 * unwinding all prior allocations on failure.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
1036
1037void ip_tunnel_uninit(struct net_device *dev)
1038{
1039	struct ip_tunnel *tunnel = netdev_priv(dev);
1040	struct net *net = tunnel->net;
1041	struct ip_tunnel_net *itn;
1042
1043	itn = net_generic(net, tunnel->ip_tnl_net_id);
1044	/* fb_tunnel_dev will be unregisted in net-exit call. */
1045	if (itn->fb_tunnel_dev != dev)
1046		ip_tunnel_del(netdev_priv(dev));
1047
1048	ip_tunnel_dst_reset_all(tunnel);
1049}
1050EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1051
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	/* Record the pernet id so later callbacks can locate the
	 * per-netns ip_tunnel_net via net_generic().
	 */
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1059
/* Shared ip tunnel infrastructure is GPL-licensed. */
MODULE_LICENSE("GPL");
1061