fib_frontend.c revision 35ebf65e851c6d9731abc6362b189858eb59f4d3
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		IPv4 Forwarding Information Base: FIB frontend.
7 *
8 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 *		This program is free software; you can redistribute it and/or
11 *		modify it under the terms of the GNU General Public License
12 *		as published by the Free Software Foundation; either version
13 *		2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <linux/bitops.h>
19#include <linux/capability.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/mm.h>
23#include <linux/string.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/errno.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/inetdevice.h>
30#include <linux/netdevice.h>
31#include <linux/if_addr.h>
32#include <linux/if_arp.h>
33#include <linux/skbuff.h>
34#include <linux/init.h>
35#include <linux/list.h>
36#include <linux/slab.h>
37
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/arp.h>
44#include <net/ip_fib.h>
45#include <net/rtnetlink.h>
46#include <net/xfrm.h>
47
48#ifndef CONFIG_IP_MULTIPLE_TABLES
49
50static int __net_init fib4_rules_init(struct net *net)
51{
52	struct fib_table *local_table, *main_table;
53
54	local_table = fib_trie_table(RT_TABLE_LOCAL);
55	if (local_table == NULL)
56		return -ENOMEM;
57
58	main_table  = fib_trie_table(RT_TABLE_MAIN);
59	if (main_table == NULL)
60		goto fail;
61
62	hlist_add_head_rcu(&local_table->tb_hlist,
63				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
64	hlist_add_head_rcu(&main_table->tb_hlist,
65				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
66	return 0;
67
68fail:
69	kfree(local_table);
70	return -ENOMEM;
71}
72#else
73
74struct fib_table *fib_new_table(struct net *net, u32 id)
75{
76	struct fib_table *tb;
77	unsigned int h;
78
79	if (id == 0)
80		id = RT_TABLE_MAIN;
81	tb = fib_get_table(net, id);
82	if (tb)
83		return tb;
84
85	tb = fib_trie_table(id);
86	if (!tb)
87		return NULL;
88	h = id & (FIB_TABLE_HASHSZ - 1);
89	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
90	return tb;
91}
92
93struct fib_table *fib_get_table(struct net *net, u32 id)
94{
95	struct fib_table *tb;
96	struct hlist_node *node;
97	struct hlist_head *head;
98	unsigned int h;
99
100	if (id == 0)
101		id = RT_TABLE_MAIN;
102	h = id & (FIB_TABLE_HASHSZ - 1);
103
104	rcu_read_lock();
105	head = &net->ipv4.fib_table_hash[h];
106	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
107		if (tb->tb_id == id) {
108			rcu_read_unlock();
109			return tb;
110		}
111	}
112	rcu_read_unlock();
113	return NULL;
114}
115#endif /* CONFIG_IP_MULTIPLE_TABLES */
116
117static void fib_flush(struct net *net)
118{
119	int flushed = 0;
120	struct fib_table *tb;
121	struct hlist_node *node;
122	struct hlist_head *head;
123	unsigned int h;
124
125	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
126		head = &net->ipv4.fib_table_hash[h];
127		hlist_for_each_entry(tb, node, head, tb_hlist)
128			flushed += fib_table_flush(tb);
129	}
130
131	if (flushed)
132		rt_cache_flush(net, -1);
133}
134
135/*
136 * Find address type as if only "dev" was present in the system. If
137 * on_dev is NULL then all interfaces are taken into consideration.
138 */
139static inline unsigned int __inet_dev_addr_type(struct net *net,
140						const struct net_device *dev,
141						__be32 addr)
142{
143	struct flowi4		fl4 = { .daddr = addr };
144	struct fib_result	res;
145	unsigned int ret = RTN_BROADCAST;
146	struct fib_table *local_table;
147
148	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
149		return RTN_BROADCAST;
150	if (ipv4_is_multicast(addr))
151		return RTN_MULTICAST;
152
153#ifdef CONFIG_IP_MULTIPLE_TABLES
154	res.r = NULL;
155#endif
156
157	local_table = fib_get_table(net, RT_TABLE_LOCAL);
158	if (local_table) {
159		ret = RTN_UNICAST;
160		rcu_read_lock();
161		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
162			if (!dev || dev == res.fi->fib_dev)
163				ret = res.type;
164		}
165		rcu_read_unlock();
166	}
167	return ret;
168}
169
170unsigned int inet_addr_type(struct net *net, __be32 addr)
171{
172	return __inet_dev_addr_type(net, NULL, addr);
173}
174EXPORT_SYMBOL(inet_addr_type);
175
176unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
177				__be32 addr)
178{
179	return __inet_dev_addr_type(net, dev, addr);
180}
181EXPORT_SYMBOL(inet_dev_addr_type);
182
183__be32 fib_compute_spec_dst(struct sk_buff *skb)
184{
185	struct net_device *dev = skb->dev;
186	struct in_device *in_dev;
187	struct fib_result res;
188	struct flowi4 fl4;
189	struct net *net;
190
191	if (skb->pkt_type != PACKET_BROADCAST &&
192	    skb->pkt_type != PACKET_MULTICAST)
193		return ip_hdr(skb)->daddr;
194
195	in_dev = __in_dev_get_rcu(dev);
196	BUG_ON(!in_dev);
197	fl4.flowi4_oif = 0;
198	fl4.flowi4_iif = 0;
199	fl4.daddr = ip_hdr(skb)->saddr;
200	fl4.saddr = ip_hdr(skb)->daddr;
201	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
202	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
203	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
204
205	net = dev_net(dev);
206	if (!fib_lookup(net, &fl4, &res))
207		return FIB_RES_PREFSRC(net, res);
208	else
209		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
210}
211
212/* Given (packet source, input interface) and optional (dst, oif, tos):
213 * - (main) check, that source is valid i.e. not broadcast or our local
214 *   address.
215 * - figure out what "logical" interface this packet arrived
216 *   and calculate "specific destination" address.
217 * - check, that packet arrived from expected physical interface.
218 * called with rcu_read_lock()
219 */
220int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
221			int oif, struct net_device *dev, __be32 *spec_dst,
222			u32 *itag)
223{
224	struct in_device *in_dev;
225	struct flowi4 fl4;
226	struct fib_result res;
227	int no_addr, rpf, accept_local;
228	bool dev_match;
229	int ret;
230	struct net *net;
231
232	fl4.flowi4_oif = 0;
233	fl4.flowi4_iif = oif;
234	fl4.daddr = src;
235	fl4.saddr = dst;
236	fl4.flowi4_tos = tos;
237	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
238
239	no_addr = rpf = accept_local = 0;
240	in_dev = __in_dev_get_rcu(dev);
241	if (in_dev) {
242		no_addr = in_dev->ifa_list == NULL;
243
244		/* Ignore rp_filter for packets protected by IPsec. */
245		rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
246
247		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
248		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
249	}
250
251	if (in_dev == NULL)
252		goto e_inval;
253
254	net = dev_net(dev);
255	if (fib_lookup(net, &fl4, &res))
256		goto last_resort;
257	if (res.type != RTN_UNICAST) {
258		if (res.type != RTN_LOCAL || !accept_local)
259			goto e_inval;
260	}
261	*spec_dst = FIB_RES_PREFSRC(net, res);
262	fib_combine_itag(itag, &res);
263	dev_match = false;
264
265#ifdef CONFIG_IP_ROUTE_MULTIPATH
266	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
267		struct fib_nh *nh = &res.fi->fib_nh[ret];
268
269		if (nh->nh_dev == dev) {
270			dev_match = true;
271			break;
272		}
273	}
274#else
275	if (FIB_RES_DEV(res) == dev)
276		dev_match = true;
277#endif
278	if (dev_match) {
279		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
280		return ret;
281	}
282	if (no_addr)
283		goto last_resort;
284	if (rpf == 1)
285		goto e_rpf;
286	fl4.flowi4_oif = dev->ifindex;
287
288	ret = 0;
289	if (fib_lookup(net, &fl4, &res) == 0) {
290		if (res.type == RTN_UNICAST) {
291			*spec_dst = FIB_RES_PREFSRC(net, res);
292			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
293		}
294	}
295	return ret;
296
297last_resort:
298	if (rpf)
299		goto e_rpf;
300	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
301	*itag = 0;
302	return 0;
303
304e_inval:
305	return -EINVAL;
306e_rpf:
307	return -EXDEV;
308}
309
310static inline __be32 sk_extract_addr(struct sockaddr *addr)
311{
312	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
313}
314
315static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
316{
317	struct nlattr *nla;
318
319	nla = (struct nlattr *) ((char *) mx + len);
320	nla->nla_type = type;
321	nla->nla_len = nla_attr_size(4);
322	*(u32 *) nla_data(nla) = value;
323
324	return len + nla_total_size(4);
325}
326
327static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
328				 struct fib_config *cfg)
329{
330	__be32 addr;
331	int plen;
332
333	memset(cfg, 0, sizeof(*cfg));
334	cfg->fc_nlinfo.nl_net = net;
335
336	if (rt->rt_dst.sa_family != AF_INET)
337		return -EAFNOSUPPORT;
338
339	/*
340	 * Check mask for validity:
341	 * a) it must be contiguous.
342	 * b) destination must have all host bits clear.
343	 * c) if application forgot to set correct family (AF_INET),
344	 *    reject request unless it is absolutely clear i.e.
345	 *    both family and mask are zero.
346	 */
347	plen = 32;
348	addr = sk_extract_addr(&rt->rt_dst);
349	if (!(rt->rt_flags & RTF_HOST)) {
350		__be32 mask = sk_extract_addr(&rt->rt_genmask);
351
352		if (rt->rt_genmask.sa_family != AF_INET) {
353			if (mask || rt->rt_genmask.sa_family)
354				return -EAFNOSUPPORT;
355		}
356
357		if (bad_mask(mask, addr))
358			return -EINVAL;
359
360		plen = inet_mask_len(mask);
361	}
362
363	cfg->fc_dst_len = plen;
364	cfg->fc_dst = addr;
365
366	if (cmd != SIOCDELRT) {
367		cfg->fc_nlflags = NLM_F_CREATE;
368		cfg->fc_protocol = RTPROT_BOOT;
369	}
370
371	if (rt->rt_metric)
372		cfg->fc_priority = rt->rt_metric - 1;
373
374	if (rt->rt_flags & RTF_REJECT) {
375		cfg->fc_scope = RT_SCOPE_HOST;
376		cfg->fc_type = RTN_UNREACHABLE;
377		return 0;
378	}
379
380	cfg->fc_scope = RT_SCOPE_NOWHERE;
381	cfg->fc_type = RTN_UNICAST;
382
383	if (rt->rt_dev) {
384		char *colon;
385		struct net_device *dev;
386		char devname[IFNAMSIZ];
387
388		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
389			return -EFAULT;
390
391		devname[IFNAMSIZ-1] = 0;
392		colon = strchr(devname, ':');
393		if (colon)
394			*colon = 0;
395		dev = __dev_get_by_name(net, devname);
396		if (!dev)
397			return -ENODEV;
398		cfg->fc_oif = dev->ifindex;
399		if (colon) {
400			struct in_ifaddr *ifa;
401			struct in_device *in_dev = __in_dev_get_rtnl(dev);
402			if (!in_dev)
403				return -ENODEV;
404			*colon = ':';
405			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
406				if (strcmp(ifa->ifa_label, devname) == 0)
407					break;
408			if (ifa == NULL)
409				return -ENODEV;
410			cfg->fc_prefsrc = ifa->ifa_local;
411		}
412	}
413
414	addr = sk_extract_addr(&rt->rt_gateway);
415	if (rt->rt_gateway.sa_family == AF_INET && addr) {
416		cfg->fc_gw = addr;
417		if (rt->rt_flags & RTF_GATEWAY &&
418		    inet_addr_type(net, addr) == RTN_UNICAST)
419			cfg->fc_scope = RT_SCOPE_UNIVERSE;
420	}
421
422	if (cmd == SIOCDELRT)
423		return 0;
424
425	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
426		return -EINVAL;
427
428	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
429		cfg->fc_scope = RT_SCOPE_LINK;
430
431	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
432		struct nlattr *mx;
433		int len = 0;
434
435		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
436		if (mx == NULL)
437			return -ENOMEM;
438
439		if (rt->rt_flags & RTF_MTU)
440			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
441
442		if (rt->rt_flags & RTF_WINDOW)
443			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
444
445		if (rt->rt_flags & RTF_IRTT)
446			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
447
448		cfg->fc_mx = mx;
449		cfg->fc_mx_len = len;
450	}
451
452	return 0;
453}
454
455/*
456 * Handle IP routing ioctl calls.
457 * These are used to manipulate the routing tables
458 */
459int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
460{
461	struct fib_config cfg;
462	struct rtentry rt;
463	int err;
464
465	switch (cmd) {
466	case SIOCADDRT:		/* Add a route */
467	case SIOCDELRT:		/* Delete a route */
468		if (!capable(CAP_NET_ADMIN))
469			return -EPERM;
470
471		if (copy_from_user(&rt, arg, sizeof(rt)))
472			return -EFAULT;
473
474		rtnl_lock();
475		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
476		if (err == 0) {
477			struct fib_table *tb;
478
479			if (cmd == SIOCDELRT) {
480				tb = fib_get_table(net, cfg.fc_table);
481				if (tb)
482					err = fib_table_delete(tb, &cfg);
483				else
484					err = -ESRCH;
485			} else {
486				tb = fib_new_table(net, cfg.fc_table);
487				if (tb)
488					err = fib_table_insert(tb, &cfg);
489				else
490					err = -ENOBUFS;
491			}
492
493			/* allocated by rtentry_to_fib_config() */
494			kfree(cfg.fc_mx);
495		}
496		rtnl_unlock();
497		return err;
498	}
499	return -EINVAL;
500}
501
502const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
503	[RTA_DST]		= { .type = NLA_U32 },
504	[RTA_SRC]		= { .type = NLA_U32 },
505	[RTA_IIF]		= { .type = NLA_U32 },
506	[RTA_OIF]		= { .type = NLA_U32 },
507	[RTA_GATEWAY]		= { .type = NLA_U32 },
508	[RTA_PRIORITY]		= { .type = NLA_U32 },
509	[RTA_PREFSRC]		= { .type = NLA_U32 },
510	[RTA_METRICS]		= { .type = NLA_NESTED },
511	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
512	[RTA_FLOW]		= { .type = NLA_U32 },
513};
514
515static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
516			     struct nlmsghdr *nlh, struct fib_config *cfg)
517{
518	struct nlattr *attr;
519	int err, remaining;
520	struct rtmsg *rtm;
521
522	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
523	if (err < 0)
524		goto errout;
525
526	memset(cfg, 0, sizeof(*cfg));
527
528	rtm = nlmsg_data(nlh);
529	cfg->fc_dst_len = rtm->rtm_dst_len;
530	cfg->fc_tos = rtm->rtm_tos;
531	cfg->fc_table = rtm->rtm_table;
532	cfg->fc_protocol = rtm->rtm_protocol;
533	cfg->fc_scope = rtm->rtm_scope;
534	cfg->fc_type = rtm->rtm_type;
535	cfg->fc_flags = rtm->rtm_flags;
536	cfg->fc_nlflags = nlh->nlmsg_flags;
537
538	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
539	cfg->fc_nlinfo.nlh = nlh;
540	cfg->fc_nlinfo.nl_net = net;
541
542	if (cfg->fc_type > RTN_MAX) {
543		err = -EINVAL;
544		goto errout;
545	}
546
547	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
548		switch (nla_type(attr)) {
549		case RTA_DST:
550			cfg->fc_dst = nla_get_be32(attr);
551			break;
552		case RTA_OIF:
553			cfg->fc_oif = nla_get_u32(attr);
554			break;
555		case RTA_GATEWAY:
556			cfg->fc_gw = nla_get_be32(attr);
557			break;
558		case RTA_PRIORITY:
559			cfg->fc_priority = nla_get_u32(attr);
560			break;
561		case RTA_PREFSRC:
562			cfg->fc_prefsrc = nla_get_be32(attr);
563			break;
564		case RTA_METRICS:
565			cfg->fc_mx = nla_data(attr);
566			cfg->fc_mx_len = nla_len(attr);
567			break;
568		case RTA_MULTIPATH:
569			cfg->fc_mp = nla_data(attr);
570			cfg->fc_mp_len = nla_len(attr);
571			break;
572		case RTA_FLOW:
573			cfg->fc_flow = nla_get_u32(attr);
574			break;
575		case RTA_TABLE:
576			cfg->fc_table = nla_get_u32(attr);
577			break;
578		}
579	}
580
581	return 0;
582errout:
583	return err;
584}
585
586static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
587{
588	struct net *net = sock_net(skb->sk);
589	struct fib_config cfg;
590	struct fib_table *tb;
591	int err;
592
593	err = rtm_to_fib_config(net, skb, nlh, &cfg);
594	if (err < 0)
595		goto errout;
596
597	tb = fib_get_table(net, cfg.fc_table);
598	if (tb == NULL) {
599		err = -ESRCH;
600		goto errout;
601	}
602
603	err = fib_table_delete(tb, &cfg);
604errout:
605	return err;
606}
607
608static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
609{
610	struct net *net = sock_net(skb->sk);
611	struct fib_config cfg;
612	struct fib_table *tb;
613	int err;
614
615	err = rtm_to_fib_config(net, skb, nlh, &cfg);
616	if (err < 0)
617		goto errout;
618
619	tb = fib_new_table(net, cfg.fc_table);
620	if (tb == NULL) {
621		err = -ENOBUFS;
622		goto errout;
623	}
624
625	err = fib_table_insert(tb, &cfg);
626errout:
627	return err;
628}
629
630static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
631{
632	struct net *net = sock_net(skb->sk);
633	unsigned int h, s_h;
634	unsigned int e = 0, s_e;
635	struct fib_table *tb;
636	struct hlist_node *node;
637	struct hlist_head *head;
638	int dumped = 0;
639
640	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
641	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
642		return ip_rt_dump(skb, cb);
643
644	s_h = cb->args[0];
645	s_e = cb->args[1];
646
647	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
648		e = 0;
649		head = &net->ipv4.fib_table_hash[h];
650		hlist_for_each_entry(tb, node, head, tb_hlist) {
651			if (e < s_e)
652				goto next;
653			if (dumped)
654				memset(&cb->args[2], 0, sizeof(cb->args) -
655						 2 * sizeof(cb->args[0]));
656			if (fib_table_dump(tb, skb, cb) < 0)
657				goto out;
658			dumped = 1;
659next:
660			e++;
661		}
662	}
663out:
664	cb->args[1] = e;
665	cb->args[0] = h;
666
667	return skb->len;
668}
669
670/* Prepare and feed intra-kernel routing request.
671 * Really, it should be netlink message, but :-( netlink
672 * can be not configured, so that we feed it directly
673 * to fib engine. It is legal, because all events occur
674 * only when netlink is already locked.
675 */
676static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
677{
678	struct net *net = dev_net(ifa->ifa_dev->dev);
679	struct fib_table *tb;
680	struct fib_config cfg = {
681		.fc_protocol = RTPROT_KERNEL,
682		.fc_type = type,
683		.fc_dst = dst,
684		.fc_dst_len = dst_len,
685		.fc_prefsrc = ifa->ifa_local,
686		.fc_oif = ifa->ifa_dev->dev->ifindex,
687		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
688		.fc_nlinfo = {
689			.nl_net = net,
690		},
691	};
692
693	if (type == RTN_UNICAST)
694		tb = fib_new_table(net, RT_TABLE_MAIN);
695	else
696		tb = fib_new_table(net, RT_TABLE_LOCAL);
697
698	if (tb == NULL)
699		return;
700
701	cfg.fc_table = tb->tb_id;
702
703	if (type != RTN_LOCAL)
704		cfg.fc_scope = RT_SCOPE_LINK;
705	else
706		cfg.fc_scope = RT_SCOPE_HOST;
707
708	if (cmd == RTM_NEWROUTE)
709		fib_table_insert(tb, &cfg);
710	else
711		fib_table_delete(tb, &cfg);
712}
713
714void fib_add_ifaddr(struct in_ifaddr *ifa)
715{
716	struct in_device *in_dev = ifa->ifa_dev;
717	struct net_device *dev = in_dev->dev;
718	struct in_ifaddr *prim = ifa;
719	__be32 mask = ifa->ifa_mask;
720	__be32 addr = ifa->ifa_local;
721	__be32 prefix = ifa->ifa_address & mask;
722
723	if (ifa->ifa_flags & IFA_F_SECONDARY) {
724		prim = inet_ifa_byprefix(in_dev, prefix, mask);
725		if (prim == NULL) {
726			pr_warn("%s: bug: prim == NULL\n", __func__);
727			return;
728		}
729	}
730
731	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
732
733	if (!(dev->flags & IFF_UP))
734		return;
735
736	/* Add broadcast address, if it is explicitly assigned. */
737	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
738		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
739
740	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
741	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
742		fib_magic(RTM_NEWROUTE,
743			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
744			  prefix, ifa->ifa_prefixlen, prim);
745
746		/* Add network specific broadcasts, when it takes a sense */
747		if (ifa->ifa_prefixlen < 31) {
748			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
749			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
750				  32, prim);
751		}
752	}
753}
754
755/* Delete primary or secondary address.
756 * Optionally, on secondary address promotion consider the addresses
757 * from subnet iprim as deleted, even if they are in device list.
758 * In this case the secondary ifa can be in device list.
759 */
760void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
761{
762	struct in_device *in_dev = ifa->ifa_dev;
763	struct net_device *dev = in_dev->dev;
764	struct in_ifaddr *ifa1;
765	struct in_ifaddr *prim = ifa, *prim1 = NULL;
766	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
767	__be32 any = ifa->ifa_address & ifa->ifa_mask;
768#define LOCAL_OK	1
769#define BRD_OK		2
770#define BRD0_OK		4
771#define BRD1_OK		8
772	unsigned int ok = 0;
773	int subnet = 0;		/* Primary network */
774	int gone = 1;		/* Address is missing */
775	int same_prefsrc = 0;	/* Another primary with same IP */
776
777	if (ifa->ifa_flags & IFA_F_SECONDARY) {
778		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
779		if (prim == NULL) {
780			pr_warn("%s: bug: prim == NULL\n", __func__);
781			return;
782		}
783		if (iprim && iprim != prim) {
784			pr_warn("%s: bug: iprim != prim\n", __func__);
785			return;
786		}
787	} else if (!ipv4_is_zeronet(any) &&
788		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
789		fib_magic(RTM_DELROUTE,
790			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
791			  any, ifa->ifa_prefixlen, prim);
792		subnet = 1;
793	}
794
795	/* Deletion is more complicated than add.
796	 * We should take care of not to delete too much :-)
797	 *
798	 * Scan address list to be sure that addresses are really gone.
799	 */
800
801	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
802		if (ifa1 == ifa) {
803			/* promotion, keep the IP */
804			gone = 0;
805			continue;
806		}
807		/* Ignore IFAs from our subnet */
808		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
809		    inet_ifa_match(ifa1->ifa_address, iprim))
810			continue;
811
812		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
813		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
814			/* Another address from our subnet? */
815			if (ifa1->ifa_mask == prim->ifa_mask &&
816			    inet_ifa_match(ifa1->ifa_address, prim))
817				prim1 = prim;
818			else {
819				/* We reached the secondaries, so
820				 * same_prefsrc should be determined.
821				 */
822				if (!same_prefsrc)
823					continue;
824				/* Search new prim1 if ifa1 is not
825				 * using the current prim1
826				 */
827				if (!prim1 ||
828				    ifa1->ifa_mask != prim1->ifa_mask ||
829				    !inet_ifa_match(ifa1->ifa_address, prim1))
830					prim1 = inet_ifa_byprefix(in_dev,
831							ifa1->ifa_address,
832							ifa1->ifa_mask);
833				if (!prim1)
834					continue;
835				if (prim1->ifa_local != prim->ifa_local)
836					continue;
837			}
838		} else {
839			if (prim->ifa_local != ifa1->ifa_local)
840				continue;
841			prim1 = ifa1;
842			if (prim != prim1)
843				same_prefsrc = 1;
844		}
845		if (ifa->ifa_local == ifa1->ifa_local)
846			ok |= LOCAL_OK;
847		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
848			ok |= BRD_OK;
849		if (brd == ifa1->ifa_broadcast)
850			ok |= BRD1_OK;
851		if (any == ifa1->ifa_broadcast)
852			ok |= BRD0_OK;
853		/* primary has network specific broadcasts */
854		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
855			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
856			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
857
858			if (!ipv4_is_zeronet(any1)) {
859				if (ifa->ifa_broadcast == brd1 ||
860				    ifa->ifa_broadcast == any1)
861					ok |= BRD_OK;
862				if (brd == brd1 || brd == any1)
863					ok |= BRD1_OK;
864				if (any == brd1 || any == any1)
865					ok |= BRD0_OK;
866			}
867		}
868	}
869
870	if (!(ok & BRD_OK))
871		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
872	if (subnet && ifa->ifa_prefixlen < 31) {
873		if (!(ok & BRD1_OK))
874			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
875		if (!(ok & BRD0_OK))
876			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
877	}
878	if (!(ok & LOCAL_OK)) {
879		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
880
881		/* Check, that this local address finally disappeared. */
882		if (gone &&
883		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
884			/* And the last, but not the least thing.
885			 * We must flush stray FIB entries.
886			 *
887			 * First of all, we scan fib_info list searching
888			 * for stray nexthop entries, then ignite fib_flush.
889			 */
890			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
891				fib_flush(dev_net(dev));
892		}
893	}
894#undef LOCAL_OK
895#undef BRD_OK
896#undef BRD0_OK
897#undef BRD1_OK
898}
899
900static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
901{
902
903	struct fib_result       res;
904	struct flowi4           fl4 = {
905		.flowi4_mark = frn->fl_mark,
906		.daddr = frn->fl_addr,
907		.flowi4_tos = frn->fl_tos,
908		.flowi4_scope = frn->fl_scope,
909	};
910
911#ifdef CONFIG_IP_MULTIPLE_TABLES
912	res.r = NULL;
913#endif
914
915	frn->err = -ENOENT;
916	if (tb) {
917		local_bh_disable();
918
919		frn->tb_id = tb->tb_id;
920		rcu_read_lock();
921		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
922
923		if (!frn->err) {
924			frn->prefixlen = res.prefixlen;
925			frn->nh_sel = res.nh_sel;
926			frn->type = res.type;
927			frn->scope = res.scope;
928		}
929		rcu_read_unlock();
930		local_bh_enable();
931	}
932}
933
934static void nl_fib_input(struct sk_buff *skb)
935{
936	struct net *net;
937	struct fib_result_nl *frn;
938	struct nlmsghdr *nlh;
939	struct fib_table *tb;
940	u32 pid;
941
942	net = sock_net(skb->sk);
943	nlh = nlmsg_hdr(skb);
944	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
945	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
946		return;
947
948	skb = skb_clone(skb, GFP_KERNEL);
949	if (skb == NULL)
950		return;
951	nlh = nlmsg_hdr(skb);
952
953	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
954	tb = fib_get_table(net, frn->tb_id_in);
955
956	nl_fib_lookup(frn, tb);
957
958	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
959	NETLINK_CB(skb).pid = 0;        /* from kernel */
960	NETLINK_CB(skb).dst_group = 0;  /* unicast */
961	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
962}
963
964static int __net_init nl_fib_lookup_init(struct net *net)
965{
966	struct sock *sk;
967	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
968				   nl_fib_input, NULL, THIS_MODULE);
969	if (sk == NULL)
970		return -EAFNOSUPPORT;
971	net->ipv4.fibnl = sk;
972	return 0;
973}
974
975static void nl_fib_lookup_exit(struct net *net)
976{
977	netlink_kernel_release(net->ipv4.fibnl);
978	net->ipv4.fibnl = NULL;
979}
980
/*
 * Take IPv4 routing down on "dev": sync the FIB for the device (flushing
 * all tables when fib_sync_down_dev() returns non-zero), flush the route
 * cache and finally notify ARP.
 *
 * "force" is handed to fib_sync_down_dev() and "delay" to
 * rt_cache_flush() unchanged.
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, force))
		fib_flush(net);

	rt_cache_flush(net, delay);
	arp_ifdown(dev);
}
988
989static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
990{
991	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
992	struct net_device *dev = ifa->ifa_dev->dev;
993	struct net *net = dev_net(dev);
994
995	switch (event) {
996	case NETDEV_UP:
997		fib_add_ifaddr(ifa);
998#ifdef CONFIG_IP_ROUTE_MULTIPATH
999		fib_sync_up(dev);
1000#endif
1001		atomic_inc(&net->ipv4.dev_addr_genid);
1002		rt_cache_flush(dev_net(dev), -1);
1003		break;
1004	case NETDEV_DOWN:
1005		fib_del_ifaddr(ifa, NULL);
1006		atomic_inc(&net->ipv4.dev_addr_genid);
1007		if (ifa->ifa_dev->ifa_list == NULL) {
1008			/* Last address was deleted from this interface.
1009			 * Disable IP.
1010			 */
1011			fib_disable_ip(dev, 1, 0);
1012		} else {
1013			rt_cache_flush(dev_net(dev), -1);
1014		}
1015		break;
1016	}
1017	return NOTIFY_DONE;
1018}
1019
1020static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1021{
1022	struct net_device *dev = ptr;
1023	struct in_device *in_dev = __in_dev_get_rtnl(dev);
1024	struct net *net = dev_net(dev);
1025
1026	if (event == NETDEV_UNREGISTER) {
1027		fib_disable_ip(dev, 2, -1);
1028		return NOTIFY_DONE;
1029	}
1030
1031	if (!in_dev)
1032		return NOTIFY_DONE;
1033
1034	switch (event) {
1035	case NETDEV_UP:
1036		for_ifa(in_dev) {
1037			fib_add_ifaddr(ifa);
1038		} endfor_ifa(in_dev);
1039#ifdef CONFIG_IP_ROUTE_MULTIPATH
1040		fib_sync_up(dev);
1041#endif
1042		atomic_inc(&net->ipv4.dev_addr_genid);
1043		rt_cache_flush(dev_net(dev), -1);
1044		break;
1045	case NETDEV_DOWN:
1046		fib_disable_ip(dev, 0, 0);
1047		break;
1048	case NETDEV_CHANGEMTU:
1049	case NETDEV_CHANGE:
1050		rt_cache_flush(dev_net(dev), 0);
1051		break;
1052	case NETDEV_UNREGISTER_BATCH:
1053		/* The batch unregister is only called on the first
1054		 * device in the list of devices being unregistered.
1055		 * Therefore we should not pass dev_net(dev) in here.
1056		 */
1057		rt_cache_flush_batch(NULL);
1058		break;
1059	}
1060	return NOTIFY_DONE;
1061}
1062
1063static struct notifier_block fib_inetaddr_notifier = {
1064	.notifier_call = fib_inetaddr_event,
1065};
1066
1067static struct notifier_block fib_netdev_notifier = {
1068	.notifier_call = fib_netdev_event,
1069};
1070
1071static int __net_init ip_fib_net_init(struct net *net)
1072{
1073	int err;
1074	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1075
1076	/* Avoid false sharing : Use at least a full cache line */
1077	size = max_t(size_t, size, L1_CACHE_BYTES);
1078
1079	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1080	if (net->ipv4.fib_table_hash == NULL)
1081		return -ENOMEM;
1082
1083	err = fib4_rules_init(net);
1084	if (err < 0)
1085		goto fail;
1086	return 0;
1087
1088fail:
1089	kfree(net->ipv4.fib_table_hash);
1090	return err;
1091}
1092
1093static void ip_fib_net_exit(struct net *net)
1094{
1095	unsigned int i;
1096
1097#ifdef CONFIG_IP_MULTIPLE_TABLES
1098	fib4_rules_exit(net);
1099#endif
1100
1101	rtnl_lock();
1102	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1103		struct fib_table *tb;
1104		struct hlist_head *head;
1105		struct hlist_node *node, *tmp;
1106
1107		head = &net->ipv4.fib_table_hash[i];
1108		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1109			hlist_del(node);
1110			fib_table_flush(tb);
1111			fib_free_table(tb);
1112		}
1113	}
1114	rtnl_unlock();
1115	kfree(net->ipv4.fib_table_hash);
1116}
1117
1118static int __net_init fib_net_init(struct net *net)
1119{
1120	int error;
1121
1122	error = ip_fib_net_init(net);
1123	if (error < 0)
1124		goto out;
1125	error = nl_fib_lookup_init(net);
1126	if (error < 0)
1127		goto out_nlfl;
1128	error = fib_proc_init(net);
1129	if (error < 0)
1130		goto out_proc;
1131out:
1132	return error;
1133
1134out_proc:
1135	nl_fib_lookup_exit(net);
1136out_nlfl:
1137	ip_fib_net_exit(net);
1138	goto out;
1139}
1140
1141static void __net_exit fib_net_exit(struct net *net)
1142{
1143	fib_proc_exit(net);
1144	nl_fib_lookup_exit(net);
1145	ip_fib_net_exit(net);
1146}
1147
1148static struct pernet_operations fib_net_ops = {
1149	.init = fib_net_init,
1150	.exit = fib_net_exit,
1151};
1152
1153void __init ip_fib_init(void)
1154{
1155	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
1156	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
1157	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
1158
1159	register_pernet_subsys(&fib_net_ops);
1160	register_netdevice_notifier(&fib_netdev_notifier);
1161	register_inetaddr_notifier(&fib_inetaddr_notifier);
1162
1163	fib_trie_init();
1164}
1165