fib_frontend.c revision 058bd4d2a4ff0aaa4a5381c67e776729d840c785
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		IPv4 Forwarding Information Base: FIB frontend.
7 *
8 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 *		This program is free software; you can redistribute it and/or
11 *		modify it under the terms of the GNU General Public License
12 *		as published by the Free Software Foundation; either version
13 *		2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <linux/bitops.h>
20#include <linux/capability.h>
21#include <linux/types.h>
22#include <linux/kernel.h>
23#include <linux/mm.h>
24#include <linux/string.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/errno.h>
28#include <linux/in.h>
29#include <linux/inet.h>
30#include <linux/inetdevice.h>
31#include <linux/netdevice.h>
32#include <linux/if_addr.h>
33#include <linux/if_arp.h>
34#include <linux/skbuff.h>
35#include <linux/init.h>
36#include <linux/list.h>
37#include <linux/slab.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/arp.h>
45#include <net/ip_fib.h>
46#include <net/rtnetlink.h>
47#include <net/xfrm.h>
48
49#ifndef CONFIG_IP_MULTIPLE_TABLES
50
51static int __net_init fib4_rules_init(struct net *net)
52{
53	struct fib_table *local_table, *main_table;
54
55	local_table = fib_trie_table(RT_TABLE_LOCAL);
56	if (local_table == NULL)
57		return -ENOMEM;
58
59	main_table  = fib_trie_table(RT_TABLE_MAIN);
60	if (main_table == NULL)
61		goto fail;
62
63	hlist_add_head_rcu(&local_table->tb_hlist,
64				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
65	hlist_add_head_rcu(&main_table->tb_hlist,
66				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
67	return 0;
68
69fail:
70	kfree(local_table);
71	return -ENOMEM;
72}
73#else
74
75struct fib_table *fib_new_table(struct net *net, u32 id)
76{
77	struct fib_table *tb;
78	unsigned int h;
79
80	if (id == 0)
81		id = RT_TABLE_MAIN;
82	tb = fib_get_table(net, id);
83	if (tb)
84		return tb;
85
86	tb = fib_trie_table(id);
87	if (!tb)
88		return NULL;
89	h = id & (FIB_TABLE_HASHSZ - 1);
90	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
91	return tb;
92}
93
94struct fib_table *fib_get_table(struct net *net, u32 id)
95{
96	struct fib_table *tb;
97	struct hlist_node *node;
98	struct hlist_head *head;
99	unsigned int h;
100
101	if (id == 0)
102		id = RT_TABLE_MAIN;
103	h = id & (FIB_TABLE_HASHSZ - 1);
104
105	rcu_read_lock();
106	head = &net->ipv4.fib_table_hash[h];
107	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
108		if (tb->tb_id == id) {
109			rcu_read_unlock();
110			return tb;
111		}
112	}
113	rcu_read_unlock();
114	return NULL;
115}
116#endif /* CONFIG_IP_MULTIPLE_TABLES */
117
118static void fib_flush(struct net *net)
119{
120	int flushed = 0;
121	struct fib_table *tb;
122	struct hlist_node *node;
123	struct hlist_head *head;
124	unsigned int h;
125
126	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
127		head = &net->ipv4.fib_table_hash[h];
128		hlist_for_each_entry(tb, node, head, tb_hlist)
129			flushed += fib_table_flush(tb);
130	}
131
132	if (flushed)
133		rt_cache_flush(net, -1);
134}
135
136/*
137 * Find address type as if only "dev" was present in the system. If
138 * on_dev is NULL then all interfaces are taken into consideration.
139 */
140static inline unsigned __inet_dev_addr_type(struct net *net,
141					    const struct net_device *dev,
142					    __be32 addr)
143{
144	struct flowi4		fl4 = { .daddr = addr };
145	struct fib_result	res;
146	unsigned ret = RTN_BROADCAST;
147	struct fib_table *local_table;
148
149	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
150		return RTN_BROADCAST;
151	if (ipv4_is_multicast(addr))
152		return RTN_MULTICAST;
153
154#ifdef CONFIG_IP_MULTIPLE_TABLES
155	res.r = NULL;
156#endif
157
158	local_table = fib_get_table(net, RT_TABLE_LOCAL);
159	if (local_table) {
160		ret = RTN_UNICAST;
161		rcu_read_lock();
162		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
163			if (!dev || dev == res.fi->fib_dev)
164				ret = res.type;
165		}
166		rcu_read_unlock();
167	}
168	return ret;
169}
170
171unsigned int inet_addr_type(struct net *net, __be32 addr)
172{
173	return __inet_dev_addr_type(net, NULL, addr);
174}
175EXPORT_SYMBOL(inet_addr_type);
176
177unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
178				__be32 addr)
179{
180	return __inet_dev_addr_type(net, dev, addr);
181}
182EXPORT_SYMBOL(inet_dev_addr_type);
183
184/* Given (packet source, input interface) and optional (dst, oif, tos):
185 * - (main) check, that source is valid i.e. not broadcast or our local
186 *   address.
187 * - figure out what "logical" interface this packet arrived
188 *   and calculate "specific destination" address.
189 * - check, that packet arrived from expected physical interface.
190 * called with rcu_read_lock()
191 */
192int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
193			int oif, struct net_device *dev, __be32 *spec_dst,
194			u32 *itag)
195{
196	struct in_device *in_dev;
197	struct flowi4 fl4;
198	struct fib_result res;
199	int no_addr, rpf, accept_local;
200	bool dev_match;
201	int ret;
202	struct net *net;
203
204	fl4.flowi4_oif = 0;
205	fl4.flowi4_iif = oif;
206	fl4.daddr = src;
207	fl4.saddr = dst;
208	fl4.flowi4_tos = tos;
209	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
210
211	no_addr = rpf = accept_local = 0;
212	in_dev = __in_dev_get_rcu(dev);
213	if (in_dev) {
214		no_addr = in_dev->ifa_list == NULL;
215
216		/* Ignore rp_filter for packets protected by IPsec. */
217		rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
218
219		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
220		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
221	}
222
223	if (in_dev == NULL)
224		goto e_inval;
225
226	net = dev_net(dev);
227	if (fib_lookup(net, &fl4, &res))
228		goto last_resort;
229	if (res.type != RTN_UNICAST) {
230		if (res.type != RTN_LOCAL || !accept_local)
231			goto e_inval;
232	}
233	*spec_dst = FIB_RES_PREFSRC(net, res);
234	fib_combine_itag(itag, &res);
235	dev_match = false;
236
237#ifdef CONFIG_IP_ROUTE_MULTIPATH
238	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
239		struct fib_nh *nh = &res.fi->fib_nh[ret];
240
241		if (nh->nh_dev == dev) {
242			dev_match = true;
243			break;
244		}
245	}
246#else
247	if (FIB_RES_DEV(res) == dev)
248		dev_match = true;
249#endif
250	if (dev_match) {
251		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
252		return ret;
253	}
254	if (no_addr)
255		goto last_resort;
256	if (rpf == 1)
257		goto e_rpf;
258	fl4.flowi4_oif = dev->ifindex;
259
260	ret = 0;
261	if (fib_lookup(net, &fl4, &res) == 0) {
262		if (res.type == RTN_UNICAST) {
263			*spec_dst = FIB_RES_PREFSRC(net, res);
264			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
265		}
266	}
267	return ret;
268
269last_resort:
270	if (rpf)
271		goto e_rpf;
272	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
273	*itag = 0;
274	return 0;
275
276e_inval:
277	return -EINVAL;
278e_rpf:
279	return -EXDEV;
280}
281
282static inline __be32 sk_extract_addr(struct sockaddr *addr)
283{
284	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
285}
286
287static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
288{
289	struct nlattr *nla;
290
291	nla = (struct nlattr *) ((char *) mx + len);
292	nla->nla_type = type;
293	nla->nla_len = nla_attr_size(4);
294	*(u32 *) nla_data(nla) = value;
295
296	return len + nla_total_size(4);
297}
298
299static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
300				 struct fib_config *cfg)
301{
302	__be32 addr;
303	int plen;
304
305	memset(cfg, 0, sizeof(*cfg));
306	cfg->fc_nlinfo.nl_net = net;
307
308	if (rt->rt_dst.sa_family != AF_INET)
309		return -EAFNOSUPPORT;
310
311	/*
312	 * Check mask for validity:
313	 * a) it must be contiguous.
314	 * b) destination must have all host bits clear.
315	 * c) if application forgot to set correct family (AF_INET),
316	 *    reject request unless it is absolutely clear i.e.
317	 *    both family and mask are zero.
318	 */
319	plen = 32;
320	addr = sk_extract_addr(&rt->rt_dst);
321	if (!(rt->rt_flags & RTF_HOST)) {
322		__be32 mask = sk_extract_addr(&rt->rt_genmask);
323
324		if (rt->rt_genmask.sa_family != AF_INET) {
325			if (mask || rt->rt_genmask.sa_family)
326				return -EAFNOSUPPORT;
327		}
328
329		if (bad_mask(mask, addr))
330			return -EINVAL;
331
332		plen = inet_mask_len(mask);
333	}
334
335	cfg->fc_dst_len = plen;
336	cfg->fc_dst = addr;
337
338	if (cmd != SIOCDELRT) {
339		cfg->fc_nlflags = NLM_F_CREATE;
340		cfg->fc_protocol = RTPROT_BOOT;
341	}
342
343	if (rt->rt_metric)
344		cfg->fc_priority = rt->rt_metric - 1;
345
346	if (rt->rt_flags & RTF_REJECT) {
347		cfg->fc_scope = RT_SCOPE_HOST;
348		cfg->fc_type = RTN_UNREACHABLE;
349		return 0;
350	}
351
352	cfg->fc_scope = RT_SCOPE_NOWHERE;
353	cfg->fc_type = RTN_UNICAST;
354
355	if (rt->rt_dev) {
356		char *colon;
357		struct net_device *dev;
358		char devname[IFNAMSIZ];
359
360		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
361			return -EFAULT;
362
363		devname[IFNAMSIZ-1] = 0;
364		colon = strchr(devname, ':');
365		if (colon)
366			*colon = 0;
367		dev = __dev_get_by_name(net, devname);
368		if (!dev)
369			return -ENODEV;
370		cfg->fc_oif = dev->ifindex;
371		if (colon) {
372			struct in_ifaddr *ifa;
373			struct in_device *in_dev = __in_dev_get_rtnl(dev);
374			if (!in_dev)
375				return -ENODEV;
376			*colon = ':';
377			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
378				if (strcmp(ifa->ifa_label, devname) == 0)
379					break;
380			if (ifa == NULL)
381				return -ENODEV;
382			cfg->fc_prefsrc = ifa->ifa_local;
383		}
384	}
385
386	addr = sk_extract_addr(&rt->rt_gateway);
387	if (rt->rt_gateway.sa_family == AF_INET && addr) {
388		cfg->fc_gw = addr;
389		if (rt->rt_flags & RTF_GATEWAY &&
390		    inet_addr_type(net, addr) == RTN_UNICAST)
391			cfg->fc_scope = RT_SCOPE_UNIVERSE;
392	}
393
394	if (cmd == SIOCDELRT)
395		return 0;
396
397	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
398		return -EINVAL;
399
400	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
401		cfg->fc_scope = RT_SCOPE_LINK;
402
403	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
404		struct nlattr *mx;
405		int len = 0;
406
407		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
408		if (mx == NULL)
409			return -ENOMEM;
410
411		if (rt->rt_flags & RTF_MTU)
412			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
413
414		if (rt->rt_flags & RTF_WINDOW)
415			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
416
417		if (rt->rt_flags & RTF_IRTT)
418			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
419
420		cfg->fc_mx = mx;
421		cfg->fc_mx_len = len;
422	}
423
424	return 0;
425}
426
427/*
428 * Handle IP routing ioctl calls.
429 * These are used to manipulate the routing tables
430 */
431int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
432{
433	struct fib_config cfg;
434	struct rtentry rt;
435	int err;
436
437	switch (cmd) {
438	case SIOCADDRT:		/* Add a route */
439	case SIOCDELRT:		/* Delete a route */
440		if (!capable(CAP_NET_ADMIN))
441			return -EPERM;
442
443		if (copy_from_user(&rt, arg, sizeof(rt)))
444			return -EFAULT;
445
446		rtnl_lock();
447		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
448		if (err == 0) {
449			struct fib_table *tb;
450
451			if (cmd == SIOCDELRT) {
452				tb = fib_get_table(net, cfg.fc_table);
453				if (tb)
454					err = fib_table_delete(tb, &cfg);
455				else
456					err = -ESRCH;
457			} else {
458				tb = fib_new_table(net, cfg.fc_table);
459				if (tb)
460					err = fib_table_insert(tb, &cfg);
461				else
462					err = -ENOBUFS;
463			}
464
465			/* allocated by rtentry_to_fib_config() */
466			kfree(cfg.fc_mx);
467		}
468		rtnl_unlock();
469		return err;
470	}
471	return -EINVAL;
472}
473
474const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
475	[RTA_DST]		= { .type = NLA_U32 },
476	[RTA_SRC]		= { .type = NLA_U32 },
477	[RTA_IIF]		= { .type = NLA_U32 },
478	[RTA_OIF]		= { .type = NLA_U32 },
479	[RTA_GATEWAY]		= { .type = NLA_U32 },
480	[RTA_PRIORITY]		= { .type = NLA_U32 },
481	[RTA_PREFSRC]		= { .type = NLA_U32 },
482	[RTA_METRICS]		= { .type = NLA_NESTED },
483	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
484	[RTA_FLOW]		= { .type = NLA_U32 },
485};
486
487static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
488			     struct nlmsghdr *nlh, struct fib_config *cfg)
489{
490	struct nlattr *attr;
491	int err, remaining;
492	struct rtmsg *rtm;
493
494	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
495	if (err < 0)
496		goto errout;
497
498	memset(cfg, 0, sizeof(*cfg));
499
500	rtm = nlmsg_data(nlh);
501	cfg->fc_dst_len = rtm->rtm_dst_len;
502	cfg->fc_tos = rtm->rtm_tos;
503	cfg->fc_table = rtm->rtm_table;
504	cfg->fc_protocol = rtm->rtm_protocol;
505	cfg->fc_scope = rtm->rtm_scope;
506	cfg->fc_type = rtm->rtm_type;
507	cfg->fc_flags = rtm->rtm_flags;
508	cfg->fc_nlflags = nlh->nlmsg_flags;
509
510	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
511	cfg->fc_nlinfo.nlh = nlh;
512	cfg->fc_nlinfo.nl_net = net;
513
514	if (cfg->fc_type > RTN_MAX) {
515		err = -EINVAL;
516		goto errout;
517	}
518
519	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
520		switch (nla_type(attr)) {
521		case RTA_DST:
522			cfg->fc_dst = nla_get_be32(attr);
523			break;
524		case RTA_OIF:
525			cfg->fc_oif = nla_get_u32(attr);
526			break;
527		case RTA_GATEWAY:
528			cfg->fc_gw = nla_get_be32(attr);
529			break;
530		case RTA_PRIORITY:
531			cfg->fc_priority = nla_get_u32(attr);
532			break;
533		case RTA_PREFSRC:
534			cfg->fc_prefsrc = nla_get_be32(attr);
535			break;
536		case RTA_METRICS:
537			cfg->fc_mx = nla_data(attr);
538			cfg->fc_mx_len = nla_len(attr);
539			break;
540		case RTA_MULTIPATH:
541			cfg->fc_mp = nla_data(attr);
542			cfg->fc_mp_len = nla_len(attr);
543			break;
544		case RTA_FLOW:
545			cfg->fc_flow = nla_get_u32(attr);
546			break;
547		case RTA_TABLE:
548			cfg->fc_table = nla_get_u32(attr);
549			break;
550		}
551	}
552
553	return 0;
554errout:
555	return err;
556}
557
558static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
559{
560	struct net *net = sock_net(skb->sk);
561	struct fib_config cfg;
562	struct fib_table *tb;
563	int err;
564
565	err = rtm_to_fib_config(net, skb, nlh, &cfg);
566	if (err < 0)
567		goto errout;
568
569	tb = fib_get_table(net, cfg.fc_table);
570	if (tb == NULL) {
571		err = -ESRCH;
572		goto errout;
573	}
574
575	err = fib_table_delete(tb, &cfg);
576errout:
577	return err;
578}
579
580static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
581{
582	struct net *net = sock_net(skb->sk);
583	struct fib_config cfg;
584	struct fib_table *tb;
585	int err;
586
587	err = rtm_to_fib_config(net, skb, nlh, &cfg);
588	if (err < 0)
589		goto errout;
590
591	tb = fib_new_table(net, cfg.fc_table);
592	if (tb == NULL) {
593		err = -ENOBUFS;
594		goto errout;
595	}
596
597	err = fib_table_insert(tb, &cfg);
598errout:
599	return err;
600}
601
602static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
603{
604	struct net *net = sock_net(skb->sk);
605	unsigned int h, s_h;
606	unsigned int e = 0, s_e;
607	struct fib_table *tb;
608	struct hlist_node *node;
609	struct hlist_head *head;
610	int dumped = 0;
611
612	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
613	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
614		return ip_rt_dump(skb, cb);
615
616	s_h = cb->args[0];
617	s_e = cb->args[1];
618
619	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
620		e = 0;
621		head = &net->ipv4.fib_table_hash[h];
622		hlist_for_each_entry(tb, node, head, tb_hlist) {
623			if (e < s_e)
624				goto next;
625			if (dumped)
626				memset(&cb->args[2], 0, sizeof(cb->args) -
627						 2 * sizeof(cb->args[0]));
628			if (fib_table_dump(tb, skb, cb) < 0)
629				goto out;
630			dumped = 1;
631next:
632			e++;
633		}
634	}
635out:
636	cb->args[1] = e;
637	cb->args[0] = h;
638
639	return skb->len;
640}
641
642/* Prepare and feed intra-kernel routing request.
643 * Really, it should be netlink message, but :-( netlink
644 * can be not configured, so that we feed it directly
645 * to fib engine. It is legal, because all events occur
646 * only when netlink is already locked.
647 */
648static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
649{
650	struct net *net = dev_net(ifa->ifa_dev->dev);
651	struct fib_table *tb;
652	struct fib_config cfg = {
653		.fc_protocol = RTPROT_KERNEL,
654		.fc_type = type,
655		.fc_dst = dst,
656		.fc_dst_len = dst_len,
657		.fc_prefsrc = ifa->ifa_local,
658		.fc_oif = ifa->ifa_dev->dev->ifindex,
659		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
660		.fc_nlinfo = {
661			.nl_net = net,
662		},
663	};
664
665	if (type == RTN_UNICAST)
666		tb = fib_new_table(net, RT_TABLE_MAIN);
667	else
668		tb = fib_new_table(net, RT_TABLE_LOCAL);
669
670	if (tb == NULL)
671		return;
672
673	cfg.fc_table = tb->tb_id;
674
675	if (type != RTN_LOCAL)
676		cfg.fc_scope = RT_SCOPE_LINK;
677	else
678		cfg.fc_scope = RT_SCOPE_HOST;
679
680	if (cmd == RTM_NEWROUTE)
681		fib_table_insert(tb, &cfg);
682	else
683		fib_table_delete(tb, &cfg);
684}
685
686void fib_add_ifaddr(struct in_ifaddr *ifa)
687{
688	struct in_device *in_dev = ifa->ifa_dev;
689	struct net_device *dev = in_dev->dev;
690	struct in_ifaddr *prim = ifa;
691	__be32 mask = ifa->ifa_mask;
692	__be32 addr = ifa->ifa_local;
693	__be32 prefix = ifa->ifa_address & mask;
694
695	if (ifa->ifa_flags & IFA_F_SECONDARY) {
696		prim = inet_ifa_byprefix(in_dev, prefix, mask);
697		if (prim == NULL) {
698			pr_warn("%s: bug: prim == NULL\n", __func__);
699			return;
700		}
701	}
702
703	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
704
705	if (!(dev->flags & IFF_UP))
706		return;
707
708	/* Add broadcast address, if it is explicitly assigned. */
709	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
710		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
711
712	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
713	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
714		fib_magic(RTM_NEWROUTE,
715			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
716			  prefix, ifa->ifa_prefixlen, prim);
717
718		/* Add network specific broadcasts, when it takes a sense */
719		if (ifa->ifa_prefixlen < 31) {
720			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
721			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
722				  32, prim);
723		}
724	}
725}
726
727/* Delete primary or secondary address.
728 * Optionally, on secondary address promotion consider the addresses
729 * from subnet iprim as deleted, even if they are in device list.
730 * In this case the secondary ifa can be in device list.
731 */
732void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
733{
734	struct in_device *in_dev = ifa->ifa_dev;
735	struct net_device *dev = in_dev->dev;
736	struct in_ifaddr *ifa1;
737	struct in_ifaddr *prim = ifa, *prim1 = NULL;
738	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
739	__be32 any = ifa->ifa_address & ifa->ifa_mask;
740#define LOCAL_OK	1
741#define BRD_OK		2
742#define BRD0_OK		4
743#define BRD1_OK		8
744	unsigned ok = 0;
745	int subnet = 0;		/* Primary network */
746	int gone = 1;		/* Address is missing */
747	int same_prefsrc = 0;	/* Another primary with same IP */
748
749	if (ifa->ifa_flags & IFA_F_SECONDARY) {
750		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
751		if (prim == NULL) {
752			pr_warn("%s: bug: prim == NULL\n", __func__);
753			return;
754		}
755		if (iprim && iprim != prim) {
756			pr_warn("%s: bug: iprim != prim\n", __func__);
757			return;
758		}
759	} else if (!ipv4_is_zeronet(any) &&
760		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
761		fib_magic(RTM_DELROUTE,
762			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
763			  any, ifa->ifa_prefixlen, prim);
764		subnet = 1;
765	}
766
767	/* Deletion is more complicated than add.
768	 * We should take care of not to delete too much :-)
769	 *
770	 * Scan address list to be sure that addresses are really gone.
771	 */
772
773	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
774		if (ifa1 == ifa) {
775			/* promotion, keep the IP */
776			gone = 0;
777			continue;
778		}
779		/* Ignore IFAs from our subnet */
780		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
781		    inet_ifa_match(ifa1->ifa_address, iprim))
782			continue;
783
784		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
785		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
786			/* Another address from our subnet? */
787			if (ifa1->ifa_mask == prim->ifa_mask &&
788			    inet_ifa_match(ifa1->ifa_address, prim))
789				prim1 = prim;
790			else {
791				/* We reached the secondaries, so
792				 * same_prefsrc should be determined.
793				 */
794				if (!same_prefsrc)
795					continue;
796				/* Search new prim1 if ifa1 is not
797				 * using the current prim1
798				 */
799				if (!prim1 ||
800				    ifa1->ifa_mask != prim1->ifa_mask ||
801				    !inet_ifa_match(ifa1->ifa_address, prim1))
802					prim1 = inet_ifa_byprefix(in_dev,
803							ifa1->ifa_address,
804							ifa1->ifa_mask);
805				if (!prim1)
806					continue;
807				if (prim1->ifa_local != prim->ifa_local)
808					continue;
809			}
810		} else {
811			if (prim->ifa_local != ifa1->ifa_local)
812				continue;
813			prim1 = ifa1;
814			if (prim != prim1)
815				same_prefsrc = 1;
816		}
817		if (ifa->ifa_local == ifa1->ifa_local)
818			ok |= LOCAL_OK;
819		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
820			ok |= BRD_OK;
821		if (brd == ifa1->ifa_broadcast)
822			ok |= BRD1_OK;
823		if (any == ifa1->ifa_broadcast)
824			ok |= BRD0_OK;
825		/* primary has network specific broadcasts */
826		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
827			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
828			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
829
830			if (!ipv4_is_zeronet(any1)) {
831				if (ifa->ifa_broadcast == brd1 ||
832				    ifa->ifa_broadcast == any1)
833					ok |= BRD_OK;
834				if (brd == brd1 || brd == any1)
835					ok |= BRD1_OK;
836				if (any == brd1 || any == any1)
837					ok |= BRD0_OK;
838			}
839		}
840	}
841
842	if (!(ok & BRD_OK))
843		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
844	if (subnet && ifa->ifa_prefixlen < 31) {
845		if (!(ok & BRD1_OK))
846			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
847		if (!(ok & BRD0_OK))
848			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
849	}
850	if (!(ok & LOCAL_OK)) {
851		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
852
853		/* Check, that this local address finally disappeared. */
854		if (gone &&
855		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
856			/* And the last, but not the least thing.
857			 * We must flush stray FIB entries.
858			 *
859			 * First of all, we scan fib_info list searching
860			 * for stray nexthop entries, then ignite fib_flush.
861			 */
862			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
863				fib_flush(dev_net(dev));
864		}
865	}
866#undef LOCAL_OK
867#undef BRD_OK
868#undef BRD0_OK
869#undef BRD1_OK
870}
871
872static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
873{
874
875	struct fib_result       res;
876	struct flowi4           fl4 = {
877		.flowi4_mark = frn->fl_mark,
878		.daddr = frn->fl_addr,
879		.flowi4_tos = frn->fl_tos,
880		.flowi4_scope = frn->fl_scope,
881	};
882
883#ifdef CONFIG_IP_MULTIPLE_TABLES
884	res.r = NULL;
885#endif
886
887	frn->err = -ENOENT;
888	if (tb) {
889		local_bh_disable();
890
891		frn->tb_id = tb->tb_id;
892		rcu_read_lock();
893		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
894
895		if (!frn->err) {
896			frn->prefixlen = res.prefixlen;
897			frn->nh_sel = res.nh_sel;
898			frn->type = res.type;
899			frn->scope = res.scope;
900		}
901		rcu_read_unlock();
902		local_bh_enable();
903	}
904}
905
906static void nl_fib_input(struct sk_buff *skb)
907{
908	struct net *net;
909	struct fib_result_nl *frn;
910	struct nlmsghdr *nlh;
911	struct fib_table *tb;
912	u32 pid;
913
914	net = sock_net(skb->sk);
915	nlh = nlmsg_hdr(skb);
916	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
917	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
918		return;
919
920	skb = skb_clone(skb, GFP_KERNEL);
921	if (skb == NULL)
922		return;
923	nlh = nlmsg_hdr(skb);
924
925	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
926	tb = fib_get_table(net, frn->tb_id_in);
927
928	nl_fib_lookup(frn, tb);
929
930	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
931	NETLINK_CB(skb).pid = 0;        /* from kernel */
932	NETLINK_CB(skb).dst_group = 0;  /* unicast */
933	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
934}
935
936static int __net_init nl_fib_lookup_init(struct net *net)
937{
938	struct sock *sk;
939	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
940				   nl_fib_input, NULL, THIS_MODULE);
941	if (sk == NULL)
942		return -EAFNOSUPPORT;
943	net->ipv4.fibnl = sk;
944	return 0;
945}
946
947static void nl_fib_lookup_exit(struct net *net)
948{
949	netlink_kernel_release(net->ipv4.fibnl);
950	net->ipv4.fibnl = NULL;
951}
952
/*
 * Take IPv4 routing down on a device: sync the FIB nexthops down
 * (flushing the FIB if that reports a change), flush the routing cache
 * with the given delay and take ARP down for the device.
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, force))
		fib_flush(net);

	rt_cache_flush(net, delay);
	arp_ifdown(dev);
}
960
961static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
962{
963	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
964	struct net_device *dev = ifa->ifa_dev->dev;
965	struct net *net = dev_net(dev);
966
967	switch (event) {
968	case NETDEV_UP:
969		fib_add_ifaddr(ifa);
970#ifdef CONFIG_IP_ROUTE_MULTIPATH
971		fib_sync_up(dev);
972#endif
973		atomic_inc(&net->ipv4.dev_addr_genid);
974		rt_cache_flush(dev_net(dev), -1);
975		break;
976	case NETDEV_DOWN:
977		fib_del_ifaddr(ifa, NULL);
978		atomic_inc(&net->ipv4.dev_addr_genid);
979		if (ifa->ifa_dev->ifa_list == NULL) {
980			/* Last address was deleted from this interface.
981			 * Disable IP.
982			 */
983			fib_disable_ip(dev, 1, 0);
984		} else {
985			rt_cache_flush(dev_net(dev), -1);
986		}
987		break;
988	}
989	return NOTIFY_DONE;
990}
991
992static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
993{
994	struct net_device *dev = ptr;
995	struct in_device *in_dev = __in_dev_get_rtnl(dev);
996	struct net *net = dev_net(dev);
997
998	if (event == NETDEV_UNREGISTER) {
999		fib_disable_ip(dev, 2, -1);
1000		return NOTIFY_DONE;
1001	}
1002
1003	if (!in_dev)
1004		return NOTIFY_DONE;
1005
1006	switch (event) {
1007	case NETDEV_UP:
1008		for_ifa(in_dev) {
1009			fib_add_ifaddr(ifa);
1010		} endfor_ifa(in_dev);
1011#ifdef CONFIG_IP_ROUTE_MULTIPATH
1012		fib_sync_up(dev);
1013#endif
1014		atomic_inc(&net->ipv4.dev_addr_genid);
1015		rt_cache_flush(dev_net(dev), -1);
1016		break;
1017	case NETDEV_DOWN:
1018		fib_disable_ip(dev, 0, 0);
1019		break;
1020	case NETDEV_CHANGEMTU:
1021	case NETDEV_CHANGE:
1022		rt_cache_flush(dev_net(dev), 0);
1023		break;
1024	case NETDEV_UNREGISTER_BATCH:
1025		/* The batch unregister is only called on the first
1026		 * device in the list of devices being unregistered.
1027		 * Therefore we should not pass dev_net(dev) in here.
1028		 */
1029		rt_cache_flush_batch(NULL);
1030		break;
1031	}
1032	return NOTIFY_DONE;
1033}
1034
1035static struct notifier_block fib_inetaddr_notifier = {
1036	.notifier_call = fib_inetaddr_event,
1037};
1038
1039static struct notifier_block fib_netdev_notifier = {
1040	.notifier_call = fib_netdev_event,
1041};
1042
1043static int __net_init ip_fib_net_init(struct net *net)
1044{
1045	int err;
1046	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1047
1048	/* Avoid false sharing : Use at least a full cache line */
1049	size = max_t(size_t, size, L1_CACHE_BYTES);
1050
1051	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1052	if (net->ipv4.fib_table_hash == NULL)
1053		return -ENOMEM;
1054
1055	err = fib4_rules_init(net);
1056	if (err < 0)
1057		goto fail;
1058	return 0;
1059
1060fail:
1061	kfree(net->ipv4.fib_table_hash);
1062	return err;
1063}
1064
1065static void ip_fib_net_exit(struct net *net)
1066{
1067	unsigned int i;
1068
1069#ifdef CONFIG_IP_MULTIPLE_TABLES
1070	fib4_rules_exit(net);
1071#endif
1072
1073	rtnl_lock();
1074	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1075		struct fib_table *tb;
1076		struct hlist_head *head;
1077		struct hlist_node *node, *tmp;
1078
1079		head = &net->ipv4.fib_table_hash[i];
1080		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1081			hlist_del(node);
1082			fib_table_flush(tb);
1083			fib_free_table(tb);
1084		}
1085	}
1086	rtnl_unlock();
1087	kfree(net->ipv4.fib_table_hash);
1088}
1089
1090static int __net_init fib_net_init(struct net *net)
1091{
1092	int error;
1093
1094	error = ip_fib_net_init(net);
1095	if (error < 0)
1096		goto out;
1097	error = nl_fib_lookup_init(net);
1098	if (error < 0)
1099		goto out_nlfl;
1100	error = fib_proc_init(net);
1101	if (error < 0)
1102		goto out_proc;
1103out:
1104	return error;
1105
1106out_proc:
1107	nl_fib_lookup_exit(net);
1108out_nlfl:
1109	ip_fib_net_exit(net);
1110	goto out;
1111}
1112
1113static void __net_exit fib_net_exit(struct net *net)
1114{
1115	fib_proc_exit(net);
1116	nl_fib_lookup_exit(net);
1117	ip_fib_net_exit(net);
1118}
1119
1120static struct pernet_operations fib_net_ops = {
1121	.init = fib_net_init,
1122	.exit = fib_net_exit,
1123};
1124
1125void __init ip_fib_init(void)
1126{
1127	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
1128	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
1129	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
1130
1131	register_pernet_subsys(&fib_net_ops);
1132	register_netdevice_notifier(&fib_netdev_notifier);
1133	register_inetaddr_notifier(&fib_inetaddr_notifier);
1134
1135	fib_trie_init();
1136}
1137