/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

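/* Most of these tunables are also exported read-write through the
 * /proc/sys/net/ipv4/route/ sysctl directory, so the values above are
 * only the boot-time defaults.
 */
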
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

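/* IPv4 routes take their metrics from the FIB entry they were built
 * from (see rt_set_nexthop() below), so the generic dst layer should
 * never need to copy-on-write them; reaching this callback indicates
 * a bug.
 */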
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

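/* The 16 entries below are indexed by the IPv4 TOS bits shifted right
 * by one (see rt_tos2priority() in <net/route.h>); roughly speaking,
 * odd indices correspond to the historical "minimize monetary cost"
 * TOS bit, hence the ECN_OR_COST() entries.
 */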
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
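/* The IPv4 routing cache was removed in Linux 3.6, but the
 * /proc/net/rt_cache file is kept for compatibility; it now only ever
 * emits the header line below.
 */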
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t	id;
	u32		stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
		delta = prandom_u32_max(now - old);

	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sk ? sock_i_uid(sk) : GLOBAL_ROOT_UID);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0,
			   sock_i_uid(sk));
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		rt_free(rt);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		rt_free(rt);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

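/* Record or refresh a next-hop exception (fnhe) for @daddr: learned
 * PMTU values and ICMP redirect gateways are stored per nexthop in a
 * small hash table rather than in the FIB itself, and any routes
 * already cached against the nexthop are marked stale so they pick up
 * the exception on their next validation.
 */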
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending
 *	   redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
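
/* Example with the defaults above: the enforced gap before redirect
 * number k is roughly ip_rt_redirect_load << k, i.e. ~40 ms, ~80 ms,
 * ... up to ~5 s, and once ip_rt_redirect_number redirects have been
 * ignored we stay quiet until ip_rt_redirect_silence (~20 s) passes
 * without further packets needing a redirect.
 */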

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything and set
	 * peer->rate_last to the time of the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

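/* Rate-limit ICMP errors with a classic token bucket per peer: tokens
 * accrue one per jiffy up to ip_rt_error_burst (five seconds' worth)
 * and each ICMP sent costs ip_rt_error_cost (one second's worth),
 * i.e. at most ~5 errors in a burst and ~1 per second sustained.
 */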
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled: in_dev may legitimately be NULL
	 * here (e.g. while the device is going away), so bail out before
	 * dereferencing it.
	 */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

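/* Default advertised MSS: the device MTU minus 40 bytes for the
 * minimal IPv4 + TCP headers, clamped to at least ip_rt_min_advmss
 * and to the 16-bit limit.
 */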
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

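/* Effective MTU for a route: a still-valid learned PMTU wins, then
 * any MTU set in the route metrics, then the device MTU. The 576
 * clamp for locked-metric gateway routes matches the classic minimum
 * MTU assumption for off-link destinations.
 */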
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (!(rt->dst.flags & DST_NOCACHE)) {
			rcu_assign_pointer(*porig, rt);
			if (orig)
				rt_free(orig);
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

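/* Try to park @rt in the nexthop cache slot (the shared input slot or
 * this CPU's output slot) with a lock-free cmpxchg; the previous
 * occupant, if any, is freed after an RCU grace period. Returns false
 * when we raced with another writer and the route was not cached.
 */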
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

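/* All IPv4 dsts are born with DST_OBSOLETE_FORCE_CHK (see
 * ipv4_dst_check() above); routes we do not intend to cache also get
 * DST_NOCACHE so they are freed with their last reference.
 */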
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe != NULL)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it at most once when daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid_ipv4(net);
	rth->rt_flags	= flags | RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	res.fi = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from the route cache to
	   here. The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-( As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries, sort of SDR messages from all the world. Now we
	   try to get rid of them. Really, provided the software IP
	   multicast filter is organized reasonably (at least, hashed), it
	   does not result in a slowdown compared with route cache reject
	   entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int err = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return err;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the default
		 * one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth_output;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}

/*
 * Major route resolver routine.
 */

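/* Roughly: validate any requested source address, honour a bound
 * output interface, fall back to the loopback route when no
 * destination is given, and otherwise consult the FIB and build (or
 * reuse) a cached rtable via __mkroute_output().
 */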
2004struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2005{
2006	struct net_device *dev_out = NULL;
2007	__u8 tos = RT_FL_TOS(fl4);
2008	unsigned int flags = 0;
2009	struct fib_result res;
2010	struct rtable *rth;
2011	int orig_oif;
2012
2013	res.tclassid	= 0;
2014	res.fi		= NULL;
2015	res.table	= NULL;
2016
2017	orig_oif = fl4->flowi4_oif;
2018
2019	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2020	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2021	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2022			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2023
2024	rcu_read_lock();
2025	if (fl4->saddr) {
2026		rth = ERR_PTR(-EINVAL);
2027		if (ipv4_is_multicast(fl4->saddr) ||
2028		    ipv4_is_lbcast(fl4->saddr) ||
2029		    ipv4_is_zeronet(fl4->saddr))
2030			goto out;
2031
		/* The check for oif == dev_out->oif was removed here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: a user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (the
			   routing cache cannot know that ttl is zero, so the
			   packet will not leave this host and the route is
			   valid).  Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even
			   if it has NO routes and NO assigned addresses.
			   When oif is specified, the routing tables are
			   looked up with only one purpose: to catch whether
			   the destination is gatewayed, rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet, ignoring both the routing tables and
			   ifaddr state. --ANK

			   We could make this work even when oif is unknown,
			   as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

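/*
 * Blackhole dst_ops: every operation below is deliberately inert.
 * ipv4_blackhole_dst_check() always returns NULL so cached users
 * re-do the lookup, PMTU updates and redirects are ignored, and
 * metrics are never copied on write.  These ops back the routes
 * built by ipv4_blackhole_route() further down.
 */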
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

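/*
 * Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic: input and output are wired to dst_discard/dst_discard_sk,
 * while enough of the original route (device, flags, type, gateway,
 * PMTU) is copied so callers can keep holding the dst.  The reference
 * on @dst_orig is always released.
 */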
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_sk;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

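/*
 * ip_route_output_flow() is the usual entry point for output route
 * lookups that may need an xfrm (IPsec) transformation.  A minimal
 * usage sketch (hypothetical caller, not code from this file; "daddr"
 * and "sk" stand for a caller-supplied destination and socket):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * Setting flowi4_proto opts in to the xfrm_lookup_route() pass below.
 */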
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

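/*
 * Fill an RTM_NEWROUTE netlink message describing the route attached
 * to @skb, using @fl4 for the addresses actually chosen by the lookup.
 * Returns the message size on success or -EMSGSIZE if @skb is full.
 */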
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

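/*
 * Netlink handler for RTM_GETROUTE: build a dummy skb, resolve the
 * route (through ip_route_input() when RTA_IIF is given, otherwise
 * through ip_route_output_key()) and unicast the result back to the
 * requester via rt_fill_info().
 */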
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input() enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

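/* A multicast configuration change invalidates all cached routes. */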
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;

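/*
 * Writing anything to /proc/sys/net/ipv4/route/flush drops the cached
 * routes and nexthop exceptions of the netns: rt_cache_flush() bumps
 * the route genid and fnhe_genid_bump() the exception genid.  Reads
 * are rejected (the table entry below is mode 0200).
 */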
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms. */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

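/*
 * Per-netns initialisation of the generation counters used to
 * invalidate cached routes (rt_genid) and cached nexthop exceptions
 * (fnhe_genid); dev_addr_genid is seeded randomly.
 */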
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

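/*
 * Each netns gets its own inet_peer_base; the exit handler invalidates
 * the peer tree before freeing it.
 */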
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

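/*
 * Boot-time initialisation of the IPv4 routing subsystem: allocate the
 * IP-ID array and per-cpu accounting, set up the dst kmem cache and
 * entry counters, initialise devinet and the FIB, create the /proc
 * files, and register the RTM_GETROUTE handler, sysctls and pernet
 * operations.
 */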
int __init ip_rt_init(void)
{
	int rc = 0;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order; then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif