ip_output.c revision 7ad6848c7e81a603605fad3f3575841aab004eea
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case a packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

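/*
 * Finalise the IP header (total length and checksum) and run the skb
 * through the LOCAL_OUT netfilter hook.  Returns 1 when the hook passes
 * the packet back and the caller should hand it to dst_output().
 */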
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

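/*
 * Send a locally generated packet: run the LOCAL_OUT hook via
 * __ip_local_out() and, if the packet is passed back to us (return
 * value 1), hand the skb to dst_output() for transmission.
 */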
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx(newskb);
	return 0;
}

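/*
 * Pick the TTL for an outgoing packet: the per-socket unicast TTL if one
 * has been set, otherwise the route's RTAX_HOPLIMIT metric.
 */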
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an IP header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->u.dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

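/*
 * Final step of output: make sure there is enough headroom for the link
 * layer header, then hand the skb to the cached hard header output path
 * or to the neighbour's output function.
 */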
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

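/*
 * MTU to use when deciding whether to fragment: the device MTU when the
 * sending socket asked for IP_PMTUDISC_PROBE, otherwise the path MTU
 * stored in the destination entry.
 */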
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

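/*
 * POST_ROUTING okfn: if a policy lookup after SNAT attached a transformer
 * bundle, mark the skb as rerouted and send it back through dst_output();
 * otherwise fragment oversized non-GSO packets and pass everything else
 * on to ip_finish_output2().
 */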
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

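/*
 * Output path for multicast (and broadcast) packets: loop a copy back to
 * local listeners where required, drop TTL 0 multicasts at the host, and
 * send the original through the POST_ROUTING hook to ip_finish_output();
 * packets already rerouted after SNAT skip the hook.
 */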
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

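/*
 * Standard unicast output path: account the packet, then pass it through
 * the POST_ROUTING hook on its way to ip_finish_output(); packets already
 * rerouted after SNAT skip the hook.
 */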
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

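/*
 * Queue a packet from a connected transport socket (TCP, SCTP, ...) for
 * transmission: route it if the caller has not already done so, build the
 * IP header from the socket state and hand it to ip_local_out().
 */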
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .mark = sk->sk_mark,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->inet_saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .flags = inet_sk_flowi_flags(sk),
					    .uli_u = { .ports =
						       { .sport = inet->inet_sport,
							 .dport = inet->inet_dport } } };

			/* If this fails, the transport layer's retransmit
			 * mechanism will keep trying until the route appears
			 * or the connection times out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb_dst_set(skb, dst_clone(&rt->u.dst));

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* The transport layer has already set up skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	return ip_local_out(skb);

no_route:
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}


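/*
 * Copy the per-packet metadata (route, priority, mark, netfilter state,
 * ...) from the original skb to a freshly created fragment.
 */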
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each of size equal to the IP header plus a block
 *	of the data of the original IP data part) that will still fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs, pad;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frags(skb)) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			truesizes += frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	pad = nf_bridge_pad(skb);
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
	mtu -= pad;

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each
		 *		   fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}

EXPORT_SYMBOL(ip_fragment);

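/*
 * Generic getfrag callback for ip_append_data(): copy user iovec data
 * into the skb, folding a checksum in as we go unless the hardware will
 * checksum the packet (CHECKSUM_PARTIAL).
 */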
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

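/*
 * Checksum 'copy' bytes of a page starting at 'offset'; used when page
 * fragments are appended to a CHECKSUM_NONE skb.
 */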
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

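/*
 * Append data for a UDP socket on a UFO-capable device: build (or extend)
 * a single large skb and let the device segment it into MTU-sized
 * fragments, recording the fragment size in gso_size.
 */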
static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by the network
	 * device, so create a single skb packet containing the complete
	 * udp datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		rt = *rtp;
		if (unlikely(!rt))
			return -EFAULT;
		/*
		 * We steal the reference to this route; the caller should
		 * not release it.
		 */
		*rtp = NULL;
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->u.dst.dev->mtu :
					    dst_mtu(rt->u.dst.path);
		inet->cork.dst = &rt->u.dst;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = (struct rtable *)inet->cork.dst;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	inet->cork.length += length;
	if (((length > mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each of the segments is an IP fragment ready for sending to the
	 * network after adding an appropriate IP header.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at the tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					ipc->shtx.flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			*skb_tx(skb) = ipc->shtx;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

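/*
 * Zero-copy companion to ip_append_data(): attach caller-supplied page
 * fragments to the pending datagram instead of copying the payload.
 */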
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

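/* Release the resources held while the socket is corked: options and route. */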
static void ip_cork_release(struct inet_sock *inet)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	dst_release(inet->cork.dst);
	inet->cork.dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)inet->cork.dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow fragmenting the frame generated here. No matter how the
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->u.dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on the dst refcount.
	 */
	inet->cork.dst = NULL;
	skb_dst_set(skb, &rt->u.dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets the whole, unfragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(inet_sk(sk));
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.shtx.flags = 0;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol,
				    .flags = ip_reply_arg_flowi_flags(arg) };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sock_net(sk), &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not re-enterable, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);