ip_output.c revision e281b19897dc21c1071802808d461627d747a877
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case the packet is not accepted by
27 *					output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84
85/* Generate a checksum for an outgoing IP datagram. */
86__inline__ void ip_send_check(struct iphdr *iph)
87{
88	iph->check = 0;
89	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
90}
91
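/*
 * Finalise the IP header (total length and checksum) and run the
 * NF_INET_LOCAL_OUT netfilter hook.  A return value of 1 means the
 * packet was accepted and the caller must still hand it to dst_output();
 * any other value means netfilter queued, stole or dropped the skb.
 */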
92int __ip_local_out(struct sk_buff *skb)
93{
94	struct iphdr *iph = ip_hdr(skb);
95
96	iph->tot_len = htons(skb->len);
97	ip_send_check(iph);
98	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
99		       skb_dst(skb)->dev, dst_output);
100}
101
102int ip_local_out(struct sk_buff *skb)
103{
104	int err;
105
106	err = __ip_local_out(skb);
107	if (likely(err == 1))
108		err = dst_output(skb);
109
110	return err;
111}
112EXPORT_SYMBOL_GPL(ip_local_out);
113
114/* dev_loopback_xmit for use with netfilter. */
115static int ip_dev_loopback_xmit(struct sk_buff *newskb)
116{
117	skb_reset_mac_header(newskb);
118	__skb_pull(newskb, skb_network_offset(newskb));
119	newskb->pkt_type = PACKET_LOOPBACK;
120	newskb->ip_summed = CHECKSUM_UNNECESSARY;
121	WARN_ON(!skb_dst(newskb));
122	netif_rx(newskb);
123	return 0;
124}
125
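/*
 * Choose the TTL for an outgoing packet: the per-socket unicast TTL if
 * the user configured one (uc_ttl >= 0), otherwise the route's hop-limit
 * metric, which normally derives from sysctl_ip_default_ttl.
 */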
126static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
127{
128	int ttl = inet->uc_ttl;
129
130	if (ttl < 0)
131		ttl = dst_metric(dst, RTAX_HOPLIMIT);
132	return ttl;
133}
134
135/*
136 *		Add an ip header to a skbuff and send it out.
137 *
138 */
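/*
 * The caller must already have attached a route to the skb; this path is
 * used by callers that build a reply directly, such as the TCP SYN-ACK
 * code.
 */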
139int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
140			  __be32 saddr, __be32 daddr, struct ip_options *opt)
141{
142	struct inet_sock *inet = inet_sk(sk);
143	struct rtable *rt = skb_rtable(skb);
144	struct iphdr *iph;
145
146	/* Build the IP header. */
147	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
148	skb_reset_network_header(skb);
149	iph = ip_hdr(skb);
150	iph->version  = 4;
151	iph->ihl      = 5;
152	iph->tos      = inet->tos;
153	if (ip_dont_fragment(sk, &rt->u.dst))
154		iph->frag_off = htons(IP_DF);
155	else
156		iph->frag_off = 0;
157	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
158	iph->daddr    = rt->rt_dst;
159	iph->saddr    = rt->rt_src;
160	iph->protocol = sk->sk_protocol;
161	ip_select_ident(iph, &rt->u.dst, sk);
162
163	if (opt && opt->optlen) {
164		iph->ihl += opt->optlen>>2;
165		ip_options_build(skb, opt, daddr, rt, 0);
166	}
167
168	skb->priority = sk->sk_priority;
169	skb->mark = sk->sk_mark;
170
171	/* Send it out. */
172	return ip_local_out(skb);
173}
174
175EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
176
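/*
 * Last IPv4 step before the device: make sure the skb has enough headroom
 * for the link-layer header, then transmit via the cached hardware header
 * (dst->hh) when one exists, or let the neighbour layer build it.
 */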
177static inline int ip_finish_output2(struct sk_buff *skb)
178{
179	struct dst_entry *dst = skb_dst(skb);
180	struct rtable *rt = (struct rtable *)dst;
181	struct net_device *dev = dst->dev;
182	unsigned int hh_len = LL_RESERVED_SPACE(dev);
183
184	if (rt->rt_type == RTN_MULTICAST) {
185		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
186	} else if (rt->rt_type == RTN_BROADCAST)
187		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
188
189	/* Be paranoid, rather than too clever. */
190	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
191		struct sk_buff *skb2;
192
193		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
194		if (skb2 == NULL) {
195			kfree_skb(skb);
196			return -ENOMEM;
197		}
198		if (skb->sk)
199			skb_set_owner_w(skb2, skb->sk);
200		kfree_skb(skb);
201		skb = skb2;
202	}
203
204	if (dst->hh)
205		return neigh_hh_output(dst->hh, skb);
206	else if (dst->neighbour)
207		return dst->neighbour->output(skb);
208
209	if (net_ratelimit())
210		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
211	kfree_skb(skb);
212	return -EINVAL;
213}
214
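/*
 * MTU used for fragmentation decisions: with IP_PMTUDISC_PROBE the socket
 * is probing the path, so use the device MTU rather than the (possibly
 * smaller) cached path MTU.
 */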
215static inline int ip_skb_dst_mtu(struct sk_buff *skb)
216{
217	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
218
219	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
220	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
221}
222
223static int ip_finish_output(struct sk_buff *skb)
224{
225#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
226	/* Policy lookup after SNAT yielded a new policy */
227	if (skb_dst(skb)->xfrm != NULL) {
228		IPCB(skb)->flags |= IPSKB_REROUTED;
229		return dst_output(skb);
230	}
231#endif
232	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
233		return ip_fragment(skb, ip_finish_output2);
234	else
235		return ip_finish_output2(skb);
236}
237
238int ip_mc_output(struct sk_buff *skb)
239{
240	struct sock *sk = skb->sk;
241	struct rtable *rt = skb_rtable(skb);
242	struct net_device *dev = rt->u.dst.dev;
243
244	/*
245	 *	If the indicated interface is up and running, send the packet.
246	 */
247	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
248
249	skb->dev = dev;
250	skb->protocol = htons(ETH_P_IP);
251
252	/*
253	 *	Multicasts are looped back for other local users
254	 */
255
256	if (rt->rt_flags&RTCF_MULTICAST) {
257		if (sk_mc_loop(sk)
258#ifdef CONFIG_IP_MROUTE
259		/* Small optimization: do not loop back non-local frames
260		   that came back after forwarding; they will be dropped
261		   by ip_mr_input in any case.
262		   Note that local frames are looped back so that they can
263		   be delivered to local recipients.
264
265		   This check is duplicated in ip_mr_input at the moment.
266		 */
267		    &&
268		    ((rt->rt_flags & RTCF_LOCAL) ||
269		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
270#endif
271		   ) {
272			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
273			if (newskb)
274				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
275					newskb, NULL, newskb->dev,
276					ip_dev_loopback_xmit);
277		}
278
279		/* Multicasts with ttl 0 must not go beyond the host */
280
281		if (ip_hdr(skb)->ttl == 0) {
282			kfree_skb(skb);
283			return 0;
284		}
285	}
286
287	if (rt->rt_flags&RTCF_BROADCAST) {
288		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
289		if (newskb)
290			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
291				NULL, newskb->dev, ip_dev_loopback_xmit);
292	}
293
294	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
295			    skb->dev, ip_finish_output,
296			    !(IPCB(skb)->flags & IPSKB_REROUTED));
297}
298
299int ip_output(struct sk_buff *skb)
300{
301	struct net_device *dev = skb_dst(skb)->dev;
302
303	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
304
305	skb->dev = dev;
306	skb->protocol = htons(ETH_P_IP);
307
308	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
309			    ip_finish_output,
310			    !(IPCB(skb)->flags & IPSKB_REROUTED));
311}
312EXPORT_SYMBOL_GPL(ip_output);
313
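/*
 * Main transmit routine for connection-oriented sockets (used by TCP and
 * SCTP).  If the skb is not routed yet, revalidate the socket's cached
 * route or look up a new one, then build the IP header and pass the
 * packet to ip_local_out().  A non-zero 'ipfragok' permits fragmentation
 * by leaving the DF bit clear.
 */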
314int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
315{
316	struct sock *sk = skb->sk;
317	struct inet_sock *inet = inet_sk(sk);
318	struct ip_options *opt = inet->opt;
319	struct rtable *rt;
320	struct iphdr *iph;
321
322	/* Skip all of this if the packet is already routed,
323	 * e.g. by something like SCTP.
324	 */
325	rt = skb_rtable(skb);
326	if (rt != NULL)
327		goto packet_routed;
328
329	/* Make sure we can route this packet. */
330	rt = (struct rtable *)__sk_dst_check(sk, 0);
331	if (rt == NULL) {
332		__be32 daddr;
333
334		/* Use correct destination address if we have options. */
335		daddr = inet->inet_daddr;
336		if(opt && opt->srr)
337			daddr = opt->faddr;
338
339		{
340			struct flowi fl = { .oif = sk->sk_bound_dev_if,
341					    .mark = sk->sk_mark,
342					    .nl_u = { .ip4_u =
343						      { .daddr = daddr,
344							.saddr = inet->inet_saddr,
345							.tos = RT_CONN_FLAGS(sk) } },
346					    .proto = sk->sk_protocol,
347					    .flags = inet_sk_flowi_flags(sk),
348					    .uli_u = { .ports =
349						       { .sport = inet->inet_sport,
350							 .dport = inet->inet_dport } } };
351
352			/* If this fails, the transport layer's retransmit mechanism
353			 * will keep trying until a route appears or the connection
354			 * times out.
355			 */
356			security_sk_classify_flow(sk, &fl);
357			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
358				goto no_route;
359		}
360		sk_setup_caps(sk, &rt->u.dst);
361	}
362	skb_dst_set(skb, dst_clone(&rt->u.dst));
363
364packet_routed:
365	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
366		goto no_route;
367
368	/* OK, we know where to send it, allocate and build IP header. */
369	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
370	skb_reset_network_header(skb);
371	iph = ip_hdr(skb);
372	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
373	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
374		iph->frag_off = htons(IP_DF);
375	else
376		iph->frag_off = 0;
377	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
378	iph->protocol = sk->sk_protocol;
379	iph->saddr    = rt->rt_src;
380	iph->daddr    = rt->rt_dst;
381	/* The transport layer has already set up the transport header. */
382
383	if (opt && opt->optlen) {
384		iph->ihl += opt->optlen >> 2;
385		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
386	}
387
388	ip_select_ident_more(iph, &rt->u.dst, sk,
389			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
390
391	skb->priority = sk->sk_priority;
392	skb->mark = sk->sk_mark;
393
394	return ip_local_out(skb);
395
396no_route:
397	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
398	kfree_skb(skb);
399	return -EHOSTUNREACH;
400}
401
402
403static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
404{
405	to->pkt_type = from->pkt_type;
406	to->priority = from->priority;
407	to->protocol = from->protocol;
408	skb_dst_drop(to);
409	skb_dst_set(to, dst_clone(skb_dst(from)));
410	to->dev = from->dev;
411	to->mark = from->mark;
412
413	/* Copy the flags to each fragment. */
414	IPCB(to)->flags = IPCB(from)->flags;
415
416#ifdef CONFIG_NET_SCHED
417	to->tc_index = from->tc_index;
418#endif
419	nf_copy(to, from);
420#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
421    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
422	to->nf_trace = from->nf_trace;
423#endif
424#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
425	to->ipvs_property = from->ipvs_property;
426#endif
427	skb_copy_secmark(to, from);
428}
429
430/*
431 *	This IP datagram is too large to be sent in one piece.  Break it up into
432 *	smaller pieces (each consisting of an IP header plus a block of the
433 *	original datagram's data) so that each piece fits in a single
434 *	device frame, and queue each such frame for sending.
435 */
436
437int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
438{
439	struct iphdr *iph;
440	int raw = 0;
441	int ptr;
442	struct net_device *dev;
443	struct sk_buff *skb2;
444	unsigned int mtu, hlen, left, len, ll_rs, pad;
445	int offset;
446	__be16 not_last_frag;
447	struct rtable *rt = skb_rtable(skb);
448	int err = 0;
449
450	dev = rt->u.dst.dev;
451
452	/*
453	 *	Point into the IP datagram header.
454	 */
455
456	iph = ip_hdr(skb);
457
458	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
459		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
460		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
461			  htonl(ip_skb_dst_mtu(skb)));
462		kfree_skb(skb);
463		return -EMSGSIZE;
464	}
465
466	/*
467	 *	Setup starting values.
468	 */
469
470	hlen = iph->ihl * 4;
471	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
472	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
473
474	/* When a frag_list is given, use it. First, check its validity:
475	 * some transformers may create a wrong frag_list or break an existing
476	 * one; that is not prohibited. In such a case fall back to copying.
477	 *
478	 * LATER: this step could be merged into the real generation of
479	 * fragments; we could switch to copying at the first bad fragment.
480	 */
481	if (skb_has_frags(skb)) {
482		struct sk_buff *frag;
483		int first_len = skb_pagelen(skb);
484		int truesizes = 0;
485
486		if (first_len - hlen > mtu ||
487		    ((first_len - hlen) & 7) ||
488		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
489		    skb_cloned(skb))
490			goto slow_path;
491
492		skb_walk_frags(skb, frag) {
493			/* Correct geometry. */
494			if (frag->len > mtu ||
495			    ((frag->len & 7) && frag->next) ||
496			    skb_headroom(frag) < hlen)
497			    goto slow_path;
498
499			/* Partially cloned skb? */
500			if (skb_shared(frag))
501				goto slow_path;
502
503			BUG_ON(frag->sk);
504			if (skb->sk) {
505				frag->sk = skb->sk;
506				frag->destructor = sock_wfree;
507			}
508			truesizes += frag->truesize;
509		}
510
511		/* Everything is OK. Generate! */
512
513		err = 0;
514		offset = 0;
515		frag = skb_shinfo(skb)->frag_list;
516		skb_frag_list_init(skb);
517		skb->data_len = first_len - skb_headlen(skb);
518		skb->truesize -= truesizes;
519		skb->len = first_len;
520		iph->tot_len = htons(first_len);
521		iph->frag_off = htons(IP_MF);
522		ip_send_check(iph);
523
524		for (;;) {
525			/* Prepare the header of the next frame
526			 * before the previous one goes out. */
527			if (frag) {
528				frag->ip_summed = CHECKSUM_NONE;
529				skb_reset_transport_header(frag);
530				__skb_push(frag, hlen);
531				skb_reset_network_header(frag);
532				memcpy(skb_network_header(frag), iph, hlen);
533				iph = ip_hdr(frag);
534				iph->tot_len = htons(frag->len);
535				ip_copy_metadata(frag, skb);
536				if (offset == 0)
537					ip_options_fragment(frag);
538				offset += skb->len - hlen;
539				iph->frag_off = htons(offset>>3);
540				if (frag->next != NULL)
541					iph->frag_off |= htons(IP_MF);
542				/* Ready, complete checksum */
543				ip_send_check(iph);
544			}
545
546			err = output(skb);
547
548			if (!err)
549				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
550			if (err || !frag)
551				break;
552
553			skb = frag;
554			frag = skb->next;
555			skb->next = NULL;
556		}
557
558		if (err == 0) {
559			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
560			return 0;
561		}
562
563		while (frag) {
564			skb = frag->next;
565			kfree_skb(frag);
566			frag = skb;
567		}
568		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
569		return err;
570	}
571
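	/*
	 * Slow path: no usable frag_list, so allocate a fresh skb for every
	 * fragment and copy the data into it.
	 */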
572slow_path:
573	left = skb->len - hlen;		/* Space per frame */
574	ptr = raw + hlen;		/* Where to start from */
575
576	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
577	 * we need to make room for the encapsulating header
578	 */
579	pad = nf_bridge_pad(skb);
580	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
581	mtu -= pad;
582
583	/*
584	 *	Fragment the datagram.
585	 */
586
587	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
588	not_last_frag = iph->frag_off & htons(IP_MF);
589
590	/*
591	 *	Keep copying data until we run out.
592	 */
593
594	while (left > 0) {
595		len = left;
596		/* IF: it doesn't fit, use 'mtu' - the data space left */
597		if (len > mtu)
598			len = mtu;
599		/* IF: we are not sending up to and including the packet end,
600		   then align the next start on an eight-byte boundary */
601		if (len < left)	{
602			len &= ~7;
603		}
604		/*
605		 *	Allocate buffer.
606		 */
607
608		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
609			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
610			err = -ENOMEM;
611			goto fail;
612		}
613
614		/*
615		 *	Set up data on packet
616		 */
617
618		ip_copy_metadata(skb2, skb);
619		skb_reserve(skb2, ll_rs);
620		skb_put(skb2, len + hlen);
621		skb_reset_network_header(skb2);
622		skb2->transport_header = skb2->network_header + hlen;
623
624		/*
625		 *	Charge the memory for the fragment to any owner
626		 *	it might possess
627		 */
628
629		if (skb->sk)
630			skb_set_owner_w(skb2, skb->sk);
631
632		/*
633		 *	Copy the packet header into the new buffer.
634		 */
635
636		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
637
638		/*
639		 *	Copy a block of the IP datagram.
640		 */
641		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
642			BUG();
643		left -= len;
644
645		/*
646		 *	Fill in the new header fields.
647		 */
648		iph = ip_hdr(skb2);
649		iph->frag_off = htons((offset >> 3));
650
651		/* ANK: dirty, but effective trick. Upgrade options only if
652		 * the segment to be fragmented was THE FIRST (otherwise,
653		 * options are already fixed) and make it ONCE
654		 * on the initial skb, so that all the following fragments
655		 * will inherit fixed options.
656		 */
657		if (offset == 0)
658			ip_options_fragment(skb);
659
660		/*
661		 *	Added AC : If we are fragmenting a fragment that's not the
662		 *		   last fragment then keep the MF bit set on each fragment
663		 */
664		if (left > 0 || not_last_frag)
665			iph->frag_off |= htons(IP_MF);
666		ptr += len;
667		offset += len;
668
669		/*
670		 *	Put this fragment into the sending queue.
671		 */
672		iph->tot_len = htons(len + hlen);
673
674		ip_send_check(iph);
675
676		err = output(skb2);
677		if (err)
678			goto fail;
679
680		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
681	}
682	kfree_skb(skb);
683	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
684	return err;
685
686fail:
687	kfree_skb(skb);
688	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
689	return err;
690}
691
692EXPORT_SYMBOL(ip_fragment);
693
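/*
 * getfrag() callback for data coming from user space: copy the requested
 * bytes from the iovec into the skb and, unless the hardware will
 * checksum the packet (CHECKSUM_PARTIAL), fold the copied data into
 * skb->csum.  The UDP and raw-socket sendmsg paths typically pass this
 * to ip_append_data().
 */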
694int
695ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
696{
697	struct iovec *iov = from;
698
699	if (skb->ip_summed == CHECKSUM_PARTIAL) {
700		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
701			return -EFAULT;
702	} else {
703		__wsum csum = 0;
704		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
705			return -EFAULT;
706		skb->csum = csum_block_add(skb->csum, csum, odd);
707	}
708	return 0;
709}
710
711static inline __wsum
712csum_page(struct page *page, int offset, int copy)
713{
714	char *kaddr;
715	__wsum csum;
716	kaddr = kmap(page);
717	csum = csum_partial(kaddr + offset, copy, 0);
718	kunmap(page);
719	return csum;
720}
721
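/*
 * UDP fragmentation offload: instead of building IP fragments here, queue
 * one large skb with gso_size set to the per-fragment payload size so the
 * device (or the software GSO path) can segment it at transmit time.
 */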
722static inline int ip_ufo_append_data(struct sock *sk,
723			int getfrag(void *from, char *to, int offset, int len,
724			       int odd, struct sk_buff *skb),
725			void *from, int length, int hh_len, int fragheaderlen,
726			int transhdrlen, int mtu, unsigned int flags)
727{
728	struct sk_buff *skb;
729	int err;
730
731	/* The network device supports UDP fragmentation offload (UFO), so
732	 * create a single skb containing the complete UDP datagram and let
733	 * the device segment it
734	 */
735	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
736		skb = sock_alloc_send_skb(sk,
737			hh_len + fragheaderlen + transhdrlen + 20,
738			(flags & MSG_DONTWAIT), &err);
739
740		if (skb == NULL)
741			return err;
742
743		/* reserve space for Hardware header */
744		skb_reserve(skb, hh_len);
745
746		/* create space for UDP/IP header */
747		skb_put(skb, fragheaderlen + transhdrlen);
748
749		/* initialize network header pointer */
750		skb_reset_network_header(skb);
751
752		/* initialize protocol header pointer */
753		skb->transport_header = skb->network_header + fragheaderlen;
754
755		skb->ip_summed = CHECKSUM_PARTIAL;
756		skb->csum = 0;
757		sk->sk_sndmsg_off = 0;
758
759		/* specify the length of each IP datagram fragment */
760		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
761		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
762		__skb_queue_tail(&sk->sk_write_queue, skb);
763	}
764
765	return skb_append_datato_frags(sk, skb, getfrag, from,
766				       (length - transhdrlen));
767}
768
769/*
770 *	ip_append_data() and ip_append_page() can make one large IP datagram
771 *	from many pieces of data. Each piece will be held on the socket
772 *	until ip_push_pending_frames() is called. Each piece can be a page
773 *	or non-page data.
774 *
775 *	Not only UDP but other transport protocols - e.g. raw sockets - can
776 *	potentially use this interface.
777 *
778 *	LATER: length must be adjusted by pad at tail, when it is required.
779 */
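/*
 * Roughly how a datagram protocol drives this interface (a sketch only,
 * not the exact UDP code; 'msg', 'ipc', 'rt' and the error handling are
 * assumed to be set up by the caller):
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     transhdrlen, &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */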
780int ip_append_data(struct sock *sk,
781		   int getfrag(void *from, char *to, int offset, int len,
782			       int odd, struct sk_buff *skb),
783		   void *from, int length, int transhdrlen,
784		   struct ipcm_cookie *ipc, struct rtable **rtp,
785		   unsigned int flags)
786{
787	struct inet_sock *inet = inet_sk(sk);
788	struct sk_buff *skb;
789
790	struct ip_options *opt = NULL;
791	int hh_len;
792	int exthdrlen;
793	int mtu;
794	int copy;
795	int err;
796	int offset = 0;
797	unsigned int maxfraglen, fragheaderlen;
798	int csummode = CHECKSUM_NONE;
799	struct rtable *rt;
800
801	if (flags&MSG_PROBE)
802		return 0;
803
804	if (skb_queue_empty(&sk->sk_write_queue)) {
805		/*
806		 * setup for corking.
807		 */
808		opt = ipc->opt;
809		if (opt) {
810			if (inet->cork.opt == NULL) {
811				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
812				if (unlikely(inet->cork.opt == NULL))
813					return -ENOBUFS;
814			}
815			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
816			inet->cork.flags |= IPCORK_OPT;
817			inet->cork.addr = ipc->addr;
818		}
819		rt = *rtp;
820		if (unlikely(!rt))
821			return -EFAULT;
822		/*
823		 * We steal reference to this route, caller should not release it
824		 */
825		*rtp = NULL;
826		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
827					    rt->u.dst.dev->mtu :
828					    dst_mtu(rt->u.dst.path);
829		inet->cork.dst = &rt->u.dst;
830		inet->cork.length = 0;
831		sk->sk_sndmsg_page = NULL;
832		sk->sk_sndmsg_off = 0;
833		if ((exthdrlen = rt->u.dst.header_len) != 0) {
834			length += exthdrlen;
835			transhdrlen += exthdrlen;
836		}
837	} else {
838		rt = (struct rtable *)inet->cork.dst;
839		if (inet->cork.flags & IPCORK_OPT)
840			opt = inet->cork.opt;
841
842		transhdrlen = 0;
843		exthdrlen = 0;
844		mtu = inet->cork.fragsize;
845	}
846	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
847
848	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
849	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
850
851	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
852		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
853			       mtu-exthdrlen);
854		return -EMSGSIZE;
855	}
856
857	/*
858	 * transhdrlen > 0 means that this is the first fragment and we wish
859	 * it not to be fragmented later.
860	 */
861	if (transhdrlen &&
862	    length + fragheaderlen <= mtu &&
863	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
864	    !exthdrlen)
865		csummode = CHECKSUM_PARTIAL;
866
867	inet->cork.length += length;
868	if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
869	    (sk->sk_protocol == IPPROTO_UDP) &&
870	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
871		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
872					 fragheaderlen, transhdrlen, mtu,
873					 flags);
874		if (err)
875			goto error;
876		return 0;
877	}
878
879	/* So, what's going on in the loop below?
880	 *
881	 * We use the calculated fragment length to generate a chain of skbs;
882	 * each segment is an IP fragment that is ready to be sent to the
883	 * network once the appropriate IP header has been added.
884	 */
885
886	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
887		goto alloc_new_skb;
888
889	while (length > 0) {
890		/* Check if the remaining data fits into current packet. */
891		copy = mtu - skb->len;
892		if (copy < length)
893			copy = maxfraglen - skb->len;
894		if (copy <= 0) {
895			char *data;
896			unsigned int datalen;
897			unsigned int fraglen;
898			unsigned int fraggap;
899			unsigned int alloclen;
900			struct sk_buff *skb_prev;
901alloc_new_skb:
902			skb_prev = skb;
903			if (skb_prev)
904				fraggap = skb_prev->len - maxfraglen;
905			else
906				fraggap = 0;
907
908			/*
909			 * If remaining data exceeds the mtu,
910			 * we know we need more fragment(s).
911			 */
912			datalen = length + fraggap;
913			if (datalen > mtu - fragheaderlen)
914				datalen = maxfraglen - fragheaderlen;
915			fraglen = datalen + fragheaderlen;
916
917			if ((flags & MSG_MORE) &&
918			    !(rt->u.dst.dev->features&NETIF_F_SG))
919				alloclen = mtu;
920			else
921				alloclen = datalen + fragheaderlen;
922
923			/* The last fragment gets additional space at tail.
924			 * Note, with MSG_MORE we overallocate on fragments,
925			 * because we have no idea which fragment will be
926			 * the last.
927			 */
928			if (datalen == length + fraggap)
929				alloclen += rt->u.dst.trailer_len;
930
931			if (transhdrlen) {
932				skb = sock_alloc_send_skb(sk,
933						alloclen + hh_len + 15,
934						(flags & MSG_DONTWAIT), &err);
935			} else {
936				skb = NULL;
937				if (atomic_read(&sk->sk_wmem_alloc) <=
938				    2 * sk->sk_sndbuf)
939					skb = sock_wmalloc(sk,
940							   alloclen + hh_len + 15, 1,
941							   sk->sk_allocation);
942				if (unlikely(skb == NULL))
943					err = -ENOBUFS;
944				else
945					/* only the initial fragment is
946					   time stamped */
947					ipc->shtx.flags = 0;
948			}
949			if (skb == NULL)
950				goto error;
951
952			/*
953			 *	Fill in the control structures
954			 */
955			skb->ip_summed = csummode;
956			skb->csum = 0;
957			skb_reserve(skb, hh_len);
958			*skb_tx(skb) = ipc->shtx;
959
960			/*
961			 *	Find where to start putting bytes.
962			 */
963			data = skb_put(skb, fraglen);
964			skb_set_network_header(skb, exthdrlen);
965			skb->transport_header = (skb->network_header +
966						 fragheaderlen);
967			data += fragheaderlen;
968
969			if (fraggap) {
970				skb->csum = skb_copy_and_csum_bits(
971					skb_prev, maxfraglen,
972					data + transhdrlen, fraggap, 0);
973				skb_prev->csum = csum_sub(skb_prev->csum,
974							  skb->csum);
975				data += fraggap;
976				pskb_trim_unique(skb_prev, maxfraglen);
977			}
978
979			copy = datalen - transhdrlen - fraggap;
980			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
981				err = -EFAULT;
982				kfree_skb(skb);
983				goto error;
984			}
985
986			offset += copy;
987			length -= datalen - fraggap;
988			transhdrlen = 0;
989			exthdrlen = 0;
990			csummode = CHECKSUM_NONE;
991
992			/*
993			 * Put the packet on the pending queue.
994			 */
995			__skb_queue_tail(&sk->sk_write_queue, skb);
996			continue;
997		}
998
999		if (copy > length)
1000			copy = length;
1001
1002		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1003			unsigned int off;
1004
1005			off = skb->len;
1006			if (getfrag(from, skb_put(skb, copy),
1007					offset, copy, off, skb) < 0) {
1008				__skb_trim(skb, off);
1009				err = -EFAULT;
1010				goto error;
1011			}
1012		} else {
1013			int i = skb_shinfo(skb)->nr_frags;
1014			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1015			struct page *page = sk->sk_sndmsg_page;
1016			int off = sk->sk_sndmsg_off;
1017			unsigned int left;
1018
1019			if (page && (left = PAGE_SIZE - off) > 0) {
1020				if (copy >= left)
1021					copy = left;
1022				if (page != frag->page) {
1023					if (i == MAX_SKB_FRAGS) {
1024						err = -EMSGSIZE;
1025						goto error;
1026					}
1027					get_page(page);
1028					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1029					frag = &skb_shinfo(skb)->frags[i];
1030				}
1031			} else if (i < MAX_SKB_FRAGS) {
1032				if (copy > PAGE_SIZE)
1033					copy = PAGE_SIZE;
1034				page = alloc_pages(sk->sk_allocation, 0);
1035				if (page == NULL)  {
1036					err = -ENOMEM;
1037					goto error;
1038				}
1039				sk->sk_sndmsg_page = page;
1040				sk->sk_sndmsg_off = 0;
1041
1042				skb_fill_page_desc(skb, i, page, 0, 0);
1043				frag = &skb_shinfo(skb)->frags[i];
1044			} else {
1045				err = -EMSGSIZE;
1046				goto error;
1047			}
1048			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1049				err = -EFAULT;
1050				goto error;
1051			}
1052			sk->sk_sndmsg_off += copy;
1053			frag->size += copy;
1054			skb->len += copy;
1055			skb->data_len += copy;
1056			skb->truesize += copy;
1057			atomic_add(copy, &sk->sk_wmem_alloc);
1058		}
1059		offset += copy;
1060		length -= copy;
1061	}
1062
1063	return 0;
1064
1065error:
1066	inet->cork.length -= length;
1067	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1068	return err;
1069}
1070
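/*
 * Append page data (e.g. from sendpage()/sendfile()) to the pending queue
 * without copying.  The socket must already be corked by an earlier
 * ip_append_data() call and the output device must support scatter-gather
 * (NETIF_F_SG).
 */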
1071ssize_t	ip_append_page(struct sock *sk, struct page *page,
1072		       int offset, size_t size, int flags)
1073{
1074	struct inet_sock *inet = inet_sk(sk);
1075	struct sk_buff *skb;
1076	struct rtable *rt;
1077	struct ip_options *opt = NULL;
1078	int hh_len;
1079	int mtu;
1080	int len;
1081	int err;
1082	unsigned int maxfraglen, fragheaderlen, fraggap;
1083
1084	if (inet->hdrincl)
1085		return -EPERM;
1086
1087	if (flags&MSG_PROBE)
1088		return 0;
1089
1090	if (skb_queue_empty(&sk->sk_write_queue))
1091		return -EINVAL;
1092
1093	rt = (struct rtable *)inet->cork.dst;
1094	if (inet->cork.flags & IPCORK_OPT)
1095		opt = inet->cork.opt;
1096
1097	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1098		return -EOPNOTSUPP;
1099
1100	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1101	mtu = inet->cork.fragsize;
1102
1103	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1104	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1105
1106	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1107		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1108		return -EMSGSIZE;
1109	}
1110
1111	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1112		return -EINVAL;
1113
1114	inet->cork.length += size;
1115	if ((sk->sk_protocol == IPPROTO_UDP) &&
1116	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1117		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1118		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1119	}
1120
1121
1122	while (size > 0) {
1123		int i;
1124
1125		if (skb_is_gso(skb))
1126			len = size;
1127		else {
1128
1129			/* Check if the remaining data fits into current packet. */
1130			len = mtu - skb->len;
1131			if (len < size)
1132				len = maxfraglen - skb->len;
1133		}
1134		if (len <= 0) {
1135			struct sk_buff *skb_prev;
1136			int alloclen;
1137
1138			skb_prev = skb;
1139			fraggap = skb_prev->len - maxfraglen;
1140
1141			alloclen = fragheaderlen + hh_len + fraggap + 15;
1142			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1143			if (unlikely(!skb)) {
1144				err = -ENOBUFS;
1145				goto error;
1146			}
1147
1148			/*
1149			 *	Fill in the control structures
1150			 */
1151			skb->ip_summed = CHECKSUM_NONE;
1152			skb->csum = 0;
1153			skb_reserve(skb, hh_len);
1154
1155			/*
1156			 *	Find where to start putting bytes.
1157			 */
1158			skb_put(skb, fragheaderlen + fraggap);
1159			skb_reset_network_header(skb);
1160			skb->transport_header = (skb->network_header +
1161						 fragheaderlen);
1162			if (fraggap) {
1163				skb->csum = skb_copy_and_csum_bits(skb_prev,
1164								   maxfraglen,
1165						    skb_transport_header(skb),
1166								   fraggap, 0);
1167				skb_prev->csum = csum_sub(skb_prev->csum,
1168							  skb->csum);
1169				pskb_trim_unique(skb_prev, maxfraglen);
1170			}
1171
1172			/*
1173			 * Put the packet on the pending queue.
1174			 */
1175			__skb_queue_tail(&sk->sk_write_queue, skb);
1176			continue;
1177		}
1178
1179		i = skb_shinfo(skb)->nr_frags;
1180		if (len > size)
1181			len = size;
1182		if (skb_can_coalesce(skb, i, page, offset)) {
1183			skb_shinfo(skb)->frags[i-1].size += len;
1184		} else if (i < MAX_SKB_FRAGS) {
1185			get_page(page);
1186			skb_fill_page_desc(skb, i, page, offset, len);
1187		} else {
1188			err = -EMSGSIZE;
1189			goto error;
1190		}
1191
1192		if (skb->ip_summed == CHECKSUM_NONE) {
1193			__wsum csum;
1194			csum = csum_page(page, offset, len);
1195			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1196		}
1197
1198		skb->len += len;
1199		skb->data_len += len;
1200		skb->truesize += len;
1201		atomic_add(len, &sk->sk_wmem_alloc);
1202		offset += len;
1203		size -= len;
1204	}
1205	return 0;
1206
1207error:
1208	inet->cork.length -= size;
1209	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1210	return err;
1211}
1212
1213static void ip_cork_release(struct inet_sock *inet)
1214{
1215	inet->cork.flags &= ~IPCORK_OPT;
1216	kfree(inet->cork.opt);
1217	inet->cork.opt = NULL;
1218	dst_release(inet->cork.dst);
1219	inet->cork.dst = NULL;
1220}
1221
1222/*
1223 *	Combine all pending IP fragments on the socket into one IP datagram
1224 *	and push them out.
1225 */
1226int ip_push_pending_frames(struct sock *sk)
1227{
1228	struct sk_buff *skb, *tmp_skb;
1229	struct sk_buff **tail_skb;
1230	struct inet_sock *inet = inet_sk(sk);
1231	struct net *net = sock_net(sk);
1232	struct ip_options *opt = NULL;
1233	struct rtable *rt = (struct rtable *)inet->cork.dst;
1234	struct iphdr *iph;
1235	__be16 df = 0;
1236	__u8 ttl;
1237	int err = 0;
1238
1239	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1240		goto out;
1241	tail_skb = &(skb_shinfo(skb)->frag_list);
1242
1243	/* move skb->data to ip header from ext header */
1244	if (skb->data < skb_network_header(skb))
1245		__skb_pull(skb, skb_network_offset(skb));
1246	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1247		__skb_pull(tmp_skb, skb_network_header_len(skb));
1248		*tail_skb = tmp_skb;
1249		tail_skb = &(tmp_skb->next);
1250		skb->len += tmp_skb->len;
1251		skb->data_len += tmp_skb->len;
1252		skb->truesize += tmp_skb->truesize;
1253		tmp_skb->destructor = NULL;
1254		tmp_skb->sk = NULL;
1255	}
1256
1257	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
1258	 * allow fragmenting the frame generated here. No matter how transforms
1259	 * change the size of the packet, it will still go out.
1260	 */
1261	if (inet->pmtudisc < IP_PMTUDISC_DO)
1262		skb->local_df = 1;
1263
1264	/* DF bit is set when we want to see DF on outgoing frames.
1265	 * If local_df is also set, we still allow this frame to be
1266	 * fragmented locally. */
1267	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1268	    (skb->len <= dst_mtu(&rt->u.dst) &&
1269	     ip_dont_fragment(sk, &rt->u.dst)))
1270		df = htons(IP_DF);
1271
1272	if (inet->cork.flags & IPCORK_OPT)
1273		opt = inet->cork.opt;
1274
1275	if (rt->rt_type == RTN_MULTICAST)
1276		ttl = inet->mc_ttl;
1277	else
1278		ttl = ip_select_ttl(inet, &rt->u.dst);
1279
1280	iph = (struct iphdr *)skb->data;
1281	iph->version = 4;
1282	iph->ihl = 5;
1283	if (opt) {
1284		iph->ihl += opt->optlen>>2;
1285		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1286	}
1287	iph->tos = inet->tos;
1288	iph->frag_off = df;
1289	ip_select_ident(iph, &rt->u.dst, sk);
1290	iph->ttl = ttl;
1291	iph->protocol = sk->sk_protocol;
1292	iph->saddr = rt->rt_src;
1293	iph->daddr = rt->rt_dst;
1294
1295	skb->priority = sk->sk_priority;
1296	skb->mark = sk->sk_mark;
1297	/*
1298	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1299	 * on dst refcount
1300	 */
1301	inet->cork.dst = NULL;
1302	skb_dst_set(skb, &rt->u.dst);
1303
1304	if (iph->protocol == IPPROTO_ICMP)
1305		icmp_out_count(net, ((struct icmphdr *)
1306			skb_transport_header(skb))->type);
1307
1308	/* Netfilter gets the whole, not yet fragmented skb. */
1309	err = ip_local_out(skb);
1310	if (err) {
1311		if (err > 0)
1312			err = net_xmit_errno(err);
1313		if (err)
1314			goto error;
1315	}
1316
1317out:
1318	ip_cork_release(inet);
1319	return err;
1320
1321error:
1322	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1323	goto out;
1324}
1325
1326/*
1327 *	Throw away all pending data on the socket.
1328 */
1329void ip_flush_pending_frames(struct sock *sk)
1330{
1331	struct sk_buff *skb;
1332
1333	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1334		kfree_skb(skb);
1335
1336	ip_cork_release(inet_sk(sk));
1337}
1338
1339
1340/*
1341 *	Fetch data from kernel space and fill in checksum if needed.
1342 */
1343static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1344			      int len, int odd, struct sk_buff *skb)
1345{
1346	__wsum csum;
1347
1348	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1349	skb->csum = csum_block_add(skb->csum, csum, odd);
1350	return 0;
1351}
1352
1353/*
1354 *	Generic function to send a packet as a reply to another packet.
1355 *	Used to send TCP resets so far. ICMP should use this function too.
1356 *
1357 *	Should run single threaded per socket because it uses the sock
1358 *     	structure to pass arguments.
1359 */
1360void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1361		   unsigned int len)
1362{
1363	struct inet_sock *inet = inet_sk(sk);
1364	struct {
1365		struct ip_options	opt;
1366		char			data[40];
1367	} replyopts;
1368	struct ipcm_cookie ipc;
1369	__be32 daddr;
1370	struct rtable *rt = skb_rtable(skb);
1371
1372	if (ip_options_echo(&replyopts.opt, skb))
1373		return;
1374
1375	daddr = ipc.addr = rt->rt_src;
1376	ipc.opt = NULL;
1377	ipc.shtx.flags = 0;
1378
1379	if (replyopts.opt.optlen) {
1380		ipc.opt = &replyopts.opt;
1381
1382		if (ipc.opt->srr)
1383			daddr = replyopts.opt.faddr;
1384	}
1385
1386	{
1387		struct flowi fl = { .oif = arg->bound_dev_if,
1388				    .nl_u = { .ip4_u =
1389					      { .daddr = daddr,
1390						.saddr = rt->rt_spec_dst,
1391						.tos = RT_TOS(ip_hdr(skb)->tos) } },
1392				    /* Not quite clean, but right. */
1393				    .uli_u = { .ports =
1394					       { .sport = tcp_hdr(skb)->dest,
1395						 .dport = tcp_hdr(skb)->source } },
1396				    .proto = sk->sk_protocol,
1397				    .flags = ip_reply_arg_flowi_flags(arg) };
1398		security_skb_classify_flow(skb, &fl);
1399		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1400			return;
1401	}
1402
1403	/* And let IP do all the hard work.
1404
1405	   This chunk is not reentrant, hence the spinlock.
1406	   Note that it relies on the fact that this function is called
1407	   with BHs locally disabled and that sk cannot already be locked.
1408	 */
1409	bh_lock_sock(sk);
1410	inet->tos = ip_hdr(skb)->tos;
1411	sk->sk_priority = skb->priority;
1412	sk->sk_protocol = ip_hdr(skb)->protocol;
1413	sk->sk_bound_dev_if = arg->bound_dev_if;
1414	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1415		       &ipc, &rt, MSG_DONTWAIT);
1416	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1417		if (arg->csumoffset >= 0)
1418			*((__sum16 *)skb_transport_header(skb) +
1419			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1420								arg->csum));
1421		skb->ip_summed = CHECKSUM_NONE;
1422		ip_push_pending_frames(sk);
1423	}
1424
1425	bh_unlock_sock(sk);
1426
1427	ip_rt_put(rt);
1428}
1429
1430void __init ip_init(void)
1431{
1432	ip_rt_init();
1433	inet_initpeers();
1434
1435#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1436	igmp_mc_proc_init();
1437#endif
1438}
1439
1440EXPORT_SYMBOL(ip_generic_getfrag);
1441EXPORT_SYMBOL(ip_queue_xmit);
1442EXPORT_SYMBOL(ip_send_check);
1443