ip_output.c revision 914a9ab386a288d0f22252fc268ecbc048cdcbd5
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case the packet is not accepted
27 *					by output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84
85/* Generate a checksum for an outgoing IP datagram. */
86__inline__ void ip_send_check(struct iphdr *iph)
87{
88	iph->check = 0;
89	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
90}
91
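/*
 * Fill in the IP header fields that are only known once the packet is
 * complete (tot_len and the header checksum) and pass the skb to the
 * NF_INET_LOCAL_OUT netfilter hook, with dst_output() as the
 * continuation function.
 */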
92int __ip_local_out(struct sk_buff *skb)
93{
94	struct iphdr *iph = ip_hdr(skb);
95
96	iph->tot_len = htons(skb->len);
97	ip_send_check(iph);
98	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
99		       dst_output);
100}
101
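/*
 * nf_hook() returns 1 when the packet was accepted and neither queued
 * nor stolen by netfilter; in that case the caller must invoke
 * dst_output() itself.  Any other verdict is passed back unchanged.
 */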
102int ip_local_out(struct sk_buff *skb)
103{
104	int err;
105
106	err = __ip_local_out(skb);
107	if (likely(err == 1))
108		err = dst_output(skb);
109
110	return err;
111}
112EXPORT_SYMBOL_GPL(ip_local_out);
113
114/* dev_loopback_xmit for use with netfilter. */
115static int ip_dev_loopback_xmit(struct sk_buff *newskb)
116{
117	skb_reset_mac_header(newskb);
118	__skb_pull(newskb, skb_network_offset(newskb));
119	newskb->pkt_type = PACKET_LOOPBACK;
120	newskb->ip_summed = CHECKSUM_UNNECESSARY;
121	WARN_ON(!skb_dst(newskb));
122	netif_rx(newskb);
123	return 0;
124}
125
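/* Use the per-socket unicast TTL if set, else the route's hop-limit metric. */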
126static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
127{
128	int ttl = inet->uc_ttl;
129
130	if (ttl < 0)
131		ttl = dst_metric(dst, RTAX_HOPLIMIT);
132	return ttl;
133}
134
135/*
136 *		Add an ip header to a skbuff and send it out.
137 *
138 */
139int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
140			  __be32 saddr, __be32 daddr, struct ip_options *opt)
141{
142	struct inet_sock *inet = inet_sk(sk);
143	struct rtable *rt = skb_rtable(skb);
144	struct iphdr *iph;
145
146	/* Build the IP header. */
147	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
148	skb_reset_network_header(skb);
149	iph = ip_hdr(skb);
150	iph->version  = 4;
151	iph->ihl      = 5;
152	iph->tos      = inet->tos;
153	if (ip_dont_fragment(sk, &rt->u.dst))
154		iph->frag_off = htons(IP_DF);
155	else
156		iph->frag_off = 0;
157	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
158	iph->daddr    = rt->rt_dst;
159	iph->saddr    = rt->rt_src;
160	iph->protocol = sk->sk_protocol;
161	ip_select_ident(iph, &rt->u.dst, sk);
162
163	if (opt && opt->optlen) {
164		iph->ihl += opt->optlen>>2;
165		ip_options_build(skb, opt, daddr, rt, 0);
166	}
167
168	skb->priority = sk->sk_priority;
169	skb->mark = sk->sk_mark;
170
171	/* Send it out. */
172	return ip_local_out(skb);
173}
174
175EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
176
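/*
 * Last step before handing the skb to the device: account multicast and
 * broadcast output statistics, make sure there is enough headroom for
 * the link-layer header, then emit the packet through the cached hard
 * header (dst->hh) or the neighbour's output function.
 */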
177static inline int ip_finish_output2(struct sk_buff *skb)
178{
179	struct dst_entry *dst = skb_dst(skb);
180	struct rtable *rt = (struct rtable *)dst;
181	struct net_device *dev = dst->dev;
182	unsigned int hh_len = LL_RESERVED_SPACE(dev);
183
184	if (rt->rt_type == RTN_MULTICAST) {
185		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
186	} else if (rt->rt_type == RTN_BROADCAST)
187		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
188
189	/* Be paranoid, rather than too clever. */
190	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
191		struct sk_buff *skb2;
192
193		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
194		if (skb2 == NULL) {
195			kfree_skb(skb);
196			return -ENOMEM;
197		}
198		if (skb->sk)
199			skb_set_owner_w(skb2, skb->sk);
200		kfree_skb(skb);
201		skb = skb2;
202	}
203
204	if (dst->hh)
205		return neigh_hh_output(dst->hh, skb);
206	else if (dst->neighbour)
207		return dst->neighbour->output(skb);
208
209	if (net_ratelimit())
210		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
211	kfree_skb(skb);
212	return -EINVAL;
213}
214
215static inline int ip_skb_dst_mtu(struct sk_buff *skb)
216{
217	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
218
219	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
220	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
221}
222
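/*
 * POST_ROUTING continuation: re-run dst_output() if policy lookup after
 * SNAT attached a new xfrm bundle, fragment the packet when it exceeds
 * the path MTU and is not GSO, otherwise deliver it via
 * ip_finish_output2().
 */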
223static int ip_finish_output(struct sk_buff *skb)
224{
225#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
226	/* Policy lookup after SNAT yielded a new policy */
227	if (skb_dst(skb)->xfrm != NULL) {
228		IPCB(skb)->flags |= IPSKB_REROUTED;
229		return dst_output(skb);
230	}
231#endif
232	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
233		return ip_fragment(skb, ip_finish_output2);
234	else
235		return ip_finish_output2(skb);
236}
237
238int ip_mc_output(struct sk_buff *skb)
239{
240	struct sock *sk = skb->sk;
241	struct rtable *rt = skb_rtable(skb);
242	struct net_device *dev = rt->u.dst.dev;
243
244	/*
245	 *	If the indicated interface is up and running, send the packet.
246	 */
247	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
248
249	skb->dev = dev;
250	skb->protocol = htons(ETH_P_IP);
251
252	/*
253	 *	Multicasts are looped back for other local users
254	 */
255
256	if (rt->rt_flags&RTCF_MULTICAST) {
257		if ((!sk || inet_sk(sk)->mc_loop)
258#ifdef CONFIG_IP_MROUTE
259		/* Small optimization: do not loop back non-local frames
260		   that were returned after forwarding; they will be dropped
261		   by ip_mr_input in any case.
262		   Note that local frames are looped back to be delivered
263		   to local recipients.
264
265		   This check is duplicated in ip_mr_input at the moment.
266		 */
267		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
268#endif
269		) {
270			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
271			if (newskb)
272				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
273					NULL, newskb->dev,
274					ip_dev_loopback_xmit);
275		}
276
277		/* Multicasts with ttl 0 must not go beyond the host */
278
279		if (ip_hdr(skb)->ttl == 0) {
280			kfree_skb(skb);
281			return 0;
282		}
283	}
284
285	if (rt->rt_flags&RTCF_BROADCAST) {
286		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
287		if (newskb)
288			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
289				newskb->dev, ip_dev_loopback_xmit);
290	}
291
292	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
293			    ip_finish_output,
294			    !(IPCB(skb)->flags & IPSKB_REROUTED));
295}
296
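/*
 * dst_output() handler for locally generated unicast packets: update the
 * output statistics and run the NF_INET_POST_ROUTING hook with
 * ip_finish_output() as the continuation.  Skbs marked IPSKB_REROUTED
 * have already been through POST_ROUTING and skip the hook.
 */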
297int ip_output(struct sk_buff *skb)
298{
299	struct net_device *dev = skb_dst(skb)->dev;
300
301	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
302
303	skb->dev = dev;
304	skb->protocol = htons(ETH_P_IP);
305
306	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
307			    ip_finish_output,
308			    !(IPCB(skb)->flags & IPSKB_REROUTED));
309}
310
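/*
 * Main transmit path for connection-oriented transports such as TCP:
 * reuse the route already attached to the skb or cached on the socket,
 * perform a routing lookup if neither exists, build the IP header and
 * hand the packet to ip_local_out().
 */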
311int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
312{
313	struct sock *sk = skb->sk;
314	struct inet_sock *inet = inet_sk(sk);
315	struct ip_options *opt = inet->opt;
316	struct rtable *rt;
317	struct iphdr *iph;
318
319	/* Skip all of this if the packet is already routed,
320	 * e.g. by something like SCTP.
321	 */
322	rt = skb_rtable(skb);
323	if (rt != NULL)
324		goto packet_routed;
325
326	/* Make sure we can route this packet. */
327	rt = (struct rtable *)__sk_dst_check(sk, 0);
328	if (rt == NULL) {
329		__be32 daddr;
330
331		/* Use correct destination address if we have options. */
332		daddr = inet->daddr;
333		if(opt && opt->srr)
334			daddr = opt->faddr;
335
336		{
337			struct flowi fl = { .oif = sk->sk_bound_dev_if,
338					    .mark = sk->sk_mark,
339					    .nl_u = { .ip4_u =
340						      { .daddr = daddr,
341							.saddr = inet->saddr,
342							.tos = RT_CONN_FLAGS(sk) } },
343					    .proto = sk->sk_protocol,
344					    .flags = inet_sk_flowi_flags(sk),
345					    .uli_u = { .ports =
346						       { .sport = inet->sport,
347							 .dport = inet->dport } } };
348
349			/* If this fails, the transport layer's retransmit
350			 * mechanism will keep trying until the route appears
351			 * or the connection times out.
352			 */
353			security_sk_classify_flow(sk, &fl);
354			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
355				goto no_route;
356		}
357		sk_setup_caps(sk, &rt->u.dst);
358	}
359	skb_dst_set(skb, dst_clone(&rt->u.dst));
360
361packet_routed:
362	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
363		goto no_route;
364
365	/* OK, we know where to send it, allocate and build IP header. */
366	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
367	skb_reset_network_header(skb);
368	iph = ip_hdr(skb);
369	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
370	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
371		iph->frag_off = htons(IP_DF);
372	else
373		iph->frag_off = 0;
374	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
375	iph->protocol = sk->sk_protocol;
376	iph->saddr    = rt->rt_src;
377	iph->daddr    = rt->rt_dst;
378	/* Transport layer set skb->h.foo itself. */
379
380	if (opt && opt->optlen) {
381		iph->ihl += opt->optlen >> 2;
382		ip_options_build(skb, opt, inet->daddr, rt, 0);
383	}
384
385	ip_select_ident_more(iph, &rt->u.dst, sk,
386			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
387
388	skb->priority = sk->sk_priority;
389	skb->mark = sk->sk_mark;
390
391	return ip_local_out(skb);
392
393no_route:
394	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
395	kfree_skb(skb);
396	return -EHOSTUNREACH;
397}
398
399
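/*
 * Propagate per-packet metadata (dst, priority, mark, netfilter and
 * traffic-control state, ...) from the original skb to a newly built
 * fragment.
 */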
400static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
401{
402	to->pkt_type = from->pkt_type;
403	to->priority = from->priority;
404	to->protocol = from->protocol;
405	skb_dst_drop(to);
406	skb_dst_set(to, dst_clone(skb_dst(from)));
407	to->dev = from->dev;
408	to->mark = from->mark;
409
410	/* Copy the flags to each fragment. */
411	IPCB(to)->flags = IPCB(from)->flags;
412
413#ifdef CONFIG_NET_SCHED
414	to->tc_index = from->tc_index;
415#endif
416	nf_copy(to, from);
417#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
418    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
419	to->nf_trace = from->nf_trace;
420#endif
421#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
422	to->ipvs_property = from->ipvs_property;
423#endif
424	skb_copy_secmark(to, from);
425}
426
427/*
428 *	This IP datagram is too large to be sent in one piece.  Break it up
429 *	into smaller pieces (each the size of the IP header plus a block of
430 *	the original datagram's data) that still fit in a single device
431 *	frame, and queue such frames for sending.
432 */
433
434int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
435{
436	struct iphdr *iph;
437	int raw = 0;
438	int ptr;
439	struct net_device *dev;
440	struct sk_buff *skb2;
441	unsigned int mtu, hlen, left, len, ll_rs, pad;
442	int offset;
443	__be16 not_last_frag;
444	struct rtable *rt = skb_rtable(skb);
445	int err = 0;
446
447	dev = rt->u.dst.dev;
448
449	/*
450	 *	Point into the IP datagram header.
451	 */
452
453	iph = ip_hdr(skb);
454
455	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
456		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
457		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
458			  htonl(ip_skb_dst_mtu(skb)));
459		kfree_skb(skb);
460		return -EMSGSIZE;
461	}
462
463	/*
464	 *	Setup starting values.
465	 */
466
467	hlen = iph->ihl * 4;
468	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
469	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
470
471	/* When a frag_list is given, use it. First, check its validity:
472	 * some transformers could create a wrong frag_list or break an
473	 * existing one; that is not prohibited. In that case fall back to copying.
474	 *
475	 * LATER: this step can be merged into the real generation of fragments;
476	 * we can switch to copying when we see the first bad fragment.
477	 */
478	if (skb_has_frags(skb)) {
479		struct sk_buff *frag;
480		int first_len = skb_pagelen(skb);
481		int truesizes = 0;
482
483		if (first_len - hlen > mtu ||
484		    ((first_len - hlen) & 7) ||
485		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
486		    skb_cloned(skb))
487			goto slow_path;
488
489		skb_walk_frags(skb, frag) {
490			/* Correct geometry. */
491			if (frag->len > mtu ||
492			    ((frag->len & 7) && frag->next) ||
493			    skb_headroom(frag) < hlen)
494			    goto slow_path;
495
496			/* Partially cloned skb? */
497			if (skb_shared(frag))
498				goto slow_path;
499
500			BUG_ON(frag->sk);
501			if (skb->sk) {
502				frag->sk = skb->sk;
503				frag->destructor = sock_wfree;
504				truesizes += frag->truesize;
505			}
506		}
507
508		/* Everything is OK. Generate! */
509
510		err = 0;
511		offset = 0;
512		frag = skb_shinfo(skb)->frag_list;
513		skb_frag_list_init(skb);
514		skb->data_len = first_len - skb_headlen(skb);
515		skb->truesize -= truesizes;
516		skb->len = first_len;
517		iph->tot_len = htons(first_len);
518		iph->frag_off = htons(IP_MF);
519		ip_send_check(iph);
520
521		for (;;) {
522			/* Prepare the header of the next frame
523			 * before the previous one goes down. */
524			if (frag) {
525				frag->ip_summed = CHECKSUM_NONE;
526				skb_reset_transport_header(frag);
527				__skb_push(frag, hlen);
528				skb_reset_network_header(frag);
529				memcpy(skb_network_header(frag), iph, hlen);
530				iph = ip_hdr(frag);
531				iph->tot_len = htons(frag->len);
532				ip_copy_metadata(frag, skb);
533				if (offset == 0)
534					ip_options_fragment(frag);
535				offset += skb->len - hlen;
536				iph->frag_off = htons(offset>>3);
537				if (frag->next != NULL)
538					iph->frag_off |= htons(IP_MF);
539				/* Ready, complete checksum */
540				ip_send_check(iph);
541			}
542
543			err = output(skb);
544
545			if (!err)
546				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
547			if (err || !frag)
548				break;
549
550			skb = frag;
551			frag = skb->next;
552			skb->next = NULL;
553		}
554
555		if (err == 0) {
556			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
557			return 0;
558		}
559
560		while (frag) {
561			skb = frag->next;
562			kfree_skb(frag);
563			frag = skb;
564		}
565		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
566		return err;
567	}
568
569slow_path:
570	left = skb->len - hlen;		/* Space per frame */
571	ptr = raw + hlen;		/* Where to start from */
572
573	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
574	 * we need to make room for the encapsulating header.
575	 */
576	pad = nf_bridge_pad(skb);
577	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
578	mtu -= pad;
579
580	/*
581	 *	Fragment the datagram.
582	 */
583
584	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
585	not_last_frag = iph->frag_off & htons(IP_MF);
586
587	/*
588	 *	Keep copying data until we run out.
589	 */
590
591	while (left > 0) {
592		len = left;
593		/* IF: it doesn't fit, use 'mtu' - the data space left */
594		if (len > mtu)
595			len = mtu;
596		/* IF: we are not sending up to and including the packet end
597		   then align the next start on an eight byte boundary */
598		if (len < left)	{
599			len &= ~7;
600		}
601		/*
602		 *	Allocate buffer.
603		 */
604
605		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
606			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
607			err = -ENOMEM;
608			goto fail;
609		}
610
611		/*
612		 *	Set up data on packet
613		 */
614
615		ip_copy_metadata(skb2, skb);
616		skb_reserve(skb2, ll_rs);
617		skb_put(skb2, len + hlen);
618		skb_reset_network_header(skb2);
619		skb2->transport_header = skb2->network_header + hlen;
620
621		/*
622		 *	Charge the memory for the fragment to any owner
623		 *	it might possess
624		 */
625
626		if (skb->sk)
627			skb_set_owner_w(skb2, skb->sk);
628
629		/*
630		 *	Copy the packet header into the new buffer.
631		 */
632
633		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
634
635		/*
636		 *	Copy a block of the IP datagram.
637		 */
638		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
639			BUG();
640		left -= len;
641
642		/*
643		 *	Fill in the new header fields.
644		 */
645		iph = ip_hdr(skb2);
646		iph->frag_off = htons((offset >> 3));
647
648		/* ANK: dirty, but effective trick. Upgrade options only if
649		 * the segment to be fragmented was THE FIRST (otherwise,
650		 * options are already fixed) and make it ONCE
651		 * on the initial skb, so that all the following fragments
652		 * will inherit fixed options.
653		 */
654		if (offset == 0)
655			ip_options_fragment(skb);
656
657		/*
658		 *	Added AC : If we are fragmenting a fragment that's not the
659		 *		   last fragment then keep the MF bit set on each fragment
660		 */
661		if (left > 0 || not_last_frag)
662			iph->frag_off |= htons(IP_MF);
663		ptr += len;
664		offset += len;
665
666		/*
667		 *	Put this fragment into the sending queue.
668		 */
669		iph->tot_len = htons(len + hlen);
670
671		ip_send_check(iph);
672
673		err = output(skb2);
674		if (err)
675			goto fail;
676
677		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
678	}
679	kfree_skb(skb);
680	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
681	return err;
682
683fail:
684	kfree_skb(skb);
685	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
686	return err;
687}
688
689EXPORT_SYMBOL(ip_fragment);
690
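/*
 * Default getfrag callback for ip_append_data(): copy len bytes from the
 * user iovec into the skb, folding a partial checksum into skb->csum
 * unless the device will checksum the packet (CHECKSUM_PARTIAL).
 */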
691int
692ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
693{
694	struct iovec *iov = from;
695
696	if (skb->ip_summed == CHECKSUM_PARTIAL) {
697		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
698			return -EFAULT;
699	} else {
700		__wsum csum = 0;
701		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
702			return -EFAULT;
703		skb->csum = csum_block_add(skb->csum, csum, odd);
704	}
705	return 0;
706}
707
708static inline __wsum
709csum_page(struct page *page, int offset, int copy)
710{
711	char *kaddr;
712	__wsum csum;
713	kaddr = kmap(page);
714	csum = csum_partial(kaddr + offset, copy, 0);
715	kunmap(page);
716	return csum;
717}
718
719static inline int ip_ufo_append_data(struct sock *sk,
720			int getfrag(void *from, char *to, int offset, int len,
721			       int odd, struct sk_buff *skb),
722			void *from, int length, int hh_len, int fragheaderlen,
723			int transhdrlen, int mtu, unsigned int flags)
724{
725	struct sk_buff *skb;
726	int err;
727
728	/* There is support for UDP fragmentation offload by the network
729	 * device, so create one single skb containing the complete
730	 * UDP datagram.
731	 */
732	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
733		skb = sock_alloc_send_skb(sk,
734			hh_len + fragheaderlen + transhdrlen + 20,
735			(flags & MSG_DONTWAIT), &err);
736
737		if (skb == NULL)
738			return err;
739
740		/* reserve space for Hardware header */
741		skb_reserve(skb, hh_len);
742
743		/* create space for UDP/IP header */
744		skb_put(skb, fragheaderlen + transhdrlen);
745
746		/* initialize network header pointer */
747		skb_reset_network_header(skb);
748
749		/* initialize protocol header pointer */
750		skb->transport_header = skb->network_header + fragheaderlen;
751
752		skb->ip_summed = CHECKSUM_PARTIAL;
753		skb->csum = 0;
754		sk->sk_sndmsg_off = 0;
755
756		/* specify the length of each IP datagram fragment */
757		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
758		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
759		__skb_queue_tail(&sk->sk_write_queue, skb);
760	}
761
762	return skb_append_datato_frags(sk, skb, getfrag, from,
763				       (length - transhdrlen));
764}
765
766/*
767 *	ip_append_data() and ip_append_page() can make one large IP datagram
768 *	from many pieces of data. Each piece will be held on the socket
769 *	until ip_push_pending_frames() is called. Each piece can be a page
770 *	or non-page data.
771 *
772 *	Not only UDP; other transport protocols - e.g. raw sockets - can
773 *	potentially use this interface as well (see the usage sketch below).
774 *
775 *	LATER: length must be adjusted by pad at tail, when it is required.
776 */
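/*
 * A rough usage sketch of the corking interface (UDP-style send; the
 * caller-side names such as 'msg', 'ipc' and 'rt' and the preceding
 * route/cookie setup are assumed here, not taken from this file):
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */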
777int ip_append_data(struct sock *sk,
778		   int getfrag(void *from, char *to, int offset, int len,
779			       int odd, struct sk_buff *skb),
780		   void *from, int length, int transhdrlen,
781		   struct ipcm_cookie *ipc, struct rtable **rtp,
782		   unsigned int flags)
783{
784	struct inet_sock *inet = inet_sk(sk);
785	struct sk_buff *skb;
786
787	struct ip_options *opt = NULL;
788	int hh_len;
789	int exthdrlen;
790	int mtu;
791	int copy;
792	int err;
793	int offset = 0;
794	unsigned int maxfraglen, fragheaderlen;
795	int csummode = CHECKSUM_NONE;
796	struct rtable *rt;
797
798	if (flags&MSG_PROBE)
799		return 0;
800
801	if (skb_queue_empty(&sk->sk_write_queue)) {
802		/*
803		 * setup for corking.
804		 */
805		opt = ipc->opt;
806		if (opt) {
807			if (inet->cork.opt == NULL) {
808				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
809				if (unlikely(inet->cork.opt == NULL))
810					return -ENOBUFS;
811			}
812			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
813			inet->cork.flags |= IPCORK_OPT;
814			inet->cork.addr = ipc->addr;
815		}
816		rt = *rtp;
817		if (unlikely(!rt))
818			return -EFAULT;
819		/*
820		 * We steal reference to this route, caller should not release it
821		 */
822		*rtp = NULL;
823		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
824					    rt->u.dst.dev->mtu :
825					    dst_mtu(rt->u.dst.path);
826		inet->cork.dst = &rt->u.dst;
827		inet->cork.length = 0;
828		sk->sk_sndmsg_page = NULL;
829		sk->sk_sndmsg_off = 0;
830		if ((exthdrlen = rt->u.dst.header_len) != 0) {
831			length += exthdrlen;
832			transhdrlen += exthdrlen;
833		}
834	} else {
835		rt = (struct rtable *)inet->cork.dst;
836		if (inet->cork.flags & IPCORK_OPT)
837			opt = inet->cork.opt;
838
839		transhdrlen = 0;
840		exthdrlen = 0;
841		mtu = inet->cork.fragsize;
842	}
843	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
844
845	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
846	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
847
848	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
849		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
850		return -EMSGSIZE;
851	}
852
853	/*
854	 * transhdrlen > 0 means that this is the first fragment and we expect
855	 * it will not be fragmented later.
856	 */
857	if (transhdrlen &&
858	    length + fragheaderlen <= mtu &&
859	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
860	    !exthdrlen)
861		csummode = CHECKSUM_PARTIAL;
862
863	inet->cork.length += length;
864	if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
865	    (sk->sk_protocol == IPPROTO_UDP) &&
866	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
867		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
868					 fragheaderlen, transhdrlen, mtu,
869					 flags);
870		if (err)
871			goto error;
872		return 0;
873	}
874
875	/* So, what's going on in the loop below?
876	 *
877	 * We use the calculated fragment length to generate a chain of skbs;
878	 * each segment is an IP fragment ready to be sent to the network once
879	 * the appropriate IP header has been added.
880	 */
881
882	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
883		goto alloc_new_skb;
884
885	while (length > 0) {
886		/* Check if the remaining data fits into current packet. */
887		copy = mtu - skb->len;
888		if (copy < length)
889			copy = maxfraglen - skb->len;
890		if (copy <= 0) {
891			char *data;
892			unsigned int datalen;
893			unsigned int fraglen;
894			unsigned int fraggap;
895			unsigned int alloclen;
896			struct sk_buff *skb_prev;
897alloc_new_skb:
898			skb_prev = skb;
899			if (skb_prev)
900				fraggap = skb_prev->len - maxfraglen;
901			else
902				fraggap = 0;
903
904			/*
905			 * If remaining data exceeds the mtu,
906			 * we know we need more fragment(s).
907			 */
908			datalen = length + fraggap;
909			if (datalen > mtu - fragheaderlen)
910				datalen = maxfraglen - fragheaderlen;
911			fraglen = datalen + fragheaderlen;
912
913			if ((flags & MSG_MORE) &&
914			    !(rt->u.dst.dev->features&NETIF_F_SG))
915				alloclen = mtu;
916			else
917				alloclen = datalen + fragheaderlen;
918
919			/* The last fragment gets additional space at tail.
920			 * Note, with MSG_MORE we overallocate on fragments,
921			 * because we have no idea which fragment will be
922			 * the last.
923			 */
924			if (datalen == length + fraggap)
925				alloclen += rt->u.dst.trailer_len;
926
927			if (transhdrlen) {
928				skb = sock_alloc_send_skb(sk,
929						alloclen + hh_len + 15,
930						(flags & MSG_DONTWAIT), &err);
931			} else {
932				skb = NULL;
933				if (atomic_read(&sk->sk_wmem_alloc) <=
934				    2 * sk->sk_sndbuf)
935					skb = sock_wmalloc(sk,
936							   alloclen + hh_len + 15, 1,
937							   sk->sk_allocation);
938				if (unlikely(skb == NULL))
939					err = -ENOBUFS;
940				else
941					/* only the initial fragment is
942					   time stamped */
943					ipc->shtx.flags = 0;
944			}
945			if (skb == NULL)
946				goto error;
947
948			/*
949			 *	Fill in the control structures
950			 */
951			skb->ip_summed = csummode;
952			skb->csum = 0;
953			skb_reserve(skb, hh_len);
954			*skb_tx(skb) = ipc->shtx;
955
956			/*
957			 *	Find where to start putting bytes.
958			 */
959			data = skb_put(skb, fraglen);
960			skb_set_network_header(skb, exthdrlen);
961			skb->transport_header = (skb->network_header +
962						 fragheaderlen);
963			data += fragheaderlen;
964
965			if (fraggap) {
966				skb->csum = skb_copy_and_csum_bits(
967					skb_prev, maxfraglen,
968					data + transhdrlen, fraggap, 0);
969				skb_prev->csum = csum_sub(skb_prev->csum,
970							  skb->csum);
971				data += fraggap;
972				pskb_trim_unique(skb_prev, maxfraglen);
973			}
974
975			copy = datalen - transhdrlen - fraggap;
976			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
977				err = -EFAULT;
978				kfree_skb(skb);
979				goto error;
980			}
981
982			offset += copy;
983			length -= datalen - fraggap;
984			transhdrlen = 0;
985			exthdrlen = 0;
986			csummode = CHECKSUM_NONE;
987
988			/*
989			 * Put the packet on the pending queue.
990			 */
991			__skb_queue_tail(&sk->sk_write_queue, skb);
992			continue;
993		}
994
995		if (copy > length)
996			copy = length;
997
998		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
999			unsigned int off;
1000
1001			off = skb->len;
1002			if (getfrag(from, skb_put(skb, copy),
1003					offset, copy, off, skb) < 0) {
1004				__skb_trim(skb, off);
1005				err = -EFAULT;
1006				goto error;
1007			}
1008		} else {
1009			int i = skb_shinfo(skb)->nr_frags;
1010			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1011			struct page *page = sk->sk_sndmsg_page;
1012			int off = sk->sk_sndmsg_off;
1013			unsigned int left;
1014
1015			if (page && (left = PAGE_SIZE - off) > 0) {
1016				if (copy >= left)
1017					copy = left;
1018				if (page != frag->page) {
1019					if (i == MAX_SKB_FRAGS) {
1020						err = -EMSGSIZE;
1021						goto error;
1022					}
1023					get_page(page);
1024					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1025					frag = &skb_shinfo(skb)->frags[i];
1026				}
1027			} else if (i < MAX_SKB_FRAGS) {
1028				if (copy > PAGE_SIZE)
1029					copy = PAGE_SIZE;
1030				page = alloc_pages(sk->sk_allocation, 0);
1031				if (page == NULL)  {
1032					err = -ENOMEM;
1033					goto error;
1034				}
1035				sk->sk_sndmsg_page = page;
1036				sk->sk_sndmsg_off = 0;
1037
1038				skb_fill_page_desc(skb, i, page, 0, 0);
1039				frag = &skb_shinfo(skb)->frags[i];
1040			} else {
1041				err = -EMSGSIZE;
1042				goto error;
1043			}
1044			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1045				err = -EFAULT;
1046				goto error;
1047			}
1048			sk->sk_sndmsg_off += copy;
1049			frag->size += copy;
1050			skb->len += copy;
1051			skb->data_len += copy;
1052			skb->truesize += copy;
1053			atomic_add(copy, &sk->sk_wmem_alloc);
1054		}
1055		offset += copy;
1056		length -= copy;
1057	}
1058
1059	return 0;
1060
1061error:
1062	inet->cork.length -= length;
1063	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1064	return err;
1065}
1066
1067ssize_t	ip_append_page(struct sock *sk, struct page *page,
1068		       int offset, size_t size, int flags)
1069{
1070	struct inet_sock *inet = inet_sk(sk);
1071	struct sk_buff *skb;
1072	struct rtable *rt;
1073	struct ip_options *opt = NULL;
1074	int hh_len;
1075	int mtu;
1076	int len;
1077	int err;
1078	unsigned int maxfraglen, fragheaderlen, fraggap;
1079
1080	if (inet->hdrincl)
1081		return -EPERM;
1082
1083	if (flags&MSG_PROBE)
1084		return 0;
1085
1086	if (skb_queue_empty(&sk->sk_write_queue))
1087		return -EINVAL;
1088
1089	rt = (struct rtable *)inet->cork.dst;
1090	if (inet->cork.flags & IPCORK_OPT)
1091		opt = inet->cork.opt;
1092
1093	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1094		return -EOPNOTSUPP;
1095
1096	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1097	mtu = inet->cork.fragsize;
1098
1099	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1100	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1101
1102	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1103		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1104		return -EMSGSIZE;
1105	}
1106
1107	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1108		return -EINVAL;
1109
1110	inet->cork.length += size;
1111	if ((sk->sk_protocol == IPPROTO_UDP) &&
1112	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1113		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1114		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1115	}
1116
1117
1118	while (size > 0) {
1119		int i;
1120
1121		if (skb_is_gso(skb))
1122			len = size;
1123		else {
1124
1125			/* Check if the remaining data fits into current packet. */
1126			len = mtu - skb->len;
1127			if (len < size)
1128				len = maxfraglen - skb->len;
1129		}
1130		if (len <= 0) {
1131			struct sk_buff *skb_prev;
1132			int alloclen;
1133
1134			skb_prev = skb;
1135			fraggap = skb_prev->len - maxfraglen;
1136
1137			alloclen = fragheaderlen + hh_len + fraggap + 15;
1138			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1139			if (unlikely(!skb)) {
1140				err = -ENOBUFS;
1141				goto error;
1142			}
1143
1144			/*
1145			 *	Fill in the control structures
1146			 */
1147			skb->ip_summed = CHECKSUM_NONE;
1148			skb->csum = 0;
1149			skb_reserve(skb, hh_len);
1150
1151			/*
1152			 *	Find where to start putting bytes.
1153			 */
1154			skb_put(skb, fragheaderlen + fraggap);
1155			skb_reset_network_header(skb);
1156			skb->transport_header = (skb->network_header +
1157						 fragheaderlen);
1158			if (fraggap) {
1159				skb->csum = skb_copy_and_csum_bits(skb_prev,
1160								   maxfraglen,
1161						    skb_transport_header(skb),
1162								   fraggap, 0);
1163				skb_prev->csum = csum_sub(skb_prev->csum,
1164							  skb->csum);
1165				pskb_trim_unique(skb_prev, maxfraglen);
1166			}
1167
1168			/*
1169			 * Put the packet on the pending queue.
1170			 */
1171			__skb_queue_tail(&sk->sk_write_queue, skb);
1172			continue;
1173		}
1174
1175		i = skb_shinfo(skb)->nr_frags;
1176		if (len > size)
1177			len = size;
1178		if (skb_can_coalesce(skb, i, page, offset)) {
1179			skb_shinfo(skb)->frags[i-1].size += len;
1180		} else if (i < MAX_SKB_FRAGS) {
1181			get_page(page);
1182			skb_fill_page_desc(skb, i, page, offset, len);
1183		} else {
1184			err = -EMSGSIZE;
1185			goto error;
1186		}
1187
1188		if (skb->ip_summed == CHECKSUM_NONE) {
1189			__wsum csum;
1190			csum = csum_page(page, offset, len);
1191			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1192		}
1193
1194		skb->len += len;
1195		skb->data_len += len;
1196		skb->truesize += len;
1197		atomic_add(len, &sk->sk_wmem_alloc);
1198		offset += len;
1199		size -= len;
1200	}
1201	return 0;
1202
1203error:
1204	inet->cork.length -= size;
1205	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1206	return err;
1207}
1208
1209static void ip_cork_release(struct inet_sock *inet)
1210{
1211	inet->cork.flags &= ~IPCORK_OPT;
1212	kfree(inet->cork.opt);
1213	inet->cork.opt = NULL;
1214	dst_release(inet->cork.dst);
1215	inet->cork.dst = NULL;
1216}
1217
1218/*
1219 *	Combine all pending IP fragments on the socket into one IP datagram
1220 *	and push it out.
1221 */
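/*
 * All queued skbs are dequeued and chained onto the first one via its
 * frag_list, the IP header is built in front of that first skb, and the
 * result is handed to ip_local_out() as a single datagram.
 */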
1222int ip_push_pending_frames(struct sock *sk)
1223{
1224	struct sk_buff *skb, *tmp_skb;
1225	struct sk_buff **tail_skb;
1226	struct inet_sock *inet = inet_sk(sk);
1227	struct net *net = sock_net(sk);
1228	struct ip_options *opt = NULL;
1229	struct rtable *rt = (struct rtable *)inet->cork.dst;
1230	struct iphdr *iph;
1231	__be16 df = 0;
1232	__u8 ttl;
1233	int err = 0;
1234
1235	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1236		goto out;
1237	tail_skb = &(skb_shinfo(skb)->frag_list);
1238
1239	/* move skb->data to ip header from ext header */
1240	if (skb->data < skb_network_header(skb))
1241		__skb_pull(skb, skb_network_offset(skb));
1242	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1243		__skb_pull(tmp_skb, skb_network_header_len(skb));
1244		*tail_skb = tmp_skb;
1245		tail_skb = &(tmp_skb->next);
1246		skb->len += tmp_skb->len;
1247		skb->data_len += tmp_skb->len;
1248		skb->truesize += tmp_skb->truesize;
1249		tmp_skb->destructor = NULL;
1250		tmp_skb->sk = NULL;
1251	}
1252
1253	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1254	 * allow the frame generated here to be fragmented. No matter how
1255	 * transforms change the size of the packet, it will come out.
1256	 */
1257	if (inet->pmtudisc < IP_PMTUDISC_DO)
1258		skb->local_df = 1;
1259
1260	/* DF bit is set when we want to see DF on outgoing frames.
1261	 * If local_df is set too, we still allow this frame to be
1262	 * fragmented locally. */
1263	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1264	    (skb->len <= dst_mtu(&rt->u.dst) &&
1265	     ip_dont_fragment(sk, &rt->u.dst)))
1266		df = htons(IP_DF);
1267
1268	if (inet->cork.flags & IPCORK_OPT)
1269		opt = inet->cork.opt;
1270
1271	if (rt->rt_type == RTN_MULTICAST)
1272		ttl = inet->mc_ttl;
1273	else
1274		ttl = ip_select_ttl(inet, &rt->u.dst);
1275
1276	iph = (struct iphdr *)skb->data;
1277	iph->version = 4;
1278	iph->ihl = 5;
1279	if (opt) {
1280		iph->ihl += opt->optlen>>2;
1281		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1282	}
1283	iph->tos = inet->tos;
1284	iph->frag_off = df;
1285	ip_select_ident(iph, &rt->u.dst, sk);
1286	iph->ttl = ttl;
1287	iph->protocol = sk->sk_protocol;
1288	iph->saddr = rt->rt_src;
1289	iph->daddr = rt->rt_dst;
1290
1291	skb->priority = sk->sk_priority;
1292	skb->mark = sk->sk_mark;
1293	/*
1294	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1295	 * on dst refcount
1296	 */
1297	inet->cork.dst = NULL;
1298	skb_dst_set(skb, &rt->u.dst);
1299
1300	if (iph->protocol == IPPROTO_ICMP)
1301		icmp_out_count(net, ((struct icmphdr *)
1302			skb_transport_header(skb))->type);
1303
1304	/* Netfilter gets the whole, not yet fragmented skb. */
1305	err = ip_local_out(skb);
1306	if (err) {
1307		if (err > 0)
1308			err = net_xmit_errno(err);
1309		if (err)
1310			goto error;
1311	}
1312
1313out:
1314	ip_cork_release(inet);
1315	return err;
1316
1317error:
1318	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1319	goto out;
1320}
1321
1322/*
1323 *	Throw away all pending data on the socket.
1324 */
1325void ip_flush_pending_frames(struct sock *sk)
1326{
1327	struct sk_buff *skb;
1328
1329	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1330		kfree_skb(skb);
1331
1332	ip_cork_release(inet_sk(sk));
1333}
1334
1335
1336/*
1337 *	Fetch data from kernel space and fill in checksum if needed.
1338 */
1339static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1340			      int len, int odd, struct sk_buff *skb)
1341{
1342	__wsum csum;
1343
1344	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1345	skb->csum = csum_block_add(skb->csum, csum, odd);
1346	return 0;
1347}
1348
1349/*
1350 *	Generic function to send a packet as a reply to another packet.
1351 *	Used to send TCP resets so far. ICMP should use this function too.
1352 *
1353 *	Should be run single-threaded per socket because it uses the sock
1354 *     	structure to pass arguments.
1355 */
1356void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1357		   unsigned int len)
1358{
1359	struct inet_sock *inet = inet_sk(sk);
1360	struct {
1361		struct ip_options	opt;
1362		char			data[40];
1363	} replyopts;
1364	struct ipcm_cookie ipc;
1365	__be32 daddr;
1366	struct rtable *rt = skb_rtable(skb);
1367
1368	if (ip_options_echo(&replyopts.opt, skb))
1369		return;
1370
1371	daddr = ipc.addr = rt->rt_src;
1372	ipc.opt = NULL;
1373	ipc.shtx.flags = 0;
1374
1375	if (replyopts.opt.optlen) {
1376		ipc.opt = &replyopts.opt;
1377
1378		if (ipc.opt->srr)
1379			daddr = replyopts.opt.faddr;
1380	}
1381
1382	{
1383		struct flowi fl = { .oif = arg->bound_dev_if,
1384				    .nl_u = { .ip4_u =
1385					      { .daddr = daddr,
1386						.saddr = rt->rt_spec_dst,
1387						.tos = RT_TOS(ip_hdr(skb)->tos) } },
1388				    /* Not quite clean, but right. */
1389				    .uli_u = { .ports =
1390					       { .sport = tcp_hdr(skb)->dest,
1391						 .dport = tcp_hdr(skb)->source } },
1392				    .proto = sk->sk_protocol,
1393				    .flags = ip_reply_arg_flowi_flags(arg) };
1394		security_skb_classify_flow(skb, &fl);
1395		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1396			return;
1397	}
1398
1399	/* And let IP do all the hard work.
1400
1401	   This chunk is not reentrant, hence the spinlock.
1402	   Note that it relies on the fact that this function is called
1403	   with BH disabled locally and that sk cannot already be locked.
1404	 */
1405	bh_lock_sock(sk);
1406	inet->tos = ip_hdr(skb)->tos;
1407	sk->sk_priority = skb->priority;
1408	sk->sk_protocol = ip_hdr(skb)->protocol;
1409	sk->sk_bound_dev_if = arg->bound_dev_if;
1410	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1411		       &ipc, &rt, MSG_DONTWAIT);
1412	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1413		if (arg->csumoffset >= 0)
1414			*((__sum16 *)skb_transport_header(skb) +
1415			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1416								arg->csum));
1417		skb->ip_summed = CHECKSUM_NONE;
1418		ip_push_pending_frames(sk);
1419	}
1420
1421	bh_unlock_sock(sk);
1422
1423	ip_rt_put(rt);
1424}
1425
1426void __init ip_init(void)
1427{
1428	ip_rt_init();
1429	inet_initpeers();
1430
1431#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1432	igmp_mc_proc_init();
1433#endif
1434}
1435
1436EXPORT_SYMBOL(ip_generic_getfrag);
1437EXPORT_SYMBOL(ip_queue_xmit);
1438EXPORT_SYMBOL(ip_send_check);
1439