ip_output.c revision 3b1e0a655f8eba44ab1ee2a1068d169ccfb853b9
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Donald Becker, <becker@super.org>
13 *		Alan Cox, <Alan.Cox@linux.org>
14 *		Richard Underwood
15 *		Stefan Becker, <stefanb@yello.ping.de>
16 *		Jorge Cwik, <jorge@laser.satlink.net>
17 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *		Hirokazu Takahashi, <taka@valinux.co.jp>
19 *
20 *	See ip_input.c for original log
21 *
22 *	Fixes:
23 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
24 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
25 *		Bradford Johnson:	Fix faulty handling of some frames when
26 *					no route is found.
27 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
28 *					(in case the packet is not accepted by
29 *					output firewall rules)
30 *		Mike McLagan	:	Routing by source
31 *		Alexey Kuznetsov:	use new route cache
32 *		Andi Kleen:		Fix broken PMTU recovery and remove
33 *					some redundant tests.
34 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
35 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
36 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
37 *					for decreased register pressure on x86
38 *					and more readability.
39 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
40 *					silently drop skb instead of failing with -EPERM.
41 *		Detlev Wengorz	:	Copy protocol for fragments.
42 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
43 *					datagrams.
44 *		Hirokazu Takahashi:	sendfile() on UDP works now.
45 */
46
47#include <asm/uaccess.h>
48#include <asm/system.h>
49#include <linux/module.h>
50#include <linux/types.h>
51#include <linux/kernel.h>
52#include <linux/mm.h>
53#include <linux/string.h>
54#include <linux/errno.h>
55#include <linux/highmem.h>
56
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/in.h>
60#include <linux/inet.h>
61#include <linux/netdevice.h>
62#include <linux/etherdevice.h>
63#include <linux/proc_fs.h>
64#include <linux/stat.h>
65#include <linux/init.h>
66
67#include <net/snmp.h>
68#include <net/ip.h>
69#include <net/protocol.h>
70#include <net/route.h>
71#include <net/xfrm.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <net/arp.h>
75#include <net/icmp.h>
76#include <net/checksum.h>
77#include <net/inetpeer.h>
78#include <linux/igmp.h>
79#include <linux/netfilter_ipv4.h>
80#include <linux/netfilter_bridge.h>
81#include <linux/mroute.h>
82#include <linux/netlink.h>
83#include <linux/tcp.h>
84
85int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90	iph->check = 0;
91	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93
94int __ip_local_out(struct sk_buff *skb)
95{
96	struct iphdr *iph = ip_hdr(skb);
97
98	iph->tot_len = htons(skb->len);
99	ip_send_check(iph);
100	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
101		       dst_output);
102}
103
104int ip_local_out(struct sk_buff *skb)
105{
106	int err;
107
108	err = __ip_local_out(skb);
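	/* nf_hook() returns 1 when the LOCAL_OUT hook accepted the packet
	 * and left transmission to the caller, so call dst_output() here. */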
109	if (likely(err == 1))
110		err = dst_output(skb);
111
112	return err;
113}
114EXPORT_SYMBOL_GPL(ip_local_out);
115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119	skb_reset_mac_header(newskb);
120	__skb_pull(newskb, skb_network_offset(newskb));
121	newskb->pkt_type = PACKET_LOOPBACK;
122	newskb->ip_summed = CHECKSUM_UNNECESSARY;
123	BUG_TRAP(newskb->dst);
124	netif_rx(newskb);
125	return 0;
126}
127
128static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
129{
130	int ttl = inet->uc_ttl;
131
132	if (ttl < 0)
133		ttl = dst_metric(dst, RTAX_HOPLIMIT);
134	return ttl;
135}
136
137/*
138 *		Add an IP header to an skbuff and send it out.
139 *
140 */
141int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
142			  __be32 saddr, __be32 daddr, struct ip_options *opt)
143{
144	struct inet_sock *inet = inet_sk(sk);
145	struct rtable *rt = skb->rtable;
146	struct iphdr *iph;
147
148	/* Build the IP header. */
149	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
150	skb_reset_network_header(skb);
151	iph = ip_hdr(skb);
152	iph->version  = 4;
153	iph->ihl      = 5;
154	iph->tos      = inet->tos;
155	if (ip_dont_fragment(sk, &rt->u.dst))
156		iph->frag_off = htons(IP_DF);
157	else
158		iph->frag_off = 0;
159	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
160	iph->daddr    = rt->rt_dst;
161	iph->saddr    = rt->rt_src;
162	iph->protocol = sk->sk_protocol;
163	ip_select_ident(iph, &rt->u.dst, sk);
164
165	if (opt && opt->optlen) {
166		iph->ihl += opt->optlen>>2;
167		ip_options_build(skb, opt, daddr, rt, 0);
168	}
169
170	skb->priority = sk->sk_priority;
171	skb->mark = sk->sk_mark;
172
173	/* Send it out. */
174	return ip_local_out(skb);
175}
176
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181	struct dst_entry *dst = skb->dst;
182	struct rtable *rt = (struct rtable *)dst;
183	struct net_device *dev = dst->dev;
184	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185
186	if (rt->rt_type == RTN_MULTICAST)
187		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
188	else if (rt->rt_type == RTN_BROADCAST)
189		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
190
191	/* Be paranoid, rather than too clever. */
192	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193		struct sk_buff *skb2;
194
195		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196		if (skb2 == NULL) {
197			kfree_skb(skb);
198			return -ENOMEM;
199		}
200		if (skb->sk)
201			skb_set_owner_w(skb2, skb->sk);
202		kfree_skb(skb);
203		skb = skb2;
204	}
205
206	if (dst->hh)
207		return neigh_hh_output(dst->hh, skb);
208	else if (dst->neighbour)
209		return dst->neighbour->output(skb);
210
211	if (net_ratelimit())
212		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213	kfree_skb(skb);
214	return -EINVAL;
215}
216
217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218{
219	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
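	/* With IP_PMTUDISC_PROBE use the device MTU rather than the
	 * (possibly smaller) path MTU cached on the route. */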
221	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222	       skb->dst->dev->mtu : dst_mtu(skb->dst);
223}
224
225static int ip_finish_output(struct sk_buff *skb)
226{
227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228	/* Policy lookup after SNAT yielded a new policy */
229	if (skb->dst->xfrm != NULL) {
230		IPCB(skb)->flags |= IPSKB_REROUTED;
231		return dst_output(skb);
232	}
233#endif
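	/* GSO packets may exceed the MTU by design; they are segmented later,
	 * so only fragment non-GSO packets that are too large here. */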
234	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235		return ip_fragment(skb, ip_finish_output2);
236	else
237		return ip_finish_output2(skb);
238}
239
240int ip_mc_output(struct sk_buff *skb)
241{
242	struct sock *sk = skb->sk;
243	struct rtable *rt = skb->rtable;
244	struct net_device *dev = rt->u.dst.dev;
245
246	/*
247	 *	If the indicated interface is up and running, send the packet.
248	 */
249	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
250
251	skb->dev = dev;
252	skb->protocol = htons(ETH_P_IP);
253
254	/*
255	 *	Multicasts are looped back for other local users
256	 */
257
258	if (rt->rt_flags&RTCF_MULTICAST) {
259		if ((!sk || inet_sk(sk)->mc_loop)
260#ifdef CONFIG_IP_MROUTE
261		/* Small optimization: do not loop back non-local frames
262		   that returned after forwarding; they will be dropped
263		   by ip_mr_input in any case.
264		   Note that local frames are looped back to be delivered
265		   to local recipients.
266
267		   This check is duplicated in ip_mr_input at the moment.
268		 */
269		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
270#endif
271		) {
272			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
273			if (newskb)
274				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
275					NULL, newskb->dev,
276					ip_dev_loopback_xmit);
277		}
278
279		/* Multicasts with ttl 0 must not go beyond the host */
280
281		if (ip_hdr(skb)->ttl == 0) {
282			kfree_skb(skb);
283			return 0;
284		}
285	}
286
287	if (rt->rt_flags&RTCF_BROADCAST) {
288		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
289		if (newskb)
290			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
291				newskb->dev, ip_dev_loopback_xmit);
292	}
293
294	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
295			    ip_finish_output,
296			    !(IPCB(skb)->flags & IPSKB_REROUTED));
297}
298
299int ip_output(struct sk_buff *skb)
300{
301	struct net_device *dev = skb->dst->dev;
302
303	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
304
305	skb->dev = dev;
306	skb->protocol = htons(ETH_P_IP);
307
308	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
309			    ip_finish_output,
310			    !(IPCB(skb)->flags & IPSKB_REROUTED));
311}
312
313int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
314{
315	struct sock *sk = skb->sk;
316	struct inet_sock *inet = inet_sk(sk);
317	struct ip_options *opt = inet->opt;
318	struct rtable *rt;
319	struct iphdr *iph;
320
321	/* Skip all of this if the packet is already routed,
322	 * e.g. by something like SCTP.
323	 */
324	rt = skb->rtable;
325	if (rt != NULL)
326		goto packet_routed;
327
328	/* Make sure we can route this packet. */
329	rt = (struct rtable *)__sk_dst_check(sk, 0);
330	if (rt == NULL) {
331		__be32 daddr;
332
333		/* Use correct destination address if we have options. */
334		daddr = inet->daddr;
335		if(opt && opt->srr)
336			daddr = opt->faddr;
337
338		{
339			struct flowi fl = { .oif = sk->sk_bound_dev_if,
340					    .nl_u = { .ip4_u =
341						      { .daddr = daddr,
342							.saddr = inet->saddr,
343							.tos = RT_CONN_FLAGS(sk) } },
344					    .proto = sk->sk_protocol,
345					    .uli_u = { .ports =
346						       { .sport = inet->sport,
347							 .dport = inet->dport } } };
348
349			/* If this fails, the retransmit mechanism of the transport
350			 * layer will keep trying until a route appears or the
351			 * connection times itself out.
352			 */
353			security_sk_classify_flow(sk, &fl);
354			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
355				goto no_route;
356		}
357		sk_setup_caps(sk, &rt->u.dst);
358	}
359	skb->dst = dst_clone(&rt->u.dst);
360
361packet_routed:
362	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
363		goto no_route;
364
365	/* OK, we know where to send it, allocate and build IP header. */
366	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
367	skb_reset_network_header(skb);
368	iph = ip_hdr(skb);
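	/* Write version (4), header length (5 words) and TOS in one
	 * 16-bit big-endian store covering the first two header bytes. */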
369	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
370	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
371		iph->frag_off = htons(IP_DF);
372	else
373		iph->frag_off = 0;
374	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
375	iph->protocol = sk->sk_protocol;
376	iph->saddr    = rt->rt_src;
377	iph->daddr    = rt->rt_dst;
378	/* The transport layer has already set the transport header itself. */
379
380	if (opt && opt->optlen) {
381		iph->ihl += opt->optlen >> 2;
382		ip_options_build(skb, opt, inet->daddr, rt, 0);
383	}
384
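	/* For GSO, consume one IP ID per segment that will be generated,
	 * so every segment leaves with a distinct identification. */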
385	ip_select_ident_more(iph, &rt->u.dst, sk,
386			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
387
388	skb->priority = sk->sk_priority;
389	skb->mark = sk->sk_mark;
390
391	return ip_local_out(skb);
392
393no_route:
394	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
395	kfree_skb(skb);
396	return -EHOSTUNREACH;
397}
398
399
400static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
401{
402	to->pkt_type = from->pkt_type;
403	to->priority = from->priority;
404	to->protocol = from->protocol;
405	dst_release(to->dst);
406	to->dst = dst_clone(from->dst);
407	to->dev = from->dev;
408	to->mark = from->mark;
409
410	/* Copy the flags to each fragment. */
411	IPCB(to)->flags = IPCB(from)->flags;
412
413#ifdef CONFIG_NET_SCHED
414	to->tc_index = from->tc_index;
415#endif
416	nf_copy(to, from);
417#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
418    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
419	to->nf_trace = from->nf_trace;
420#endif
421#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
422	to->ipvs_property = from->ipvs_property;
423#endif
424	skb_copy_secmark(to, from);
425}
426
427/*
428 *	This IP datagram is too large to be sent in one piece.  Break it up into
429 *	smaller pieces (each of a size equal to the IP header plus
430 *	a block of the original IP data) that will still fit in a
431 *	single device frame, and queue such frames for sending.
432 */
433
434int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
435{
436	struct iphdr *iph;
437	int raw = 0;
438	int ptr;
439	struct net_device *dev;
440	struct sk_buff *skb2;
441	unsigned int mtu, hlen, left, len, ll_rs, pad;
442	int offset;
443	__be16 not_last_frag;
444	struct rtable *rt = skb->rtable;
445	int err = 0;
446
447	dev = rt->u.dst.dev;
448
449	/*
450	 *	Point into the IP datagram header.
451	 */
452
453	iph = ip_hdr(skb);
454
455	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
456		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
457		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
458			  htonl(ip_skb_dst_mtu(skb)));
459		kfree_skb(skb);
460		return -EMSGSIZE;
461	}
462
463	/*
464	 *	Setup starting values.
465	 */
466
467	hlen = iph->ihl * 4;
468	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
469	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
470
471	/* When frag_list is given, use it. First, check its validity:
472	 * some transformers could create a wrong frag_list or break an
473	 * existing one; that is not prohibited. In this case fall back to copying.
474	 *
475	 * LATER: this step can be merged into the real generation of fragments;
476	 * we can switch to copying when we see the first bad fragment.
477	 */
478	if (skb_shinfo(skb)->frag_list) {
479		struct sk_buff *frag;
480		int first_len = skb_pagelen(skb);
481		int truesizes = 0;
482
483		if (first_len - hlen > mtu ||
484		    ((first_len - hlen) & 7) ||
485		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
486		    skb_cloned(skb))
487			goto slow_path;
488
489		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
490			/* Correct geometry. */
491			if (frag->len > mtu ||
492			    ((frag->len & 7) && frag->next) ||
493			    skb_headroom(frag) < hlen)
494			    goto slow_path;
495
496			/* Partially cloned skb? */
497			if (skb_shared(frag))
498				goto slow_path;
499
500			BUG_ON(frag->sk);
501			if (skb->sk) {
502				sock_hold(skb->sk);
503				frag->sk = skb->sk;
504				frag->destructor = sock_wfree;
505				truesizes += frag->truesize;
506			}
507		}
508
509		/* Everything is OK. Generate! */
510
511		err = 0;
512		offset = 0;
513		frag = skb_shinfo(skb)->frag_list;
514		skb_shinfo(skb)->frag_list = NULL;
515		skb->data_len = first_len - skb_headlen(skb);
516		skb->truesize -= truesizes;
517		skb->len = first_len;
518		iph->tot_len = htons(first_len);
519		iph->frag_off = htons(IP_MF);
520		ip_send_check(iph);
521
522		for (;;) {
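		/* The head skb becomes the first fragment; every skb on the
		 * former frag_list gets its own IP header, offset and MF bit. */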
523			/* Prepare the header of the next frame
524			 * before the previous one goes down. */
525			if (frag) {
526				frag->ip_summed = CHECKSUM_NONE;
527				skb_reset_transport_header(frag);
528				__skb_push(frag, hlen);
529				skb_reset_network_header(frag);
530				memcpy(skb_network_header(frag), iph, hlen);
531				iph = ip_hdr(frag);
532				iph->tot_len = htons(frag->len);
533				ip_copy_metadata(frag, skb);
534				if (offset == 0)
535					ip_options_fragment(frag);
536				offset += skb->len - hlen;
537				iph->frag_off = htons(offset>>3);
538				if (frag->next != NULL)
539					iph->frag_off |= htons(IP_MF);
540				/* Ready, complete checksum */
541				ip_send_check(iph);
542			}
543
544			err = output(skb);
545
546			if (!err)
547				IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
548			if (err || !frag)
549				break;
550
551			skb = frag;
552			frag = skb->next;
553			skb->next = NULL;
554		}
555
556		if (err == 0) {
557			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
558			return 0;
559		}
560
561		while (frag) {
562			skb = frag->next;
563			kfree_skb(frag);
564			frag = skb;
565		}
566		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
567		return err;
568	}
569
570slow_path:
571	left = skb->len - hlen;		/* Space per frame */
572	ptr = raw + hlen;		/* Where to start from */
573
574	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
575	 * we need to make room for the encapsulating header
576	 */
577	pad = nf_bridge_pad(skb);
578	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
579	mtu -= pad;
580
581	/*
582	 *	Fragment the datagram.
583	 */
584
585	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
586	not_last_frag = iph->frag_off & htons(IP_MF);
587
588	/*
589	 *	Keep copying data until we run out.
590	 */
591
592	while (left > 0) {
593		len = left;
594		/* IF: it doesn't fit, use 'mtu' - the data space left */
595		if (len > mtu)
596			len = mtu;
597		/* IF: we are not sending up to and including the packet end,
598		   then align the next start on an eight-byte boundary */
599		if (len < left)	{
600			len &= ~7;
601		}
602		/*
603		 *	Allocate buffer.
604		 */
605
606		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
607			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
608			err = -ENOMEM;
609			goto fail;
610		}
611
612		/*
613		 *	Set up data on packet
614		 */
615
616		ip_copy_metadata(skb2, skb);
617		skb_reserve(skb2, ll_rs);
618		skb_put(skb2, len + hlen);
619		skb_reset_network_header(skb2);
620		skb2->transport_header = skb2->network_header + hlen;
621
622		/*
623		 *	Charge the memory for the fragment to any owner
624		 *	it might possess
625		 */
626
627		if (skb->sk)
628			skb_set_owner_w(skb2, skb->sk);
629
630		/*
631		 *	Copy the packet header into the new buffer.
632		 */
633
634		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
635
636		/*
637		 *	Copy a block of the IP datagram.
638		 */
639		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
640			BUG();
641		left -= len;
642
643		/*
644		 *	Fill in the new header fields.
645		 */
646		iph = ip_hdr(skb2);
647		iph->frag_off = htons((offset >> 3));
648
649		/* ANK: dirty, but effective trick. Upgrade options only if
650		 * the segment to be fragmented was THE FIRST (otherwise,
651		 * options are already fixed) and make it ONCE
652		 * on the initial skb, so that all the following fragments
653		 * will inherit fixed options.
654		 */
655		if (offset == 0)
656			ip_options_fragment(skb);
657
658		/*
659		 *	Added AC : If we are fragmenting a fragment that's not the
660		 *		   last fragment then keep the MF bit set on each fragment
661		 */
662		if (left > 0 || not_last_frag)
663			iph->frag_off |= htons(IP_MF);
664		ptr += len;
665		offset += len;
666
667		/*
668		 *	Put this fragment into the sending queue.
669		 */
670		iph->tot_len = htons(len + hlen);
671
672		ip_send_check(iph);
673
674		err = output(skb2);
675		if (err)
676			goto fail;
677
678		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
679	}
680	kfree_skb(skb);
681	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
682	return err;
683
684fail:
685	kfree_skb(skb);
686	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
687	return err;
688}
689
690EXPORT_SYMBOL(ip_fragment);
691
692int
693ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
694{
695	struct iovec *iov = from;
696
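	/* If the device will checksum for us, just copy the user data;
	 * otherwise copy and accumulate a software checksum into skb->csum. */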
697	if (skb->ip_summed == CHECKSUM_PARTIAL) {
698		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
699			return -EFAULT;
700	} else {
701		__wsum csum = 0;
702		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
703			return -EFAULT;
704		skb->csum = csum_block_add(skb->csum, csum, odd);
705	}
706	return 0;
707}
708
709static inline __wsum
710csum_page(struct page *page, int offset, int copy)
711{
712	char *kaddr;
713	__wsum csum;
714	kaddr = kmap(page);
715	csum = csum_partial(kaddr + offset, copy, 0);
716	kunmap(page);
717	return csum;
718}
719
720static inline int ip_ufo_append_data(struct sock *sk,
721			int getfrag(void *from, char *to, int offset, int len,
722			       int odd, struct sk_buff *skb),
723			void *from, int length, int hh_len, int fragheaderlen,
724			int transhdrlen, int mtu,unsigned int flags)
725{
726	struct sk_buff *skb;
727	int err;
728
729	/* The network device supports UDP fragmentation offload,
730	 * so create a single skb containing the complete
731	 * UDP datagram
732	 */
733	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
734		skb = sock_alloc_send_skb(sk,
735			hh_len + fragheaderlen + transhdrlen + 20,
736			(flags & MSG_DONTWAIT), &err);
737
738		if (skb == NULL)
739			return err;
740
741		/* reserve space for Hardware header */
742		skb_reserve(skb, hh_len);
743
744		/* create space for UDP/IP header */
745		skb_put(skb,fragheaderlen + transhdrlen);
746
747		/* initialize network header pointer */
748		skb_reset_network_header(skb);
749
750		/* initialize protocol header pointer */
751		skb->transport_header = skb->network_header + fragheaderlen;
752
753		skb->ip_summed = CHECKSUM_PARTIAL;
754		skb->csum = 0;
755		sk->sk_sndmsg_off = 0;
756	}
757
758	err = skb_append_datato_frags(sk,skb, getfrag, from,
759			       (length - transhdrlen));
760	if (!err) {
761		/* specify the length of each IP datagram fragment */
762		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
763		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
764		__skb_queue_tail(&sk->sk_write_queue, skb);
765
766		return 0;
767	}
768	/* There is not enough support to do UFO,
769	 * so follow the normal path
770	 */
771	kfree_skb(skb);
772	return err;
773}
774
775/*
776 *	ip_append_data() and ip_append_page() can make one large IP datagram
777 *	from many pieces of data. Each piece will be held on the socket
778 *	until ip_push_pending_frames() is called. Each piece can be a page
779 *	or non-page data.
780 *
781 *	Not only UDP; other transport protocols, e.g. raw sockets, can
782 *	potentially use this interface.
783 *
784 *	LATER: length must be adjusted by pad at tail, when it is required.
785 */
786int ip_append_data(struct sock *sk,
787		   int getfrag(void *from, char *to, int offset, int len,
788			       int odd, struct sk_buff *skb),
789		   void *from, int length, int transhdrlen,
790		   struct ipcm_cookie *ipc, struct rtable *rt,
791		   unsigned int flags)
792{
793	struct inet_sock *inet = inet_sk(sk);
794	struct sk_buff *skb;
795
796	struct ip_options *opt = NULL;
797	int hh_len;
798	int exthdrlen;
799	int mtu;
800	int copy;
801	int err;
802	int offset = 0;
803	unsigned int maxfraglen, fragheaderlen;
804	int csummode = CHECKSUM_NONE;
805
806	if (flags&MSG_PROBE)
807		return 0;
808
809	if (skb_queue_empty(&sk->sk_write_queue)) {
810		/*
811		 * setup for corking.
812		 */
813		opt = ipc->opt;
814		if (opt) {
815			if (inet->cork.opt == NULL) {
816				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
817				if (unlikely(inet->cork.opt == NULL))
818					return -ENOBUFS;
819			}
820			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
821			inet->cork.flags |= IPCORK_OPT;
822			inet->cork.addr = ipc->addr;
823		}
824		dst_hold(&rt->u.dst);
825		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
826					    rt->u.dst.dev->mtu :
827					    dst_mtu(rt->u.dst.path);
828		inet->cork.dst = &rt->u.dst;
829		inet->cork.length = 0;
830		sk->sk_sndmsg_page = NULL;
831		sk->sk_sndmsg_off = 0;
832		if ((exthdrlen = rt->u.dst.header_len) != 0) {
833			length += exthdrlen;
834			transhdrlen += exthdrlen;
835		}
836	} else {
837		rt = (struct rtable *)inet->cork.dst;
838		if (inet->cork.flags & IPCORK_OPT)
839			opt = inet->cork.opt;
840
841		transhdrlen = 0;
842		exthdrlen = 0;
843		mtu = inet->cork.fragsize;
844	}
845	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
846
847	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
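	/* maxfraglen: the largest fragment we may emit, i.e. the IP header
	 * plus payload rounded down to a multiple of 8 bytes. */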
848	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
849
850	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
851		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
852		return -EMSGSIZE;
853	}
854
855	/*
856	 * transhdrlen > 0 means that this is the first fragment and we wish
857	 * it not to be fragmented later on.
858	 */
859	if (transhdrlen &&
860	    length + fragheaderlen <= mtu &&
861	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
862	    !exthdrlen)
863		csummode = CHECKSUM_PARTIAL;
864
865	inet->cork.length += length;
866	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
867			(rt->u.dst.dev->features & NETIF_F_UFO)) {
868
869		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
870					 fragheaderlen, transhdrlen, mtu,
871					 flags);
872		if (err)
873			goto error;
874		return 0;
875	}
876
877	/* So, what's going on in the loop below?
878	 *
879	 * We use the calculated fragment length to generate a chain of skbs;
880	 * each segment is an IP fragment, ready for sending to the network once
881	 * the appropriate IP header has been added.
882	 */
883
884	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
885		goto alloc_new_skb;
886
887	while (length > 0) {
888		/* Check if the remaining data fits into current packet. */
889		copy = mtu - skb->len;
890		if (copy < length)
891			copy = maxfraglen - skb->len;
892		if (copy <= 0) {
893			char *data;
894			unsigned int datalen;
895			unsigned int fraglen;
896			unsigned int fraggap;
897			unsigned int alloclen;
898			struct sk_buff *skb_prev;
899alloc_new_skb:
900			skb_prev = skb;
901			if (skb_prev)
902				fraggap = skb_prev->len - maxfraglen;
903			else
904				fraggap = 0;
905
906			/*
907			 * If remaining data exceeds the mtu,
908			 * we know we need more fragment(s).
909			 */
910			datalen = length + fraggap;
911			if (datalen > mtu - fragheaderlen)
912				datalen = maxfraglen - fragheaderlen;
913			fraglen = datalen + fragheaderlen;
914
915			if ((flags & MSG_MORE) &&
916			    !(rt->u.dst.dev->features&NETIF_F_SG))
917				alloclen = mtu;
918			else
919				alloclen = datalen + fragheaderlen;
920
921			/* The last fragment gets additional space at tail.
922			 * Note, with MSG_MORE we overallocate on fragments,
923			 * because we have no idea what fragment will be
924			 * the last.
925			 */
926			if (datalen == length + fraggap)
927				alloclen += rt->u.dst.trailer_len;
928
929			if (transhdrlen) {
930				skb = sock_alloc_send_skb(sk,
931						alloclen + hh_len + 15,
932						(flags & MSG_DONTWAIT), &err);
933			} else {
934				skb = NULL;
935				if (atomic_read(&sk->sk_wmem_alloc) <=
936				    2 * sk->sk_sndbuf)
937					skb = sock_wmalloc(sk,
938							   alloclen + hh_len + 15, 1,
939							   sk->sk_allocation);
940				if (unlikely(skb == NULL))
941					err = -ENOBUFS;
942			}
943			if (skb == NULL)
944				goto error;
945
946			/*
947			 *	Fill in the control structures
948			 */
949			skb->ip_summed = csummode;
950			skb->csum = 0;
951			skb_reserve(skb, hh_len);
952
953			/*
954			 *	Find where to start putting bytes.
955			 */
956			data = skb_put(skb, fraglen);
957			skb_set_network_header(skb, exthdrlen);
958			skb->transport_header = (skb->network_header +
959						 fragheaderlen);
960			data += fragheaderlen;
961
962			if (fraggap) {
963				skb->csum = skb_copy_and_csum_bits(
964					skb_prev, maxfraglen,
965					data + transhdrlen, fraggap, 0);
966				skb_prev->csum = csum_sub(skb_prev->csum,
967							  skb->csum);
968				data += fraggap;
969				pskb_trim_unique(skb_prev, maxfraglen);
970			}
971
972			copy = datalen - transhdrlen - fraggap;
973			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
974				err = -EFAULT;
975				kfree_skb(skb);
976				goto error;
977			}
978
979			offset += copy;
980			length -= datalen - fraggap;
981			transhdrlen = 0;
982			exthdrlen = 0;
983			csummode = CHECKSUM_NONE;
984
985			/*
986			 * Put the packet on the pending queue.
987			 */
988			__skb_queue_tail(&sk->sk_write_queue, skb);
989			continue;
990		}
991
992		if (copy > length)
993			copy = length;
994
995		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
996			unsigned int off;
997
998			off = skb->len;
999			if (getfrag(from, skb_put(skb, copy),
1000					offset, copy, off, skb) < 0) {
1001				__skb_trim(skb, off);
1002				err = -EFAULT;
1003				goto error;
1004			}
1005		} else {
1006			int i = skb_shinfo(skb)->nr_frags;
1007			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1008			struct page *page = sk->sk_sndmsg_page;
1009			int off = sk->sk_sndmsg_off;
1010			unsigned int left;
1011
1012			if (page && (left = PAGE_SIZE - off) > 0) {
1013				if (copy >= left)
1014					copy = left;
1015				if (page != frag->page) {
1016					if (i == MAX_SKB_FRAGS) {
1017						err = -EMSGSIZE;
1018						goto error;
1019					}
1020					get_page(page);
1021					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1022					frag = &skb_shinfo(skb)->frags[i];
1023				}
1024			} else if (i < MAX_SKB_FRAGS) {
1025				if (copy > PAGE_SIZE)
1026					copy = PAGE_SIZE;
1027				page = alloc_pages(sk->sk_allocation, 0);
1028				if (page == NULL)  {
1029					err = -ENOMEM;
1030					goto error;
1031				}
1032				sk->sk_sndmsg_page = page;
1033				sk->sk_sndmsg_off = 0;
1034
1035				skb_fill_page_desc(skb, i, page, 0, 0);
1036				frag = &skb_shinfo(skb)->frags[i];
1037			} else {
1038				err = -EMSGSIZE;
1039				goto error;
1040			}
1041			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1042				err = -EFAULT;
1043				goto error;
1044			}
1045			sk->sk_sndmsg_off += copy;
1046			frag->size += copy;
1047			skb->len += copy;
1048			skb->data_len += copy;
1049			skb->truesize += copy;
1050			atomic_add(copy, &sk->sk_wmem_alloc);
1051		}
1052		offset += copy;
1053		length -= copy;
1054	}
1055
1056	return 0;
1057
1058error:
1059	inet->cork.length -= length;
1060	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1061	return err;
1062}
1063
1064ssize_t	ip_append_page(struct sock *sk, struct page *page,
1065		       int offset, size_t size, int flags)
1066{
1067	struct inet_sock *inet = inet_sk(sk);
1068	struct sk_buff *skb;
1069	struct rtable *rt;
1070	struct ip_options *opt = NULL;
1071	int hh_len;
1072	int mtu;
1073	int len;
1074	int err;
1075	unsigned int maxfraglen, fragheaderlen, fraggap;
1076
1077	if (inet->hdrincl)
1078		return -EPERM;
1079
1080	if (flags&MSG_PROBE)
1081		return 0;
1082
1083	if (skb_queue_empty(&sk->sk_write_queue))
1084		return -EINVAL;
1085
1086	rt = (struct rtable *)inet->cork.dst;
1087	if (inet->cork.flags & IPCORK_OPT)
1088		opt = inet->cork.opt;
1089
1090	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1091		return -EOPNOTSUPP;
1092
1093	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1094	mtu = inet->cork.fragsize;
1095
1096	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1097	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1098
1099	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1100		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1101		return -EMSGSIZE;
1102	}
1103
1104	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1105		return -EINVAL;
1106
1107	inet->cork.length += size;
1108	if ((sk->sk_protocol == IPPROTO_UDP) &&
1109	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1110		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1111		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1112	}
1113
1114
1115	while (size > 0) {
1116		int i;
1117
1118		if (skb_is_gso(skb))
1119			len = size;
1120		else {
1121
1122			/* Check if the remaining data fits into current packet. */
1123			len = mtu - skb->len;
1124			if (len < size)
1125				len = maxfraglen - skb->len;
1126		}
1127		if (len <= 0) {
1128			struct sk_buff *skb_prev;
1129			int alloclen;
1130
1131			skb_prev = skb;
1132			fraggap = skb_prev->len - maxfraglen;
1133
1134			alloclen = fragheaderlen + hh_len + fraggap + 15;
1135			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1136			if (unlikely(!skb)) {
1137				err = -ENOBUFS;
1138				goto error;
1139			}
1140
1141			/*
1142			 *	Fill in the control structures
1143			 */
1144			skb->ip_summed = CHECKSUM_NONE;
1145			skb->csum = 0;
1146			skb_reserve(skb, hh_len);
1147
1148			/*
1149			 *	Find where to start putting bytes.
1150			 */
1151			skb_put(skb, fragheaderlen + fraggap);
1152			skb_reset_network_header(skb);
1153			skb->transport_header = (skb->network_header +
1154						 fragheaderlen);
1155			if (fraggap) {
1156				skb->csum = skb_copy_and_csum_bits(skb_prev,
1157								   maxfraglen,
1158						    skb_transport_header(skb),
1159								   fraggap, 0);
1160				skb_prev->csum = csum_sub(skb_prev->csum,
1161							  skb->csum);
1162				pskb_trim_unique(skb_prev, maxfraglen);
1163			}
1164
1165			/*
1166			 * Put the packet on the pending queue.
1167			 */
1168			__skb_queue_tail(&sk->sk_write_queue, skb);
1169			continue;
1170		}
1171
1172		i = skb_shinfo(skb)->nr_frags;
1173		if (len > size)
1174			len = size;
1175		if (skb_can_coalesce(skb, i, page, offset)) {
1176			skb_shinfo(skb)->frags[i-1].size += len;
1177		} else if (i < MAX_SKB_FRAGS) {
1178			get_page(page);
1179			skb_fill_page_desc(skb, i, page, offset, len);
1180		} else {
1181			err = -EMSGSIZE;
1182			goto error;
1183		}
1184
1185		if (skb->ip_summed == CHECKSUM_NONE) {
1186			__wsum csum;
1187			csum = csum_page(page, offset, len);
1188			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1189		}
1190
1191		skb->len += len;
1192		skb->data_len += len;
1193		skb->truesize += len;
1194		atomic_add(len, &sk->sk_wmem_alloc);
1195		offset += len;
1196		size -= len;
1197	}
1198	return 0;
1199
1200error:
1201	inet->cork.length -= size;
1202	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1203	return err;
1204}
1205
1206static void ip_cork_release(struct inet_sock *inet)
1207{
1208	inet->cork.flags &= ~IPCORK_OPT;
1209	kfree(inet->cork.opt);
1210	inet->cork.opt = NULL;
1211	dst_release(inet->cork.dst);
1212	inet->cork.dst = NULL;
1213}
1214
1215/*
1216 *	Combine all pending IP fragments on the socket into one IP datagram
1217 *	and push it out.
1218 */
1219int ip_push_pending_frames(struct sock *sk)
1220{
1221	struct sk_buff *skb, *tmp_skb;
1222	struct sk_buff **tail_skb;
1223	struct inet_sock *inet = inet_sk(sk);
1224	struct ip_options *opt = NULL;
1225	struct rtable *rt = (struct rtable *)inet->cork.dst;
1226	struct iphdr *iph;
1227	__be16 df = 0;
1228	__u8 ttl;
1229	int err = 0;
1230
1231	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1232		goto out;
1233	tail_skb = &(skb_shinfo(skb)->frag_list);
1234
1235	/* Move skb->data forward from the ext header to the IP header. */
1236	if (skb->data < skb_network_header(skb))
1237		__skb_pull(skb, skb_network_offset(skb));
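	/* Splice every remaining queued skb onto the first skb's frag_list
	 * and account its length and truesize against the head skb. */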
1238	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1239		__skb_pull(tmp_skb, skb_network_header_len(skb));
1240		*tail_skb = tmp_skb;
1241		tail_skb = &(tmp_skb->next);
1242		skb->len += tmp_skb->len;
1243		skb->data_len += tmp_skb->len;
1244		skb->truesize += tmp_skb->truesize;
1245		__sock_put(tmp_skb->sk);
1246		tmp_skb->destructor = NULL;
1247		tmp_skb->sk = NULL;
1248	}
1249
1250	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1251	 * the frame generated here to be fragmented. No matter how transforms
1252	 * change the size of the packet, it will come out.
1253	 */
1254	if (inet->pmtudisc < IP_PMTUDISC_DO)
1255		skb->local_df = 1;
1256
1257	/* DF bit is set when we want to see DF on outgoing frames.
1258	 * If local_df is also set, we still allow this frame to be fragmented
1259	 * locally. */
1260	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1261	    (skb->len <= dst_mtu(&rt->u.dst) &&
1262	     ip_dont_fragment(sk, &rt->u.dst)))
1263		df = htons(IP_DF);
1264
1265	if (inet->cork.flags & IPCORK_OPT)
1266		opt = inet->cork.opt;
1267
1268	if (rt->rt_type == RTN_MULTICAST)
1269		ttl = inet->mc_ttl;
1270	else
1271		ttl = ip_select_ttl(inet, &rt->u.dst);
1272
1273	iph = (struct iphdr *)skb->data;
1274	iph->version = 4;
1275	iph->ihl = 5;
1276	if (opt) {
1277		iph->ihl += opt->optlen>>2;
1278		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1279	}
1280	iph->tos = inet->tos;
1281	iph->frag_off = df;
1282	ip_select_ident(iph, &rt->u.dst, sk);
1283	iph->ttl = ttl;
1284	iph->protocol = sk->sk_protocol;
1285	iph->saddr = rt->rt_src;
1286	iph->daddr = rt->rt_dst;
1287
1288	skb->priority = sk->sk_priority;
1289	skb->mark = sk->sk_mark;
1290	skb->dst = dst_clone(&rt->u.dst);
1291
1292	if (iph->protocol == IPPROTO_ICMP)
1293		icmp_out_count(((struct icmphdr *)
1294			skb_transport_header(skb))->type);
1295
1296	/* Netfilter gets the whole, not yet fragmented skb. */
1297	err = ip_local_out(skb);
1298	if (err) {
1299		if (err > 0)
1300			err = inet->recverr ? net_xmit_errno(err) : 0;
1301		if (err)
1302			goto error;
1303	}
1304
1305out:
1306	ip_cork_release(inet);
1307	return err;
1308
1309error:
1310	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1311	goto out;
1312}
1313
1314/*
1315 *	Throw away all pending data on the socket.
1316 */
1317void ip_flush_pending_frames(struct sock *sk)
1318{
1319	struct sk_buff *skb;
1320
1321	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1322		kfree_skb(skb);
1323
1324	ip_cork_release(inet_sk(sk));
1325}
1326
1327
1328/*
1329 *	Fetch data from kernel space and fill in checksum if needed.
1330 */
1331static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1332			      int len, int odd, struct sk_buff *skb)
1333{
1334	__wsum csum;
1335
1336	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1337	skb->csum = csum_block_add(skb->csum, csum, odd);
1338	return 0;
1339}
1340
1341/*
1342 *	Generic function to send a packet as a reply to another packet.
1343 *	Used to send TCP resets so far. ICMP should use this function too.
1344 *
1345 *	Should run single-threaded per socket because it uses the sock
1346 *	structure to pass arguments.
1347 */
1348void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1349		   unsigned int len)
1350{
1351	struct inet_sock *inet = inet_sk(sk);
1352	struct {
1353		struct ip_options	opt;
1354		char			data[40];
1355	} replyopts;
1356	struct ipcm_cookie ipc;
1357	__be32 daddr;
1358	struct rtable *rt = skb->rtable;
1359
1360	if (ip_options_echo(&replyopts.opt, skb))
1361		return;
1362
1363	daddr = ipc.addr = rt->rt_src;
1364	ipc.opt = NULL;
1365
1366	if (replyopts.opt.optlen) {
1367		ipc.opt = &replyopts.opt;
1368
1369		if (ipc.opt->srr)
1370			daddr = replyopts.opt.faddr;
1371	}
1372
1373	{
1374		struct flowi fl = { .oif = arg->bound_dev_if,
1375				    .nl_u = { .ip4_u =
1376					      { .daddr = daddr,
1377						.saddr = rt->rt_spec_dst,
1378						.tos = RT_TOS(ip_hdr(skb)->tos) } },
1379				    /* Not quite clean, but right. */
1380				    .uli_u = { .ports =
1381					       { .sport = tcp_hdr(skb)->dest,
1382						 .dport = tcp_hdr(skb)->source } },
1383				    .proto = sk->sk_protocol };
1384		security_skb_classify_flow(skb, &fl);
1385		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1386			return;
1387	}
1388
1389	/* And let IP do all the hard work.
1390
1391	   This chunk is not reentrant, hence the spinlock.
1392	   Note that it relies on the fact that this function is called
1393	   with BHs locally disabled and that sk cannot already be spinlocked.
1394	 */
1395	bh_lock_sock(sk);
1396	inet->tos = ip_hdr(skb)->tos;
1397	sk->sk_priority = skb->priority;
1398	sk->sk_protocol = ip_hdr(skb)->protocol;
1399	sk->sk_bound_dev_if = arg->bound_dev_if;
1400	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1401		       &ipc, rt, MSG_DONTWAIT);
1402	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
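		/* Fold the accumulated data checksum plus the caller-supplied
		 * partial checksum into the transport header at csumoffset. */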
1403		if (arg->csumoffset >= 0)
1404			*((__sum16 *)skb_transport_header(skb) +
1405			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1406								arg->csum));
1407		skb->ip_summed = CHECKSUM_NONE;
1408		ip_push_pending_frames(sk);
1409	}
1410
1411	bh_unlock_sock(sk);
1412
1413	ip_rt_put(rt);
1414}
1415
1416void __init ip_init(void)
1417{
1418	ip_rt_init();
1419	inet_initpeers();
1420
1421#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1422	igmp_mc_proc_init();
1423#endif
1424}
1425
1426EXPORT_SYMBOL(ip_generic_getfrag);
1427EXPORT_SYMBOL(ip_queue_xmit);
1428EXPORT_SYMBOL(ip_send_check);
1429