ip_output.c revision 69cce1d1404968f78b177a0314f5822d5afdbbfb
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case the packet is not accepted by
27 *					output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90	iph->check = 0;
91	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93EXPORT_SYMBOL(ip_send_check);
94
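/*
 * Finalise the IP header (total length, header checksum) and run the
 * packet through the NF_INET_LOCAL_OUT netfilter hook.  nf_hook()
 * returning 1 means the hook accepted the packet and the caller is
 * expected to continue with dst_output(), as ip_local_out() does below.
 */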
95int __ip_local_out(struct sk_buff *skb)
96{
97	struct iphdr *iph = ip_hdr(skb);
98
99	iph->tot_len = htons(skb->len);
100	ip_send_check(iph);
101	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102		       skb_dst(skb)->dev, dst_output);
103}
104
105int ip_local_out(struct sk_buff *skb)
106{
107	int err;
108
109	err = __ip_local_out(skb);
110	if (likely(err == 1))
111		err = dst_output(skb);
112
113	return err;
114}
115EXPORT_SYMBOL_GPL(ip_local_out);
116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120	skb_reset_mac_header(newskb);
121	__skb_pull(newskb, skb_network_offset(newskb));
122	newskb->pkt_type = PACKET_LOOPBACK;
123	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124	WARN_ON(!skb_dst(newskb));
125	netif_rx_ni(newskb);
126	return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131	int ttl = inet->uc_ttl;
132
133	if (ttl < 0)
134		ttl = ip4_dst_hoplimit(dst);
135	return ttl;
136}
137
138/*
139 *		Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144{
145	struct inet_sock *inet = inet_sk(sk);
146	struct rtable *rt = skb_rtable(skb);
147	struct iphdr *iph;
148
149	/* Build the IP header. */
150	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151	skb_reset_network_header(skb);
152	iph = ip_hdr(skb);
153	iph->version  = 4;
154	iph->ihl      = 5;
155	iph->tos      = inet->tos;
156	if (ip_dont_fragment(sk, &rt->dst))
157		iph->frag_off = htons(IP_DF);
158	else
159		iph->frag_off = 0;
160	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162	iph->saddr    = saddr;
163	iph->protocol = sk->sk_protocol;
164	ip_select_ident(iph, &rt->dst, sk);
165
166	if (opt && opt->opt.optlen) {
167		iph->ihl += opt->opt.optlen>>2;
168		ip_options_build(skb, &opt->opt, daddr, rt, 0);
169	}
170
171	skb->priority = sk->sk_priority;
172	skb->mark = sk->sk_mark;
173
174	/* Send it out. */
175	return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
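/*
 * Last step of the output path: make sure the skb has enough headroom
 * for the link-layer header, then hand it to the neighbour entry of the
 * route, which resolves the hardware address and transmits the frame.
 */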
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181	struct dst_entry *dst = skb_dst(skb);
182	struct rtable *rt = (struct rtable *)dst;
183	struct net_device *dev = dst->dev;
184	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185	struct neighbour *neigh;
186
187	if (rt->rt_type == RTN_MULTICAST) {
188		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189	} else if (rt->rt_type == RTN_BROADCAST)
190		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191
192	/* Be paranoid, rather than too clever. */
193	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194		struct sk_buff *skb2;
195
196		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197		if (skb2 == NULL) {
198			kfree_skb(skb);
199			return -ENOMEM;
200		}
201		if (skb->sk)
202			skb_set_owner_w(skb2, skb->sk);
203		kfree_skb(skb);
204		skb = skb2;
205	}
206
207	neigh = dst_get_neighbour(dst);
208	if (neigh)
209		return neigh_output(neigh, skb);
210
211	if (net_ratelimit())
212		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213	kfree_skb(skb);
214	return -EINVAL;
215}
216
217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218{
219	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
221	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223}
224
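/*
 * okfn of the POST_ROUTING hook: if an IPsec policy was attached after
 * NAT, send the skb back through dst_output() for re-routing; otherwise
 * fragment packets that exceed the path MTU (unless GSO will handle it)
 * and pass the result to ip_finish_output2().
 */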
225static int ip_finish_output(struct sk_buff *skb)
226{
227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228	/* Policy lookup after SNAT yielded a new policy */
229	if (skb_dst(skb)->xfrm != NULL) {
230		IPCB(skb)->flags |= IPSKB_REROUTED;
231		return dst_output(skb);
232	}
233#endif
234	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235		return ip_fragment(skb, ip_finish_output2);
236	else
237		return ip_finish_output2(skb);
238}
239
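/*
 * Output routine for multicast and broadcast routes: a clone is looped
 * back through the POST_ROUTING hook for local listeners where needed,
 * before the original skb is sent out on the wire.
 */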
240int ip_mc_output(struct sk_buff *skb)
241{
242	struct sock *sk = skb->sk;
243	struct rtable *rt = skb_rtable(skb);
244	struct net_device *dev = rt->dst.dev;
245
246	/*
247	 *	If the indicated interface is up and running, send the packet.
248	 */
249	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250
251	skb->dev = dev;
252	skb->protocol = htons(ETH_P_IP);
253
254	/*
255	 *	Multicasts are looped back for other local users
256	 */
257
258	if (rt->rt_flags&RTCF_MULTICAST) {
259		if (sk_mc_loop(sk)
260#ifdef CONFIG_IP_MROUTE
261		/* Small optimization: do not loop back non-local frames
262		   that came back to us after forwarding; they will be dropped
263		   by ip_mr_input() in any case.
264		   Note that local frames are looped back to be delivered
265		   to local recipients.
266
267		   This check is duplicated in ip_mr_input() at the moment.
268		 */
269		    &&
270		    ((rt->rt_flags & RTCF_LOCAL) ||
271		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
272#endif
273		   ) {
274			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275			if (newskb)
276				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277					newskb, NULL, newskb->dev,
278					ip_dev_loopback_xmit);
279		}
280
281		/* Multicasts with ttl 0 must not go beyond the host */
282
283		if (ip_hdr(skb)->ttl == 0) {
284			kfree_skb(skb);
285			return 0;
286		}
287	}
288
289	if (rt->rt_flags&RTCF_BROADCAST) {
290		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291		if (newskb)
292			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293				NULL, newskb->dev, ip_dev_loopback_xmit);
294	}
295
296	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297			    skb->dev, ip_finish_output,
298			    !(IPCB(skb)->flags & IPSKB_REROUTED));
299}
300
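/*
 * Standard output routine for unicast routes: account the packet and
 * run it through the POST_ROUTING hook (the hook is skipped for packets
 * already re-routed via the xfrm path in ip_finish_output()).
 */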
301int ip_output(struct sk_buff *skb)
302{
303	struct net_device *dev = skb_dst(skb)->dev;
304
305	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306
307	skb->dev = dev;
308	skb->protocol = htons(ETH_P_IP);
309
310	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311			    ip_finish_output,
312			    !(IPCB(skb)->flags & IPSKB_REROUTED));
313}
314
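/*
 * Transmit routine for connected sockets: reuse the route already
 * attached to the skb (e.g. by SCTP) or the one cached on the socket,
 * build the IP header from the socket and flow information and hand
 * the packet to ip_local_out().
 */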
315int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
316{
317	struct sock *sk = skb->sk;
318	struct inet_sock *inet = inet_sk(sk);
319	struct ip_options_rcu *inet_opt;
320	struct flowi4 *fl4;
321	struct rtable *rt;
322	struct iphdr *iph;
323	int res;
324
325	/* Skip all of this if the packet is already routed,
326	 * f.e. by something like SCTP.
327	 */
328	rcu_read_lock();
329	inet_opt = rcu_dereference(inet->inet_opt);
330	fl4 = &fl->u.ip4;
331	rt = skb_rtable(skb);
332	if (rt != NULL)
333		goto packet_routed;
334
335	/* Make sure we can route this packet. */
336	rt = (struct rtable *)__sk_dst_check(sk, 0);
337	if (rt == NULL) {
338		__be32 daddr;
339
340		/* Use correct destination address if we have options. */
341		daddr = inet->inet_daddr;
342		if (inet_opt && inet_opt->opt.srr)
343			daddr = inet_opt->opt.faddr;
344
345		/* If this fails, the retransmit mechanism of the transport layer
346		 * will keep trying until a route appears or the connection times
347		 * out.
348		 */
349		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
350					   daddr, inet->inet_saddr,
351					   inet->inet_dport,
352					   inet->inet_sport,
353					   sk->sk_protocol,
354					   RT_CONN_FLAGS(sk),
355					   sk->sk_bound_dev_if);
356		if (IS_ERR(rt))
357			goto no_route;
358		sk_setup_caps(sk, &rt->dst);
359	}
360	skb_dst_set_noref(skb, &rt->dst);
361
362packet_routed:
363	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
364		goto no_route;
365
366	/* OK, we know where to send it, allocate and build IP header. */
367	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
368	skb_reset_network_header(skb);
369	iph = ip_hdr(skb);
370	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
371	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
372		iph->frag_off = htons(IP_DF);
373	else
374		iph->frag_off = 0;
375	iph->ttl      = ip_select_ttl(inet, &rt->dst);
376	iph->protocol = sk->sk_protocol;
377	iph->saddr    = fl4->saddr;
378	iph->daddr    = fl4->daddr;
379	/* The transport layer has already set the transport header itself. */
380
381	if (inet_opt && inet_opt->opt.optlen) {
382		iph->ihl += inet_opt->opt.optlen >> 2;
383		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
384	}
385
386	ip_select_ident_more(iph, &rt->dst, sk,
387			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
388
389	skb->priority = sk->sk_priority;
390	skb->mark = sk->sk_mark;
391
392	res = ip_local_out(skb);
393	rcu_read_unlock();
394	return res;
395
396no_route:
397	rcu_read_unlock();
398	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
399	kfree_skb(skb);
400	return -EHOSTUNREACH;
401}
402EXPORT_SYMBOL(ip_queue_xmit);
403
404
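/*
 * Copy per-packet metadata (dst, priority, netfilter and traffic
 * control state, ...) from the original skb to a fragment.
 */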
405static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
406{
407	to->pkt_type = from->pkt_type;
408	to->priority = from->priority;
409	to->protocol = from->protocol;
410	skb_dst_drop(to);
411	skb_dst_copy(to, from);
412	to->dev = from->dev;
413	to->mark = from->mark;
414
415	/* Copy the flags to each fragment. */
416	IPCB(to)->flags = IPCB(from)->flags;
417
418#ifdef CONFIG_NET_SCHED
419	to->tc_index = from->tc_index;
420#endif
421	nf_copy(to, from);
422#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
423    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
424	to->nf_trace = from->nf_trace;
425#endif
426#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
427	to->ipvs_property = from->ipvs_property;
428#endif
429	skb_copy_secmark(to, from);
430}
431
432/*
433 *	This IP datagram is too large to be sent in one piece.  Break it up into
434 *	smaller pieces (each consisting of an IP header plus a block of data from
435 *	the original IP payload) that will still fit in a single device frame,
436 *	and queue such frames for sending.
437 */
438
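/*
 * Two strategies are used below: if the skb already carries a suitable
 * frag_list, those fragments are reused directly (the fast path);
 * otherwise the payload is copied into freshly allocated skbs (the
 * slow path further down).
 */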
439int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
440{
441	struct iphdr *iph;
442	int ptr;
443	struct net_device *dev;
444	struct sk_buff *skb2;
445	unsigned int mtu, hlen, left, len, ll_rs;
446	int offset;
447	__be16 not_last_frag;
448	struct rtable *rt = skb_rtable(skb);
449	int err = 0;
450
451	dev = rt->dst.dev;
452
453	/*
454	 *	Point into the IP datagram header.
455	 */
456
457	iph = ip_hdr(skb);
458
459	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
460		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
461		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
462			  htonl(ip_skb_dst_mtu(skb)));
463		kfree_skb(skb);
464		return -EMSGSIZE;
465	}
466
467	/*
468	 *	Setup starting values.
469	 */
470
471	hlen = iph->ihl * 4;
472	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
473#ifdef CONFIG_BRIDGE_NETFILTER
474	if (skb->nf_bridge)
475		mtu -= nf_bridge_mtu_reduction(skb);
476#endif
477	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
478
479	/* When a frag_list is given, use it. First, check its validity:
480	 * some transformers could create a wrong frag_list or break an existing
481	 * one; that is not prohibited. In this case fall back to copying.
482	 *
483	 * LATER: this step can be merged into the real generation of fragments;
484	 * we can switch to copying when we see the first bad fragment.
485	 */
486	if (skb_has_frag_list(skb)) {
487		struct sk_buff *frag, *frag2;
488		int first_len = skb_pagelen(skb);
489
490		if (first_len - hlen > mtu ||
491		    ((first_len - hlen) & 7) ||
492		    ip_is_fragment(iph) ||
493		    skb_cloned(skb))
494			goto slow_path;
495
496		skb_walk_frags(skb, frag) {
497			/* Correct geometry. */
498			if (frag->len > mtu ||
499			    ((frag->len & 7) && frag->next) ||
500			    skb_headroom(frag) < hlen)
501				goto slow_path_clean;
502
503			/* Partially cloned skb? */
504			if (skb_shared(frag))
505				goto slow_path_clean;
506
507			BUG_ON(frag->sk);
508			if (skb->sk) {
509				frag->sk = skb->sk;
510				frag->destructor = sock_wfree;
511			}
512			skb->truesize -= frag->truesize;
513		}
514
515		/* Everything is OK. Generate! */
516
517		err = 0;
518		offset = 0;
519		frag = skb_shinfo(skb)->frag_list;
520		skb_frag_list_init(skb);
521		skb->data_len = first_len - skb_headlen(skb);
522		skb->len = first_len;
523		iph->tot_len = htons(first_len);
524		iph->frag_off = htons(IP_MF);
525		ip_send_check(iph);
526
527		for (;;) {
528			/* Prepare the header of the next frame
529			 * before the previous one goes out. */
530			if (frag) {
531				frag->ip_summed = CHECKSUM_NONE;
532				skb_reset_transport_header(frag);
533				__skb_push(frag, hlen);
534				skb_reset_network_header(frag);
535				memcpy(skb_network_header(frag), iph, hlen);
536				iph = ip_hdr(frag);
537				iph->tot_len = htons(frag->len);
538				ip_copy_metadata(frag, skb);
539				if (offset == 0)
540					ip_options_fragment(frag);
541				offset += skb->len - hlen;
542				iph->frag_off = htons(offset>>3);
543				if (frag->next != NULL)
544					iph->frag_off |= htons(IP_MF);
545				/* Ready, complete checksum */
546				ip_send_check(iph);
547			}
548
549			err = output(skb);
550
551			if (!err)
552				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
553			if (err || !frag)
554				break;
555
556			skb = frag;
557			frag = skb->next;
558			skb->next = NULL;
559		}
560
561		if (err == 0) {
562			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
563			return 0;
564		}
565
566		while (frag) {
567			skb = frag->next;
568			kfree_skb(frag);
569			frag = skb;
570		}
571		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
572		return err;
573
574slow_path_clean:
575		skb_walk_frags(skb, frag2) {
576			if (frag2 == frag)
577				break;
578			frag2->sk = NULL;
579			frag2->destructor = NULL;
580			skb->truesize += frag2->truesize;
581		}
582	}
583
584slow_path:
585	left = skb->len - hlen;		/* Space per frame */
586	ptr = hlen;		/* Where to start from */
587
588	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
589	 * we need to make room for the encapsulating header
590	 */
591	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
592
593	/*
594	 *	Fragment the datagram.
595	 */
596
597	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
598	not_last_frag = iph->frag_off & htons(IP_MF);
599
600	/*
601	 *	Keep copying data until we run out.
602	 */
603
604	while (left > 0) {
605		len = left;
606		/* IF: it doesn't fit, use 'mtu' - the data space left */
607		if (len > mtu)
608			len = mtu;
609		/* IF: we are not sending up to and including the packet end
610		   then align the next start on an eight byte boundary */
611		if (len < left)	{
612			len &= ~7;
613		}
614		/*
615		 *	Allocate buffer.
616		 */
617
618		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
619			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
620			err = -ENOMEM;
621			goto fail;
622		}
623
624		/*
625		 *	Set up data on packet
626		 */
627
628		ip_copy_metadata(skb2, skb);
629		skb_reserve(skb2, ll_rs);
630		skb_put(skb2, len + hlen);
631		skb_reset_network_header(skb2);
632		skb2->transport_header = skb2->network_header + hlen;
633
634		/*
635		 *	Charge the memory for the fragment to any owner
636		 *	it might possess
637		 */
638
639		if (skb->sk)
640			skb_set_owner_w(skb2, skb->sk);
641
642		/*
643		 *	Copy the packet header into the new buffer.
644		 */
645
646		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
647
648		/*
649		 *	Copy a block of the IP datagram.
650		 */
651		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
652			BUG();
653		left -= len;
654
655		/*
656		 *	Fill in the new header fields.
657		 */
658		iph = ip_hdr(skb2);
659		iph->frag_off = htons((offset >> 3));
660
661		/* ANK: dirty, but effective trick. Upgrade options only if
662		 * the segment to be fragmented was THE FIRST (otherwise,
663		 * options are already fixed) and make it ONCE
664		 * on the initial skb, so that all the following fragments
665		 * will inherit fixed options.
666		 */
667		if (offset == 0)
668			ip_options_fragment(skb);
669
670		/*
671		 *	Added AC : If we are fragmenting a fragment that's not the
672		 *		   last fragment then keep MF set on each fragment
673		 */
674		if (left > 0 || not_last_frag)
675			iph->frag_off |= htons(IP_MF);
676		ptr += len;
677		offset += len;
678
679		/*
680		 *	Put this fragment into the sending queue.
681		 */
682		iph->tot_len = htons(len + hlen);
683
684		ip_send_check(iph);
685
686		err = output(skb2);
687		if (err)
688			goto fail;
689
690		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
691	}
692	kfree_skb(skb);
693	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
694	return err;
695
696fail:
697	kfree_skb(skb);
698	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
699	return err;
700}
701EXPORT_SYMBOL(ip_fragment);
702
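/*
 * getfrag() callback used when the payload comes from a user-space
 * iovec: copy the data and, unless the hardware will checksum the
 * packet (CHECKSUM_PARTIAL), accumulate the checksum on the fly.
 */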
703int
704ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
705{
706	struct iovec *iov = from;
707
708	if (skb->ip_summed == CHECKSUM_PARTIAL) {
709		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
710			return -EFAULT;
711	} else {
712		__wsum csum = 0;
713		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
714			return -EFAULT;
715		skb->csum = csum_block_add(skb->csum, csum, odd);
716	}
717	return 0;
718}
719EXPORT_SYMBOL(ip_generic_getfrag);
720
721static inline __wsum
722csum_page(struct page *page, int offset, int copy)
723{
724	char *kaddr;
725	__wsum csum;
726	kaddr = kmap(page);
727	csum = csum_partial(kaddr + offset, copy, 0);
728	kunmap(page);
729	return csum;
730}
731
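/*
 * UFO path: rather than building individual fragments, queue one large
 * skb marked SKB_GSO_UDP and let the device split it into
 * gso_size-sized pieces on transmit.
 */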
732static inline int ip_ufo_append_data(struct sock *sk,
733			struct sk_buff_head *queue,
734			int getfrag(void *from, char *to, int offset, int len,
735			       int odd, struct sk_buff *skb),
736			void *from, int length, int hh_len, int fragheaderlen,
737			int transhdrlen, int mtu, unsigned int flags)
738{
739	struct sk_buff *skb;
740	int err;
741
742	/* There is support for UDP fragmentation offload by the network
743	 * device, so create one single skb containing the complete
744	 * UDP datagram.
745	 */
746	if ((skb = skb_peek_tail(queue)) == NULL) {
747		skb = sock_alloc_send_skb(sk,
748			hh_len + fragheaderlen + transhdrlen + 20,
749			(flags & MSG_DONTWAIT), &err);
750
751		if (skb == NULL)
752			return err;
753
754		/* reserve space for Hardware header */
755		skb_reserve(skb, hh_len);
756
757		/* create space for UDP/IP header */
758		skb_put(skb, fragheaderlen + transhdrlen);
759
760		/* initialize network header pointer */
761		skb_reset_network_header(skb);
762
763		/* initialize protocol header pointer */
764		skb->transport_header = skb->network_header + fragheaderlen;
765
766		skb->ip_summed = CHECKSUM_PARTIAL;
767		skb->csum = 0;
768
769		/* specify the length of each IP datagram fragment */
770		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
771		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
772		__skb_queue_tail(queue, skb);
773	}
774
775	return skb_append_datato_frags(sk, skb, getfrag, from,
776				       (length - transhdrlen));
777}
778
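/*
 * Core of ip_append_data(): append 'length' bytes obtained via
 * getfrag() to the pending queue, splitting the data into MTU-sized,
 * 8-byte-aligned fragments.  Data is copied into the skb head, or
 * attached as page fragments when the device supports scatter/gather.
 */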
779static int __ip_append_data(struct sock *sk,
780			    struct flowi4 *fl4,
781			    struct sk_buff_head *queue,
782			    struct inet_cork *cork,
783			    int getfrag(void *from, char *to, int offset,
784					int len, int odd, struct sk_buff *skb),
785			    void *from, int length, int transhdrlen,
786			    unsigned int flags)
787{
788	struct inet_sock *inet = inet_sk(sk);
789	struct sk_buff *skb;
790
791	struct ip_options *opt = cork->opt;
792	int hh_len;
793	int exthdrlen;
794	int mtu;
795	int copy;
796	int err;
797	int offset = 0;
798	unsigned int maxfraglen, fragheaderlen;
799	int csummode = CHECKSUM_NONE;
800	struct rtable *rt = (struct rtable *)cork->dst;
801
802	skb = skb_peek_tail(queue);
803
804	exthdrlen = !skb ? rt->dst.header_len : 0;
805	mtu = cork->fragsize;
806
807	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
808
809	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
810	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
811
812	if (cork->length + length > 0xFFFF - fragheaderlen) {
813		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
814			       mtu-exthdrlen);
815		return -EMSGSIZE;
816	}
817
818	/*
819	 * transhdrlen > 0 means that this is the first fragment and we wish
820	 * it not to be fragmented later.
821	 */
822	if (transhdrlen &&
823	    length + fragheaderlen <= mtu &&
824	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
825	    !exthdrlen)
826		csummode = CHECKSUM_PARTIAL;
827
828	cork->length += length;
829	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
830	    (sk->sk_protocol == IPPROTO_UDP) &&
831	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
832		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
833					 hh_len, fragheaderlen, transhdrlen,
834					 mtu, flags);
835		if (err)
836			goto error;
837		return 0;
838	}
839
840	/* So, what's going on in the loop below?
841	 *
842	 * We use the calculated fragment length to generate a chain of skbs;
843	 * each segment is an IP fragment ready for sending to the network
844	 * once the appropriate IP header has been added.
845	 */
846
847	if (!skb)
848		goto alloc_new_skb;
849
850	while (length > 0) {
851		/* Check if the remaining data fits into current packet. */
852		copy = mtu - skb->len;
853		if (copy < length)
854			copy = maxfraglen - skb->len;
855		if (copy <= 0) {
856			char *data;
857			unsigned int datalen;
858			unsigned int fraglen;
859			unsigned int fraggap;
860			unsigned int alloclen;
861			struct sk_buff *skb_prev;
862alloc_new_skb:
863			skb_prev = skb;
864			if (skb_prev)
865				fraggap = skb_prev->len - maxfraglen;
866			else
867				fraggap = 0;
868
869			/*
870			 * If remaining data exceeds the mtu,
871			 * we know we need more fragment(s).
872			 */
873			datalen = length + fraggap;
874			if (datalen > mtu - fragheaderlen)
875				datalen = maxfraglen - fragheaderlen;
876			fraglen = datalen + fragheaderlen;
877
878			if ((flags & MSG_MORE) &&
879			    !(rt->dst.dev->features&NETIF_F_SG))
880				alloclen = mtu;
881			else
882				alloclen = fraglen;
883
884			alloclen += exthdrlen;
885
886			/* The last fragment gets additional space at tail.
887			 * Note, with MSG_MORE we overallocate on fragments,
888			 * because we have no idea which fragment will be
889			 * the last.
890			 */
891			if (datalen == length + fraggap)
892				alloclen += rt->dst.trailer_len;
893
894			if (transhdrlen) {
895				skb = sock_alloc_send_skb(sk,
896						alloclen + hh_len + 15,
897						(flags & MSG_DONTWAIT), &err);
898			} else {
899				skb = NULL;
900				if (atomic_read(&sk->sk_wmem_alloc) <=
901				    2 * sk->sk_sndbuf)
902					skb = sock_wmalloc(sk,
903							   alloclen + hh_len + 15, 1,
904							   sk->sk_allocation);
905				if (unlikely(skb == NULL))
906					err = -ENOBUFS;
907				else
908					/* only the initial fragment is
909					   time stamped */
910					cork->tx_flags = 0;
911			}
912			if (skb == NULL)
913				goto error;
914
915			/*
916			 *	Fill in the control structures
917			 */
918			skb->ip_summed = csummode;
919			skb->csum = 0;
920			skb_reserve(skb, hh_len);
921			skb_shinfo(skb)->tx_flags = cork->tx_flags;
922
923			/*
924			 *	Find where to start putting bytes.
925			 */
926			data = skb_put(skb, fraglen + exthdrlen);
927			skb_set_network_header(skb, exthdrlen);
928			skb->transport_header = (skb->network_header +
929						 fragheaderlen);
930			data += fragheaderlen + exthdrlen;
931
932			if (fraggap) {
933				skb->csum = skb_copy_and_csum_bits(
934					skb_prev, maxfraglen,
935					data + transhdrlen, fraggap, 0);
936				skb_prev->csum = csum_sub(skb_prev->csum,
937							  skb->csum);
938				data += fraggap;
939				pskb_trim_unique(skb_prev, maxfraglen);
940			}
941
942			copy = datalen - transhdrlen - fraggap;
943			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
944				err = -EFAULT;
945				kfree_skb(skb);
946				goto error;
947			}
948
949			offset += copy;
950			length -= datalen - fraggap;
951			transhdrlen = 0;
952			exthdrlen = 0;
953			csummode = CHECKSUM_NONE;
954
955			/*
956			 * Put the packet on the pending queue.
957			 */
958			__skb_queue_tail(queue, skb);
959			continue;
960		}
961
962		if (copy > length)
963			copy = length;
964
965		if (!(rt->dst.dev->features&NETIF_F_SG)) {
966			unsigned int off;
967
968			off = skb->len;
969			if (getfrag(from, skb_put(skb, copy),
970					offset, copy, off, skb) < 0) {
971				__skb_trim(skb, off);
972				err = -EFAULT;
973				goto error;
974			}
975		} else {
976			int i = skb_shinfo(skb)->nr_frags;
977			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
978			struct page *page = cork->page;
979			int off = cork->off;
980			unsigned int left;
981
982			if (page && (left = PAGE_SIZE - off) > 0) {
983				if (copy >= left)
984					copy = left;
985				if (page != frag->page) {
986					if (i == MAX_SKB_FRAGS) {
987						err = -EMSGSIZE;
988						goto error;
989					}
990					get_page(page);
991					skb_fill_page_desc(skb, i, page, off, 0);
992					frag = &skb_shinfo(skb)->frags[i];
993				}
994			} else if (i < MAX_SKB_FRAGS) {
995				if (copy > PAGE_SIZE)
996					copy = PAGE_SIZE;
997				page = alloc_pages(sk->sk_allocation, 0);
998				if (page == NULL)  {
999					err = -ENOMEM;
1000					goto error;
1001				}
1002				cork->page = page;
1003				cork->off = 0;
1004
1005				skb_fill_page_desc(skb, i, page, 0, 0);
1006				frag = &skb_shinfo(skb)->frags[i];
1007			} else {
1008				err = -EMSGSIZE;
1009				goto error;
1010			}
1011			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012				err = -EFAULT;
1013				goto error;
1014			}
1015			cork->off += copy;
1016			frag->size += copy;
1017			skb->len += copy;
1018			skb->data_len += copy;
1019			skb->truesize += copy;
1020			atomic_add(copy, &sk->sk_wmem_alloc);
1021		}
1022		offset += copy;
1023		length -= copy;
1024	}
1025
1026	return 0;
1027
1028error:
1029	cork->length -= length;
1030	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1031	return err;
1032}
1033
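/*
 * Initialise the cork for a new datagram: copy any IP options, record
 * the fragment size derived from the PMTU setting and steal the
 * caller's reference to the route.
 */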
1034static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035			 struct ipcm_cookie *ipc, struct rtable **rtp)
1036{
1037	struct inet_sock *inet = inet_sk(sk);
1038	struct ip_options_rcu *opt;
1039	struct rtable *rt;
1040
1041	/*
1042	 * setup for corking.
1043	 */
1044	opt = ipc->opt;
1045	if (opt) {
1046		if (cork->opt == NULL) {
1047			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048					    sk->sk_allocation);
1049			if (unlikely(cork->opt == NULL))
1050				return -ENOBUFS;
1051		}
1052		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053		cork->flags |= IPCORK_OPT;
1054		cork->addr = ipc->addr;
1055	}
1056	rt = *rtp;
1057	if (unlikely(!rt))
1058		return -EFAULT;
1059	/*
1060	 * We steal the reference to this route; the caller should not release it.
1061	 */
1062	*rtp = NULL;
1063	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065	cork->dst = &rt->dst;
1066	cork->length = 0;
1067	cork->tx_flags = ipc->tx_flags;
1068	cork->page = NULL;
1069	cork->off = 0;
1070
1071	return 0;
1072}
1073
1074/*
1075 *	ip_append_data() and ip_append_page() can make one large IP datagram
1076 *	from many pieces of data. Each piece will be held on the socket
1077 *	until ip_push_pending_frames() is called. Each piece can be a page
1078 *	or non-page data.
1079 *
1080 *	Not only UDP; other transport protocols - e.g. raw sockets - can
1081 *	potentially use this interface.
1082 *
1083 *	LATER: length must be adjusted by pad at tail, when it is required.
1084 */
1085int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086		   int getfrag(void *from, char *to, int offset, int len,
1087			       int odd, struct sk_buff *skb),
1088		   void *from, int length, int transhdrlen,
1089		   struct ipcm_cookie *ipc, struct rtable **rtp,
1090		   unsigned int flags)
1091{
1092	struct inet_sock *inet = inet_sk(sk);
1093	int err;
1094
1095	if (flags&MSG_PROBE)
1096		return 0;
1097
1098	if (skb_queue_empty(&sk->sk_write_queue)) {
1099		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100		if (err)
1101			return err;
1102	} else {
1103		transhdrlen = 0;
1104	}
1105
1106	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107				from, length, transhdrlen, flags);
1108}
1109
1110ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1111		       int offset, size_t size, int flags)
1112{
1113	struct inet_sock *inet = inet_sk(sk);
1114	struct sk_buff *skb;
1115	struct rtable *rt;
1116	struct ip_options *opt = NULL;
1117	struct inet_cork *cork;
1118	int hh_len;
1119	int mtu;
1120	int len;
1121	int err;
1122	unsigned int maxfraglen, fragheaderlen, fraggap;
1123
1124	if (inet->hdrincl)
1125		return -EPERM;
1126
1127	if (flags&MSG_PROBE)
1128		return 0;
1129
1130	if (skb_queue_empty(&sk->sk_write_queue))
1131		return -EINVAL;
1132
1133	cork = &inet->cork.base;
1134	rt = (struct rtable *)cork->dst;
1135	if (cork->flags & IPCORK_OPT)
1136		opt = cork->opt;
1137
1138	if (!(rt->dst.dev->features&NETIF_F_SG))
1139		return -EOPNOTSUPP;
1140
1141	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1142	mtu = cork->fragsize;
1143
1144	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1145	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1146
1147	if (cork->length + size > 0xFFFF - fragheaderlen) {
1148		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1149		return -EMSGSIZE;
1150	}
1151
1152	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1153		return -EINVAL;
1154
1155	cork->length += size;
1156	if ((size + skb->len > mtu) &&
1157	    (sk->sk_protocol == IPPROTO_UDP) &&
1158	    (rt->dst.dev->features & NETIF_F_UFO)) {
1159		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1160		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161	}
1162
1163
1164	while (size > 0) {
1165		int i;
1166
1167		if (skb_is_gso(skb))
1168			len = size;
1169		else {
1170
1171			/* Check if the remaining data fits into current packet. */
1172			len = mtu - skb->len;
1173			if (len < size)
1174				len = maxfraglen - skb->len;
1175		}
1176		if (len <= 0) {
1177			struct sk_buff *skb_prev;
1178			int alloclen;
1179
1180			skb_prev = skb;
1181			fraggap = skb_prev->len - maxfraglen;
1182
1183			alloclen = fragheaderlen + hh_len + fraggap + 15;
1184			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1185			if (unlikely(!skb)) {
1186				err = -ENOBUFS;
1187				goto error;
1188			}
1189
1190			/*
1191			 *	Fill in the control structures
1192			 */
1193			skb->ip_summed = CHECKSUM_NONE;
1194			skb->csum = 0;
1195			skb_reserve(skb, hh_len);
1196
1197			/*
1198			 *	Find where to start putting bytes.
1199			 */
1200			skb_put(skb, fragheaderlen + fraggap);
1201			skb_reset_network_header(skb);
1202			skb->transport_header = (skb->network_header +
1203						 fragheaderlen);
1204			if (fraggap) {
1205				skb->csum = skb_copy_and_csum_bits(skb_prev,
1206								   maxfraglen,
1207						    skb_transport_header(skb),
1208								   fraggap, 0);
1209				skb_prev->csum = csum_sub(skb_prev->csum,
1210							  skb->csum);
1211				pskb_trim_unique(skb_prev, maxfraglen);
1212			}
1213
1214			/*
1215			 * Put the packet on the pending queue.
1216			 */
1217			__skb_queue_tail(&sk->sk_write_queue, skb);
1218			continue;
1219		}
1220
1221		i = skb_shinfo(skb)->nr_frags;
1222		if (len > size)
1223			len = size;
1224		if (skb_can_coalesce(skb, i, page, offset)) {
1225			skb_shinfo(skb)->frags[i-1].size += len;
1226		} else if (i < MAX_SKB_FRAGS) {
1227			get_page(page);
1228			skb_fill_page_desc(skb, i, page, offset, len);
1229		} else {
1230			err = -EMSGSIZE;
1231			goto error;
1232		}
1233
1234		if (skb->ip_summed == CHECKSUM_NONE) {
1235			__wsum csum;
1236			csum = csum_page(page, offset, len);
1237			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1238		}
1239
1240		skb->len += len;
1241		skb->data_len += len;
1242		skb->truesize += len;
1243		atomic_add(len, &sk->sk_wmem_alloc);
1244		offset += len;
1245		size -= len;
1246	}
1247	return 0;
1248
1249error:
1250	cork->length -= size;
1251	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1252	return err;
1253}
1254
1255static void ip_cork_release(struct inet_cork *cork)
1256{
1257	cork->flags &= ~IPCORK_OPT;
1258	kfree(cork->opt);
1259	cork->opt = NULL;
1260	dst_release(cork->dst);
1261	cork->dst = NULL;
1262}
1263
1264/*
1265 *	Combined all pending IP fragments on the socket as one IP datagram
1266 *	and push them out.
1267 */
1268struct sk_buff *__ip_make_skb(struct sock *sk,
1269			      struct flowi4 *fl4,
1270			      struct sk_buff_head *queue,
1271			      struct inet_cork *cork)
1272{
1273	struct sk_buff *skb, *tmp_skb;
1274	struct sk_buff **tail_skb;
1275	struct inet_sock *inet = inet_sk(sk);
1276	struct net *net = sock_net(sk);
1277	struct ip_options *opt = NULL;
1278	struct rtable *rt = (struct rtable *)cork->dst;
1279	struct iphdr *iph;
1280	__be16 df = 0;
1281	__u8 ttl;
1282
1283	if ((skb = __skb_dequeue(queue)) == NULL)
1284		goto out;
1285	tail_skb = &(skb_shinfo(skb)->frag_list);
1286
1287	/* move skb->data to ip header from ext header */
1288	if (skb->data < skb_network_header(skb))
1289		__skb_pull(skb, skb_network_offset(skb));
1290	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1291		__skb_pull(tmp_skb, skb_network_header_len(skb));
1292		*tail_skb = tmp_skb;
1293		tail_skb = &(tmp_skb->next);
1294		skb->len += tmp_skb->len;
1295		skb->data_len += tmp_skb->len;
1296		skb->truesize += tmp_skb->truesize;
1297		tmp_skb->destructor = NULL;
1298		tmp_skb->sk = NULL;
1299	}
1300
1301	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1302	 * the frame generated here to be fragmented. No matter how transforms
1303	 * change the size of the packet, it will come out.
1304	 */
1305	if (inet->pmtudisc < IP_PMTUDISC_DO)
1306		skb->local_df = 1;
1307
1308	/* DF bit is set when we want to see DF on outgoing frames.
1309	 * If local_df is set too, we still allow this frame to be
1310	 * fragmented locally. */
1311	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1312	    (skb->len <= dst_mtu(&rt->dst) &&
1313	     ip_dont_fragment(sk, &rt->dst)))
1314		df = htons(IP_DF);
1315
1316	if (cork->flags & IPCORK_OPT)
1317		opt = cork->opt;
1318
1319	if (rt->rt_type == RTN_MULTICAST)
1320		ttl = inet->mc_ttl;
1321	else
1322		ttl = ip_select_ttl(inet, &rt->dst);
1323
1324	iph = (struct iphdr *)skb->data;
1325	iph->version = 4;
1326	iph->ihl = 5;
1327	iph->tos = inet->tos;
1328	iph->frag_off = df;
1329	ip_select_ident(iph, &rt->dst, sk);
1330	iph->ttl = ttl;
1331	iph->protocol = sk->sk_protocol;
1332	iph->saddr = fl4->saddr;
1333	iph->daddr = fl4->daddr;
1334
1335	if (opt) {
1336		iph->ihl += opt->optlen>>2;
1337		ip_options_build(skb, opt, cork->addr, rt, 0);
1338	}
1339
1340	skb->priority = sk->sk_priority;
1341	skb->mark = sk->sk_mark;
1342	/*
1343	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1344	 * on dst refcount
1345	 */
1346	cork->dst = NULL;
1347	skb_dst_set(skb, &rt->dst);
1348
1349	if (iph->protocol == IPPROTO_ICMP)
1350		icmp_out_count(net, ((struct icmphdr *)
1351			skb_transport_header(skb))->type);
1352
1353	ip_cork_release(cork);
1354out:
1355	return skb;
1356}
1357
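/*
 * Transmit a datagram assembled by __ip_make_skb(): pass it to
 * ip_local_out(), map positive NET_XMIT_* return codes to errno values
 * and account discarded packets.
 */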
1358int ip_send_skb(struct sk_buff *skb)
1359{
1360	struct net *net = sock_net(skb->sk);
1361	int err;
1362
1363	err = ip_local_out(skb);
1364	if (err) {
1365		if (err > 0)
1366			err = net_xmit_errno(err);
1367		if (err)
1368			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1369	}
1370
1371	return err;
1372}
1373
1374int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1375{
1376	struct sk_buff *skb;
1377
1378	skb = ip_finish_skb(sk, fl4);
1379	if (!skb)
1380		return 0;
1381
1382	/* Netfilter gets the whole, not yet fragmented skb. */
1383	return ip_send_skb(skb);
1384}
1385
1386/*
1387 *	Throw away all pending data on the socket.
1388 */
1389static void __ip_flush_pending_frames(struct sock *sk,
1390				      struct sk_buff_head *queue,
1391				      struct inet_cork *cork)
1392{
1393	struct sk_buff *skb;
1394
1395	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1396		kfree_skb(skb);
1397
1398	ip_cork_release(cork);
1399}
1400
1401void ip_flush_pending_frames(struct sock *sk)
1402{
1403	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1404}
1405
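/*
 * Build a complete datagram in one call on a private queue and cork,
 * without touching sk->sk_write_queue, so callers that never cork can
 * avoid serialising on the socket's pending-frames state.
 */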
1406struct sk_buff *ip_make_skb(struct sock *sk,
1407			    struct flowi4 *fl4,
1408			    int getfrag(void *from, char *to, int offset,
1409					int len, int odd, struct sk_buff *skb),
1410			    void *from, int length, int transhdrlen,
1411			    struct ipcm_cookie *ipc, struct rtable **rtp,
1412			    unsigned int flags)
1413{
1414	struct inet_cork cork;
1415	struct sk_buff_head queue;
1416	int err;
1417
1418	if (flags & MSG_PROBE)
1419		return NULL;
1420
1421	__skb_queue_head_init(&queue);
1422
1423	cork.flags = 0;
1424	cork.addr = 0;
1425	cork.opt = NULL;
1426	err = ip_setup_cork(sk, &cork, ipc, rtp);
1427	if (err)
1428		return ERR_PTR(err);
1429
1430	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431			       from, length, transhdrlen, flags);
1432	if (err) {
1433		__ip_flush_pending_frames(sk, &queue, &cork);
1434		return ERR_PTR(err);
1435	}
1436
1437	return __ip_make_skb(sk, fl4, &queue, &cork);
1438}
1439
1440/*
1441 *	Fetch data from kernel space and fill in checksum if needed.
1442 */
1443static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1444			      int len, int odd, struct sk_buff *skb)
1445{
1446	__wsum csum;
1447
1448	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1449	skb->csum = csum_block_add(skb->csum, csum, odd);
1450	return 0;
1451}
1452
1453/*
1454 *	Generic function to send a packet as reply to another packet.
1455 *	Used to send TCP resets so far. ICMP should use this function too.
1456 *
1457 *	Should run single threaded per socket because it uses the sock
1458 *     	structure to pass arguments.
1459 */
1460void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1461		   struct ip_reply_arg *arg, unsigned int len)
1462{
1463	struct inet_sock *inet = inet_sk(sk);
1464	struct ip_options_data replyopts;
1465	struct ipcm_cookie ipc;
1466	struct flowi4 fl4;
1467	struct rtable *rt = skb_rtable(skb);
1468
1469	if (ip_options_echo(&replyopts.opt.opt, skb))
1470		return;
1471
1472	ipc.addr = daddr;
1473	ipc.opt = NULL;
1474	ipc.tx_flags = 0;
1475
1476	if (replyopts.opt.opt.optlen) {
1477		ipc.opt = &replyopts.opt;
1478
1479		if (replyopts.opt.opt.srr)
1480			daddr = replyopts.opt.opt.faddr;
1481	}
1482
1483	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1484			   RT_TOS(ip_hdr(skb)->tos),
1485			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1486			   ip_reply_arg_flowi_flags(arg),
1487			   daddr, rt->rt_spec_dst,
1488			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1489	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1490	rt = ip_route_output_key(sock_net(sk), &fl4);
1491	if (IS_ERR(rt))
1492		return;
1493
1494	/* And let IP do all the hard work.
1495
1496	   This chunk is not reentrant, hence the spinlock.
1497	   Note that it relies on the fact that this function is called
1498	   with BHs locally disabled and that sk cannot already be locked.
1499	 */
1500	bh_lock_sock(sk);
1501	inet->tos = ip_hdr(skb)->tos;
1502	sk->sk_priority = skb->priority;
1503	sk->sk_protocol = ip_hdr(skb)->protocol;
1504	sk->sk_bound_dev_if = arg->bound_dev_if;
1505	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1506		       &ipc, &rt, MSG_DONTWAIT);
1507	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1508		if (arg->csumoffset >= 0)
1509			*((__sum16 *)skb_transport_header(skb) +
1510			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1511								arg->csum));
1512		skb->ip_summed = CHECKSUM_NONE;
1513		ip_push_pending_frames(sk, &fl4);
1514	}
1515
1516	bh_unlock_sock(sk);
1517
1518	ip_rt_put(rt);
1519}
1520
1521void __init ip_init(void)
1522{
1523	ip_rt_init();
1524	inet_initpeers();
1525
1526#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1527	igmp_mc_proc_init();
1528#endif
1529}
1530