ip_output.c revision ea4fc0d6193ff56fcef39b0d2210d402a7acb5f0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

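/*
 * Fill in the total length and header checksum, then run the packet
 * through the NF_INET_LOCAL_OUT netfilter hook.
 */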
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

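/*
 * Run the LOCAL_OUT hook; a return value of 1 means the packet was
 * accepted without being queued, so hand it on to dst_output().
 */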
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

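/* Use the socket's unicast TTL if set, otherwise fall back to the route's hop limit. */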
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

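/*
 * Finish transmission of a routed packet: make sure there is enough
 * headroom for the link-layer header, then send the skb through the
 * cached hardware header or the neighbour's output function.
 */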
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

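/*
 * MTU used for fragmentation decisions: the device MTU when the socket
 * probes path MTU itself (IP_PMTUDISC_PROBE), otherwise the path MTU of
 * the route.
 */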
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

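/*
 * Hand the packet to ip_finish_output2(), fragmenting it first if it
 * exceeds the MTU and is not a GSO packet that will be segmented later.
 */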
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

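/*
 * Output routine for multicast and broadcast packets: loop a copy back
 * to local receivers where required and run the wire copy through the
 * NF_INET_POST_ROUTING hook.
 */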
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

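/*
 * Output routine for locally generated unicast packets: account the
 * packet and run it through the NF_INET_POST_ROUTING hook on its way
 * to ip_finish_output().
 */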
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

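/*
 * Transmit a packet on a connected socket (TCP, SCTP, ...): look up or
 * reuse the cached route, build the IP header and hand the packet to
 * ip_local_out().
 */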
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
		/* If this fails, the transport layer's retransmit mechanism
		 * will keep trying until a route appears or the connection
		 * times out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


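/*
 * Propagate per-packet metadata (priority, protocol, dst, netfilter and
 * scheduling state) from the original skb to a newly built fragment.
 */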
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	/* When a frag_list is given, use it. First, check its validity:
	 * some transformers may create a wrong frag_list or break an
	 * existing one; that is not prohibited. In that case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

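/*
 * Generic getfrag() callback for ip_append_data(): copy data from a
 * user iovec into the skb, accumulating a software checksum unless the
 * hardware will checksum the packet (CHECKSUM_PARTIAL).
 */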
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so
	 * create one single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

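/*
 * Append data to the pending queue for a socket, creating new MTU-sized
 * fragment skbs (or a single UFO skb) as needed; the IP header itself
 * is added later by __ip_make_skb().
 */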
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
	length += exthdrlen;
	transhdrlen += exthdrlen;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we want
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(queue);

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap) {
				alloclen += rt->dst.trailer_len;
				/* make sure mtu is not reached */
				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
					datalen -= ALIGN(rt->dst.trailer_len, 8);
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

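/*
 * Initialise the cork structure for a new corked datagram: copy the IP
 * options, take over the caller's route reference and record the
 * fragment size to use.
 */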
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

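/*
 * Zero-copy companion to ip_append_data(): attach pages of data to the
 * last pending skb, allocating new fragment skbs when the current one
 * is full. Requires a scatter-gather capable device.
 */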
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

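/*
 * Send a completed skb via ip_local_out(), converting transmit error
 * codes and accounting discarded packets.
 */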
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

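/*
 * Single-shot variant of ip_append_data()/ip_push_pending_frames():
 * build a complete datagram on a private queue and return it without
 * sending it.
 */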
struct sk_buff *ip_make_skb(struct sock *sk,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used so far to send TCP resets; ICMP should use this function too.
 *
 *	Should run single-threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	{
		struct flowi4 fl4;

		flowi4_init_output(&fl4, arg->bound_dev_if, 0,
				   RT_TOS(ip_hdr(skb)->tos),
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   ip_reply_arg_flowi_flags(arg),
				   daddr, rt->rt_spec_dst,
				   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
		rt = ip_route_output_key(sock_net(sk), &fl4);
		if (IS_ERR(rt))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be locked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}