ip_output.c revision 1c32c5ad6fac8cee1a77449f5abf211e911ff830
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

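/*
 * Run the LOCAL_OUT netfilter hook via __ip_local_out(); if the hook lets
 * the packet continue (return value 1), hand it to dst_output().
 */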
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

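/* Use the socket's unicast TTL if one was set, otherwise the route's default hop limit. */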
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

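/*
 * Final transmit step: make sure there is enough headroom for the
 * link-layer header, then use the cached hardware header if one exists
 * or fall back to the neighbour's output function.
 */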
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

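/*
 * Output path for multicast and broadcast packets: loop a copy back to
 * local listeners where required, then pass the original through the
 * POST_ROUTING hook to ip_finish_output().
 */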
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

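/*
 * Standard output path for locally generated unicast packets: account the
 * packet, then run the POST_ROUTING hook before ip_finish_output().
 */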
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

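/*
 * Transmit a transport-layer segment: reuse the route already attached to
 * the skb or the socket (looking one up if necessary), build the IP header
 * and send the packet via ip_local_out().
 */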
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .mark = sk->sk_mark,
					    .fl4_dst = daddr,
					    .fl4_src = inet->inet_saddr,
					    .fl4_tos = RT_CONN_FLAGS(sk),
					    .proto = sk->sk_protocol,
					    .flags = inet_sk_flowi_flags(sk),
					    .fl_ip_sport = inet->inet_sport,
					    .fl_ip_dport = inet->inet_dport };

			/* If this fails, the retransmit mechanism of the
			 * transport layer will keep trying until the route
			 * appears or the connection times out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* The transport layer has already set up the transport header. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


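/*
 * Propagate per-packet metadata (dst, priority, marks, netfilter and
 * traffic-control state) from the original packet to a fragment.
 */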
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each of a size equal to the IP header plus a
 *	block of the original data) that will still fit in a single device
 *	frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	/* When a frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In that case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes out. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

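/*
 * getfrag() callback for data supplied in an iovec from user space; copies
 * the requested chunk and accumulates a checksum unless the hardware will
 * compute it (CHECKSUM_PARTIAL).
 */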
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so create
	 * one single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

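/*
 * Core of ip_append_data(): append data from getfrag() to the given queue,
 * building a chain of skbs that each fit one MTU-sized fragment (or one
 * large UFO skb when the device can segment UDP itself).
 */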
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = inet->cork.opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
	length += exthdrlen;
	transhdrlen += exthdrlen;
	mtu = inet->cork.fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(queue);

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			/* The last fragment gets additional space at the tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap) {
				alloclen += rt->dst.trailer_len;
				/* make sure mtu is not reached */
				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
					datalen -= ALIGN(rt->dst.trailer_len, 8);
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

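/*
 * Initialise the socket's cork state for a new datagram: remember any IP
 * options and the destination, take over the caller's route reference and
 * record the fragment size to use while corking.
 */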
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal the reference to this route; the caller should not release it.
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into the current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

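/*
 * Hand a completed datagram to ip_local_out() and account any transmit
 * error as an output discard.
 */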
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

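/*
 * Build one datagram from everything queued on the socket and send it.
 */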
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
}

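/*
 * Build a single datagram skb from data supplied via getfrag(), using a
 * private queue and cork state instead of sk->sk_write_queue.
 */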
struct sk_buff *ip_make_skb(struct sock *sk,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork = {};
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .fl4_dst = daddr,
				    .fl4_src = rt->rt_spec_dst,
				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
				    .fl_ip_sport = tcp_hdr(skb)->dest,
				    .fl_ip_dport = tcp_hdr(skb)->source,
				    .proto = sk->sk_protocol,
				    .flags = ip_reply_arg_flowi_flags(arg) };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sock_net(sk), &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be locked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}