ip_output.c revision f6d8bd051c391c1c0458a30b2a7abcd939329259
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case if packet not accepted by
27 *					output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
/*
 * File-scope integer initialised to IPDEFTTL and exported to modules.
 * NOTE(review): presumably the sysctl-tunable default TTL for outgoing
 * packets; the code that reads it is not visible in this part of the file.
 */
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90	iph->check = 0;
91	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93EXPORT_SYMBOL(ip_send_check);
94
95int __ip_local_out(struct sk_buff *skb)
96{
97	struct iphdr *iph = ip_hdr(skb);
98
99	iph->tot_len = htons(skb->len);
100	ip_send_check(iph);
101	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102		       skb_dst(skb)->dev, dst_output);
103}
104
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, continue with dst_output().  Any other value from
 * __ip_local_out() is returned to the caller unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120	skb_reset_mac_header(newskb);
121	__skb_pull(newskb, skb_network_offset(newskb));
122	newskb->pkt_type = PACKET_LOOPBACK;
123	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124	WARN_ON(!skb_dst(newskb));
125	netif_rx_ni(newskb);
126	return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131	int ttl = inet->uc_ttl;
132
133	if (ttl < 0)
134		ttl = ip4_dst_hoplimit(dst);
135	return ttl;
136}
137
138/*
139 *		Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144{
145	struct inet_sock *inet = inet_sk(sk);
146	struct rtable *rt = skb_rtable(skb);
147	struct iphdr *iph;
148
149	/* Build the IP header. */
150	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151	skb_reset_network_header(skb);
152	iph = ip_hdr(skb);
153	iph->version  = 4;
154	iph->ihl      = 5;
155	iph->tos      = inet->tos;
156	if (ip_dont_fragment(sk, &rt->dst))
157		iph->frag_off = htons(IP_DF);
158	else
159		iph->frag_off = 0;
160	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161	iph->daddr    = rt->rt_dst;
162	iph->saddr    = rt->rt_src;
163	iph->protocol = sk->sk_protocol;
164	ip_select_ident(iph, &rt->dst, sk);
165
166	if (opt && opt->opt.optlen) {
167		iph->ihl += opt->opt.optlen>>2;
168		ip_options_build(skb, &opt->opt, daddr, rt, 0);
169	}
170
171	skb->priority = sk->sk_priority;
172	skb->mark = sk->sk_mark;
173
174	/* Send it out. */
175	return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181	struct dst_entry *dst = skb_dst(skb);
182	struct rtable *rt = (struct rtable *)dst;
183	struct net_device *dev = dst->dev;
184	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185
186	if (rt->rt_type == RTN_MULTICAST) {
187		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188	} else if (rt->rt_type == RTN_BROADCAST)
189		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
190
191	/* Be paranoid, rather than too clever. */
192	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193		struct sk_buff *skb2;
194
195		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196		if (skb2 == NULL) {
197			kfree_skb(skb);
198			return -ENOMEM;
199		}
200		if (skb->sk)
201			skb_set_owner_w(skb2, skb->sk);
202		kfree_skb(skb);
203		skb = skb2;
204	}
205
206	if (dst->hh)
207		return neigh_hh_output(dst->hh, skb);
208	else if (dst->neighbour)
209		return dst->neighbour->output(skb);
210
211	if (net_ratelimit())
212		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213	kfree_skb(skb);
214	return -EINVAL;
215}
216
217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218{
219	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
221	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223}
224
225static int ip_finish_output(struct sk_buff *skb)
226{
227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228	/* Policy lookup after SNAT yielded a new policy */
229	if (skb_dst(skb)->xfrm != NULL) {
230		IPCB(skb)->flags |= IPSKB_REROUTED;
231		return dst_output(skb);
232	}
233#endif
234	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235		return ip_fragment(skb, ip_finish_output2);
236	else
237		return ip_finish_output2(skb);
238}
239
240int ip_mc_output(struct sk_buff *skb)
241{
242	struct sock *sk = skb->sk;
243	struct rtable *rt = skb_rtable(skb);
244	struct net_device *dev = rt->dst.dev;
245
246	/*
247	 *	If the indicated interface is up and running, send the packet.
248	 */
249	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250
251	skb->dev = dev;
252	skb->protocol = htons(ETH_P_IP);
253
254	/*
255	 *	Multicasts are looped back for other local users
256	 */
257
258	if (rt->rt_flags&RTCF_MULTICAST) {
259		if (sk_mc_loop(sk)
260#ifdef CONFIG_IP_MROUTE
261		/* Small optimization: do not loopback not local frames,
262		   which returned after forwarding; they will be  dropped
263		   by ip_mr_input in any case.
264		   Note, that local frames are looped back to be delivered
265		   to local recipients.
266
267		   This check is duplicated in ip_mr_input at the moment.
268		 */
269		    &&
270		    ((rt->rt_flags & RTCF_LOCAL) ||
271		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
272#endif
273		   ) {
274			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275			if (newskb)
276				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277					newskb, NULL, newskb->dev,
278					ip_dev_loopback_xmit);
279		}
280
281		/* Multicasts with ttl 0 must not go beyond the host */
282
283		if (ip_hdr(skb)->ttl == 0) {
284			kfree_skb(skb);
285			return 0;
286		}
287	}
288
289	if (rt->rt_flags&RTCF_BROADCAST) {
290		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291		if (newskb)
292			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293				NULL, newskb->dev, ip_dev_loopback_xmit);
294	}
295
296	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297			    skb->dev, ip_finish_output,
298			    !(IPCB(skb)->flags & IPSKB_REROUTED));
299}
300
301int ip_output(struct sk_buff *skb)
302{
303	struct net_device *dev = skb_dst(skb)->dev;
304
305	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306
307	skb->dev = dev;
308	skb->protocol = htons(ETH_P_IP);
309
310	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311			    ip_finish_output,
312			    !(IPCB(skb)->flags & IPSKB_REROUTED));
313}
314
315int ip_queue_xmit(struct sk_buff *skb)
316{
317	struct sock *sk = skb->sk;
318	struct inet_sock *inet = inet_sk(sk);
319	struct ip_options_rcu *inet_opt;
320	struct rtable *rt;
321	struct iphdr *iph;
322	int res;
323
324	/* Skip all of this if the packet is already routed,
325	 * f.e. by something like SCTP.
326	 */
327	rcu_read_lock();
328	inet_opt = rcu_dereference(inet->inet_opt);
329	rt = skb_rtable(skb);
330	if (rt != NULL)
331		goto packet_routed;
332
333	/* Make sure we can route this packet. */
334	rt = (struct rtable *)__sk_dst_check(sk, 0);
335	if (rt == NULL) {
336		__be32 daddr;
337
338		/* Use correct destination address if we have options. */
339		daddr = inet->inet_daddr;
340		if (inet_opt && inet_opt->opt.srr)
341			daddr = inet_opt->opt.faddr;
342
343		/* If this fails, retransmit mechanism of transport layer will
344		 * keep trying until route appears or the connection times
345		 * itself out.
346		 */
347		rt = ip_route_output_ports(sock_net(sk), sk,
348					   daddr, inet->inet_saddr,
349					   inet->inet_dport,
350					   inet->inet_sport,
351					   sk->sk_protocol,
352					   RT_CONN_FLAGS(sk),
353					   sk->sk_bound_dev_if);
354		if (IS_ERR(rt))
355			goto no_route;
356		sk_setup_caps(sk, &rt->dst);
357	}
358	skb_dst_set_noref(skb, &rt->dst);
359
360packet_routed:
361	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
362		goto no_route;
363
364	/* OK, we know where to send it, allocate and build IP header. */
365	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
366	skb_reset_network_header(skb);
367	iph = ip_hdr(skb);
368	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
369	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
370		iph->frag_off = htons(IP_DF);
371	else
372		iph->frag_off = 0;
373	iph->ttl      = ip_select_ttl(inet, &rt->dst);
374	iph->protocol = sk->sk_protocol;
375	iph->saddr    = rt->rt_src;
376	iph->daddr    = rt->rt_dst;
377	/* Transport layer set skb->h.foo itself. */
378
379	if (inet_opt && inet_opt->opt.optlen) {
380		iph->ihl += inet_opt->opt.optlen >> 2;
381		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
382	}
383
384	ip_select_ident_more(iph, &rt->dst, sk,
385			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
386
387	skb->priority = sk->sk_priority;
388	skb->mark = sk->sk_mark;
389
390	res = ip_local_out(skb);
391	rcu_read_unlock();
392	return res;
393
394no_route:
395	rcu_read_unlock();
396	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
397	kfree_skb(skb);
398	return -EHOSTUNREACH;
399}
400EXPORT_SYMBOL(ip_queue_xmit);
401
402
403static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
404{
405	to->pkt_type = from->pkt_type;
406	to->priority = from->priority;
407	to->protocol = from->protocol;
408	skb_dst_drop(to);
409	skb_dst_copy(to, from);
410	to->dev = from->dev;
411	to->mark = from->mark;
412
413	/* Copy the flags to each fragment. */
414	IPCB(to)->flags = IPCB(from)->flags;
415
416#ifdef CONFIG_NET_SCHED
417	to->tc_index = from->tc_index;
418#endif
419	nf_copy(to, from);
420#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
421    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
422	to->nf_trace = from->nf_trace;
423#endif
424#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
425	to->ipvs_property = from->ipvs_property;
426#endif
427	skb_copy_secmark(to, from);
428}
429
430/*
431 *	This IP datagram is too large to be sent in one piece.  Break it up into
432 *	smaller pieces (each of size equal to IP header plus
433 *	a block of the data of the original IP data part) that will yet fit in a
434 *	single device frame, and queue such a frame for sending.
435 */
436
/*
 * ip_fragment - split an IP datagram into fragments and send each one
 * @skb:    datagram to fragment; its route supplies the device and MTU
 * @output: callback invoked once per fragment to transmit it
 *
 * Two strategies are used.  If the skb already carries a frag_list whose
 * geometry is suitable, the list members are turned into fragments in place
 * (fast path).  Otherwise a new skb is allocated for every fragment and the
 * data is copied into it (slow path), after which the original is freed.
 *
 * Returns 0 on success.  Returns -EMSGSIZE (after sending an ICMP
 * "fragmentation needed" message and freeing the skb) when DF is set and
 * skb->local_df is clear, -ENOMEM when a fragment cannot be allocated, or
 * the first non-zero value returned by @output.
 */
437int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
438{
439	struct iphdr *iph;
440	int ptr;
441	struct net_device *dev;
442	struct sk_buff *skb2;
443	unsigned int mtu, hlen, left, len, ll_rs;
444	int offset;
445	__be16 not_last_frag;
446	struct rtable *rt = skb_rtable(skb);
447	int err = 0;
448
449	dev = rt->dst.dev;
450
451	/*
452	 *	Point into the IP datagram header.
453	 */
454
455	iph = ip_hdr(skb);
456
	/* DF is set and local fragmentation is not allowed: report and drop. */
457	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
458		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
459		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
460			  htonl(ip_skb_dst_mtu(skb)));
461		kfree_skb(skb);
462		return -EMSGSIZE;
463	}
464
465	/*
466	 *	Setup starting values.
467	 */
468
469	hlen = iph->ihl * 4;
470	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
471#ifdef CONFIG_BRIDGE_NETFILTER
472	if (skb->nf_bridge)
473		mtu -= nf_bridge_mtu_reduction(skb);
474#endif
475	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
476
477	/* When frag_list is given, use it. First, check its validity:
478	 * some transformers could create wrong frag_list or break existing
479	 * one, it is not prohibited. In this case fall back to copying.
480	 *
481	 * LATER: this step can be merged to real generation of fragments,
482	 * we can switch to copy when see the first bad fragment.
483	 */
484	if (skb_has_frag_list(skb)) {
485		struct sk_buff *frag, *frag2;
486		int first_len = skb_pagelen(skb);
487
		/*
		 * The head must fit the MTU, carry a multiple of 8 payload
		 * bytes, not already be a fragment, and not be cloned.
		 */
488		if (first_len - hlen > mtu ||
489		    ((first_len - hlen) & 7) ||
490		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
491		    skb_cloned(skb))
492			goto slow_path;
493
494		skb_walk_frags(skb, frag) {
495			/* Correct geometry. */
496			if (frag->len > mtu ||
497			    ((frag->len & 7) && frag->next) ||
498			    skb_headroom(frag) < hlen)
499				goto slow_path_clean;
500
501			/* Partially cloned skb? */
502			if (skb_shared(frag))
503				goto slow_path_clean;
504
505			BUG_ON(frag->sk);
			/* Hand socket ownership of this frag's memory to the frag. */
506			if (skb->sk) {
507				frag->sk = skb->sk;
508				frag->destructor = sock_wfree;
509			}
510			skb->truesize -= frag->truesize;
511		}
512
513		/* Everything is OK. Generate! */
514
515		err = 0;
516		offset = 0;
517		frag = skb_shinfo(skb)->frag_list;
518		skb_frag_list_init(skb);
519		skb->data_len = first_len - skb_headlen(skb);
520		skb->len = first_len;
521		iph->tot_len = htons(first_len);
522		iph->frag_off = htons(IP_MF);
523		ip_send_check(iph);
524
525		for (;;) {
526			/* Prepare header of the next frame,
527			 * before previous one went down. */
528			if (frag) {
529				frag->ip_summed = CHECKSUM_NONE;
530				skb_reset_transport_header(frag);
531				__skb_push(frag, hlen);
532				skb_reset_network_header(frag);
533				memcpy(skb_network_header(frag), iph, hlen);
534				iph = ip_hdr(frag);
535				iph->tot_len = htons(frag->len);
536				ip_copy_metadata(frag, skb);
537				if (offset == 0)
538					ip_options_fragment(frag);
539				offset += skb->len - hlen;
540				iph->frag_off = htons(offset>>3);
541				if (frag->next != NULL)
542					iph->frag_off |= htons(IP_MF);
543				/* Ready, complete checksum */
544				ip_send_check(iph);
545			}
546
547			err = output(skb);
548
549			if (!err)
550				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
551			if (err || !frag)
552				break;
553
			/* Advance: the prepared frag becomes the skb to send next. */
554			skb = frag;
555			frag = skb->next;
556			skb->next = NULL;
557		}
558
559		if (err == 0) {
560			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
561			return 0;
562		}
563
		/* output() failed: free every fragment that was not sent. */
564		while (frag) {
565			skb = frag->next;
566			kfree_skb(frag);
567			frag = skb;
568		}
569		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
570		return err;
571
572slow_path_clean:
		/*
		 * Undo the sk/destructor/truesize changes for the frags that
		 * were visited before the one that failed the checks.
		 */
573		skb_walk_frags(skb, frag2) {
574			if (frag2 == frag)
575				break;
576			frag2->sk = NULL;
577			frag2->destructor = NULL;
578			skb->truesize += frag2->truesize;
579		}
580	}
581
582slow_path:
583	left = skb->len - hlen;		/* Payload bytes still to be sent */
584	ptr = hlen;		/* Where to start from */
585
586	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
587	 * we need to make room for the encapsulating header
588	 */
589	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
590
591	/*
592	 *	Fragment the datagram.
593	 */
594
595	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
596	not_last_frag = iph->frag_off & htons(IP_MF);
597
598	/*
599	 *	Keep copying data until we run out.
600	 */
601
602	while (left > 0) {
603		len = left;
604		/* IF: it doesn't fit, use 'mtu' - the data space left */
605		if (len > mtu)
606			len = mtu;
607		/* IF: we are not sending up to and including the packet end
608		   then align the next start on an eight byte boundary */
609		if (len < left)	{
610			len &= ~7;
611		}
612		/*
613		 *	Allocate buffer.
614		 */
615
616		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
617			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
618			err = -ENOMEM;
619			goto fail;
620		}
621
622		/*
623		 *	Set up data on packet
624		 */
625
626		ip_copy_metadata(skb2, skb);
627		skb_reserve(skb2, ll_rs);
628		skb_put(skb2, len + hlen);
629		skb_reset_network_header(skb2);
630		skb2->transport_header = skb2->network_header + hlen;
631
632		/*
633		 *	Charge the memory for the fragment to any owner
634		 *	it might possess
635		 */
636
637		if (skb->sk)
638			skb_set_owner_w(skb2, skb->sk);
639
640		/*
641		 *	Copy the packet header into the new buffer.
642		 */
643
644		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
645
646		/*
647		 *	Copy a block of the IP datagram.
648		 */
649		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
650			BUG();
651		left -= len;
652
653		/*
654		 *	Fill in the new header fields.
655		 */
656		iph = ip_hdr(skb2);
657		iph->frag_off = htons((offset >> 3));
658
659		/* ANK: dirty, but effective trick. Upgrade options only if
660		 * the segment to be fragmented was THE FIRST (otherwise,
661		 * options are already fixed) and make it ONCE
662		 * on the initial skb, so that all the following fragments
663		 * will inherit fixed options.
664		 */
665		if (offset == 0)
666			ip_options_fragment(skb);
667
668		/*
669		 *	Added AC : If we are fragmenting a fragment that's not the
670		 *		   last fragment then keep MF on each bit
671		 */
672		if (left > 0 || not_last_frag)
673			iph->frag_off |= htons(IP_MF);
674		ptr += len;
675		offset += len;
676
677		/*
678		 *	Put this fragment into the sending queue.
679		 */
680		iph->tot_len = htons(len + hlen);
681
682		ip_send_check(iph);
683
684		err = output(skb2);
685		if (err)
686			goto fail;
687
688		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
689	}
	/* All data has been copied into fragments; the original is done. */
690	kfree_skb(skb);
691	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
692	return err;
693
694fail:
695	kfree_skb(skb);
696	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
697	return err;
698}
699EXPORT_SYMBOL(ip_fragment);
700
701int
702ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
703{
704	struct iovec *iov = from;
705
706	if (skb->ip_summed == CHECKSUM_PARTIAL) {
707		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
708			return -EFAULT;
709	} else {
710		__wsum csum = 0;
711		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
712			return -EFAULT;
713		skb->csum = csum_block_add(skb->csum, csum, odd);
714	}
715	return 0;
716}
717EXPORT_SYMBOL(ip_generic_getfrag);
718
719static inline __wsum
720csum_page(struct page *page, int offset, int copy)
721{
722	char *kaddr;
723	__wsum csum;
724	kaddr = kmap(page);
725	csum = csum_partial(kaddr + offset, copy, 0);
726	kunmap(page);
727	return csum;
728}
729
730static inline int ip_ufo_append_data(struct sock *sk,
731			struct sk_buff_head *queue,
732			int getfrag(void *from, char *to, int offset, int len,
733			       int odd, struct sk_buff *skb),
734			void *from, int length, int hh_len, int fragheaderlen,
735			int transhdrlen, int mtu, unsigned int flags)
736{
737	struct sk_buff *skb;
738	int err;
739
740	/* There is support for UDP fragmentation offload by network
741	 * device, so create one single skb packet containing complete
742	 * udp datagram
743	 */
744	if ((skb = skb_peek_tail(queue)) == NULL) {
745		skb = sock_alloc_send_skb(sk,
746			hh_len + fragheaderlen + transhdrlen + 20,
747			(flags & MSG_DONTWAIT), &err);
748
749		if (skb == NULL)
750			return err;
751
752		/* reserve space for Hardware header */
753		skb_reserve(skb, hh_len);
754
755		/* create space for UDP/IP header */
756		skb_put(skb, fragheaderlen + transhdrlen);
757
758		/* initialize network header pointer */
759		skb_reset_network_header(skb);
760
761		/* initialize protocol header pointer */
762		skb->transport_header = skb->network_header + fragheaderlen;
763
764		skb->ip_summed = CHECKSUM_PARTIAL;
765		skb->csum = 0;
766
767		/* specify the length of each IP datagram fragment */
768		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
769		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
770		__skb_queue_tail(queue, skb);
771	}
772
773	return skb_append_datato_frags(sk, skb, getfrag, from,
774				       (length - transhdrlen));
775}
776
/*
 * __ip_append_data - append user data to a queue of pending IP fragments
 * @sk:          socket the data is charged to
 * @queue:       skb queue that holds the fragments built so far
 * @cork:        cork state (route, fragment size, options, running length,
 *               partially filled page)
 * @getfrag:     callback that copies @len bytes at @offset of @from to @to
 * @from:        opaque source handed to @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: transport header length; non-zero only for the first chunk
 * @flags:       MSG_* flags (MSG_MORE and MSG_DONTWAIT are examined here)
 *
 * Data is appended to the tail skb while it fits; otherwise a new skb sized
 * for one fragment is allocated and queued.  On devices with NETIF_F_SG the
 * data goes into page fragments, otherwise into the linear area.
 *
 * Returns 0 on success or a negative errno (-EMSGSIZE, -ENOBUFS, -ENOMEM,
 * -EFAULT, or the error reported by the allocation / UFO helpers).  On
 * failure the not-yet-appended byte count is subtracted from cork->length
 * again and IPSTATS_MIB_OUTDISCARDS is incremented.
 */
777static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
778			    struct inet_cork *cork,
779			    int getfrag(void *from, char *to, int offset,
780					int len, int odd, struct sk_buff *skb),
781			    void *from, int length, int transhdrlen,
782			    unsigned int flags)
783{
784	struct inet_sock *inet = inet_sk(sk);
785	struct sk_buff *skb;
786
787	struct ip_options *opt = cork->opt;
788	int hh_len;
789	int exthdrlen;
790	int mtu;
791	int copy;
792	int err;
793	int offset = 0;
794	unsigned int maxfraglen, fragheaderlen;
795	int csummode = CHECKSUM_NONE;
796	struct rtable *rt = (struct rtable *)cork->dst;
797
	/* The route's extra header length only applies with a transport header. */
798	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
799	length += exthdrlen;
800	transhdrlen += exthdrlen;
801	mtu = cork->fragsize;
802
803	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
804
805	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Largest fragment whose payload length is a multiple of 8 bytes. */
806	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
807
	/* The datagram including its header must not exceed 0xFFFF bytes. */
808	if (cork->length + length > 0xFFFF - fragheaderlen) {
809		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
810			       mtu-exthdrlen);
811		return -EMSGSIZE;
812	}
813
814	/*
815	 * transhdrlen > 0 means that this is the first fragment and we wish
816	 * it won't be fragmented in the future.
817	 */
818	if (transhdrlen &&
819	    length + fragheaderlen <= mtu &&
820	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
821	    !exthdrlen)
822		csummode = CHECKSUM_PARTIAL;
823
824	skb = skb_peek_tail(queue);
825
826	cork->length += length;
	/* UDP on a UFO-capable device: build one large GSO skb instead. */
827	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
828	    (sk->sk_protocol == IPPROTO_UDP) &&
829	    (rt->dst.dev->features & NETIF_F_UFO)) {
830		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
831					 hh_len, fragheaderlen, transhdrlen,
832					 mtu, flags);
833		if (err)
834			goto error;
835		return 0;
836	}
837
838	/* So, what's going on in the loop below?
839	 *
840	 * We use calculated fragment length to generate chained skb,
841	 * each of segments is IP fragment ready for sending to network after
842	 * adding appropriate IP header.
843	 */
844
	/* Empty queue: jump straight into the allocation branch of the loop. */
845	if (!skb)
846		goto alloc_new_skb;
847
848	while (length > 0) {
849		/* Check if the remaining data fits into current packet. */
850		copy = mtu - skb->len;
851		if (copy < length)
852			copy = maxfraglen - skb->len;
853		if (copy <= 0) {
854			char *data;
855			unsigned int datalen;
856			unsigned int fraglen;
857			unsigned int fraggap;
858			unsigned int alloclen;
859			struct sk_buff *skb_prev;
860alloc_new_skb:
861			skb_prev = skb;
			/*
			 * fraggap: bytes by which the previous skb exceeds
			 * maxfraglen; they are moved into the new skb below.
			 */
862			if (skb_prev)
863				fraggap = skb_prev->len - maxfraglen;
864			else
865				fraggap = 0;
866
867			/*
868			 * If remaining data exceeds the mtu,
869			 * we know we need more fragment(s).
870			 */
871			datalen = length + fraggap;
872			if (datalen > mtu - fragheaderlen)
873				datalen = maxfraglen - fragheaderlen;
874			fraglen = datalen + fragheaderlen;
875
			/* More data may follow and cannot go into pages: reserve a full mtu. */
876			if ((flags & MSG_MORE) &&
877			    !(rt->dst.dev->features&NETIF_F_SG))
878				alloclen = mtu;
879			else
880				alloclen = fraglen;
881
882			/* The last fragment gets additional space at tail.
883			 * Note, with MSG_MORE we overallocate on fragments,
884			 * because we have no idea what fragment will be
885			 * the last.
886			 */
887			if (datalen == length + fraggap) {
888				alloclen += rt->dst.trailer_len;
889				/* make sure mtu is not reached */
890				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
891					datalen -= ALIGN(rt->dst.trailer_len, 8);
892			}
			/*
			 * With a transport header (first chunk) the allocation
			 * goes through sock_alloc_send_skb(); later skbs use
			 * sock_wmalloc(), and only while sk_wmem_alloc does not
			 * exceed twice sk_sndbuf.
			 */
893			if (transhdrlen) {
894				skb = sock_alloc_send_skb(sk,
895						alloclen + hh_len + 15,
896						(flags & MSG_DONTWAIT), &err);
897			} else {
898				skb = NULL;
899				if (atomic_read(&sk->sk_wmem_alloc) <=
900				    2 * sk->sk_sndbuf)
901					skb = sock_wmalloc(sk,
902							   alloclen + hh_len + 15, 1,
903							   sk->sk_allocation);
904				if (unlikely(skb == NULL))
905					err = -ENOBUFS;
906				else
907					/* only the initial fragment is
908					   time stamped */
909					cork->tx_flags = 0;
910			}
911			if (skb == NULL)
912				goto error;
913
914			/*
915			 *	Fill in the control structures
916			 */
917			skb->ip_summed = csummode;
918			skb->csum = 0;
919			skb_reserve(skb, hh_len);
920			skb_shinfo(skb)->tx_flags = cork->tx_flags;
921
922			/*
923			 *	Find where to start putting bytes.
924			 */
925			data = skb_put(skb, fraglen);
926			skb_set_network_header(skb, exthdrlen);
927			skb->transport_header = (skb->network_header +
928						 fragheaderlen);
929			data += fragheaderlen;
930
			/*
			 * Move the overhanging tail of the previous skb into
			 * this one, transfer its checksum contribution, and
			 * trim the previous skb back to maxfraglen.
			 */
931			if (fraggap) {
932				skb->csum = skb_copy_and_csum_bits(
933					skb_prev, maxfraglen,
934					data + transhdrlen, fraggap, 0);
935				skb_prev->csum = csum_sub(skb_prev->csum,
936							  skb->csum);
937				data += fraggap;
938				pskb_trim_unique(skb_prev, maxfraglen);
939			}
940
941			copy = datalen - transhdrlen - fraggap;
942			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
943				err = -EFAULT;
944				kfree_skb(skb);
945				goto error;
946			}
947
948			offset += copy;
949			length -= datalen - fraggap;
			/* Header lengths and checksum mode apply to the first skb only. */
950			transhdrlen = 0;
951			exthdrlen = 0;
952			csummode = CHECKSUM_NONE;
953
954			/*
955			 * Put the packet on the pending queue.
956			 */
957			__skb_queue_tail(queue, skb);
958			continue;
959		}
960
961		if (copy > length)
962			copy = length;
963
964		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: extend the linear data area. */
965			unsigned int off;
966
967			off = skb->len;
968			if (getfrag(from, skb_put(skb, copy),
969					offset, copy, off, skb) < 0) {
970				__skb_trim(skb, off);
971				err = -EFAULT;
972				goto error;
973			}
974		} else {
			/* Scatter-gather: append into the cork page or a fresh page. */
975			int i = skb_shinfo(skb)->nr_frags;
976			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
977			struct page *page = cork->page;
978			int off = cork->off;
979			unsigned int left;
980
981			if (page && (left = PAGE_SIZE - off) > 0) {
982				if (copy >= left)
983					copy = left;
				/* Cork page is not this skb's last frag: add a new frag slot. */
984				if (page != frag->page) {
985					if (i == MAX_SKB_FRAGS) {
986						err = -EMSGSIZE;
987						goto error;
988					}
989					get_page(page);
990					skb_fill_page_desc(skb, i, page, off, 0);
991					frag = &skb_shinfo(skb)->frags[i];
992				}
993			} else if (i < MAX_SKB_FRAGS) {
994				if (copy > PAGE_SIZE)
995					copy = PAGE_SIZE;
996				page = alloc_pages(sk->sk_allocation, 0);
997				if (page == NULL)  {
998					err = -ENOMEM;
999					goto error;
1000				}
1001				cork->page = page;
1002				cork->off = 0;
1003
1004				skb_fill_page_desc(skb, i, page, 0, 0);
1005				frag = &skb_shinfo(skb)->frags[i];
1006			} else {
1007				err = -EMSGSIZE;
1008				goto error;
1009			}
1010			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1011				err = -EFAULT;
1012				goto error;
1013			}
			/* Account the copied bytes in the frag, the skb and the socket. */
1014			cork->off += copy;
1015			frag->size += copy;
1016			skb->len += copy;
1017			skb->data_len += copy;
1018			skb->truesize += copy;
1019			atomic_add(copy, &sk->sk_wmem_alloc);
1020		}
1021		offset += copy;
1022		length -= copy;
1023	}
1024
1025	return 0;
1026
1027error:
	/* Take back the bytes that were accounted but never appended. */
1028	cork->length -= length;
1029	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1030	return err;
1031}
1032
1033static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1034			 struct ipcm_cookie *ipc, struct rtable **rtp)
1035{
1036	struct inet_sock *inet = inet_sk(sk);
1037	struct ip_options_rcu *opt;
1038	struct rtable *rt;
1039
1040	/*
1041	 * setup for corking.
1042	 */
1043	opt = ipc->opt;
1044	if (opt) {
1045		if (cork->opt == NULL) {
1046			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1047					    sk->sk_allocation);
1048			if (unlikely(cork->opt == NULL))
1049				return -ENOBUFS;
1050		}
1051		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1052		cork->flags |= IPCORK_OPT;
1053		cork->addr = ipc->addr;
1054	}
1055	rt = *rtp;
1056	if (unlikely(!rt))
1057		return -EFAULT;
1058	/*
1059	 * We steal reference to this route, caller should not release it
1060	 */
1061	*rtp = NULL;
1062	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1063			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1064	cork->dst = &rt->dst;
1065	cork->length = 0;
1066	cork->tx_flags = ipc->tx_flags;
1067	cork->page = NULL;
1068	cork->off = 0;
1069
1070	return 0;
1071}
1072
1073/*
1074 *	ip_append_data() and ip_append_page() can make one large IP datagram
1075 *	from many pieces of data. Each pieces will be holded on the socket
1076 *	until ip_push_pending_frames() is called. Each piece can be a page
1077 *	or non-page data.
1078 *
1079 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
1080 *	this interface potentially.
1081 *
1082 *	LATER: length must be adjusted by pad at tail, when it is required.
1083 */
1084int ip_append_data(struct sock *sk,
1085		   int getfrag(void *from, char *to, int offset, int len,
1086			       int odd, struct sk_buff *skb),
1087		   void *from, int length, int transhdrlen,
1088		   struct ipcm_cookie *ipc, struct rtable **rtp,
1089		   unsigned int flags)
1090{
1091	struct inet_sock *inet = inet_sk(sk);
1092	int err;
1093
1094	if (flags&MSG_PROBE)
1095		return 0;
1096
1097	if (skb_queue_empty(&sk->sk_write_queue)) {
1098		err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1099		if (err)
1100			return err;
1101	} else {
1102		transhdrlen = 0;
1103	}
1104
1105	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1106				from, length, transhdrlen, flags);
1107}
1108
/*
 *	Append @size bytes found at @offset in @page to the datagram that is
 *	pending on sk->sk_write_queue.
 *
 *	The data is attached to the queued skbs as page fragments: either the
 *	last fragment is grown (when skb_can_coalesce() allows it) or a page
 *	reference is taken with get_page() and a new fragment descriptor is
 *	filled in. When the current skb has no room left, a new skb holding
 *	only the header space is allocated and queued.
 *
 *	Returns 0 on success and for MSG_PROBE, otherwise a negative errno:
 *	  -EPERM	inet->hdrincl is set
 *	  -EINVAL	the write queue is empty
 *	  -EOPNOTSUPP	the output device lacks NETIF_F_SG
 *	  -EMSGSIZE	the datagram would grow past the 0xFFFF limit, or
 *			the skb already has MAX_SKB_FRAGS fragments
 *	  -ENOBUFS	sock_wmalloc() failed
 */
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* Pages can only be added to a datagram that was already started. */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	/* Route and options come from the socket's cork. */
	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	/*
	 * fragheaderlen: IP header plus options.
	 * maxfraglen: header plus the payload that fits in mtu, with the
	 * payload part rounded down to a multiple of 8 bytes.
	 */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* Refuse to grow the datagram past 0xFFFF bytes including header. */
	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	/*
	 * Account for the whole request up front; the error path below
	 * subtracts whatever part of it was not appended.
	 */
	inet->cork.length += size;
	/*
	 * Data no longer fits into one mtu on a UDP socket whose device
	 * has NETIF_F_UFO: mark the tail skb as a UDP GSO packet.
	 */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		/* A GSO skb takes all remaining data in one go. */
		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			/*
			 * No room left in the current skb: allocate the next
			 * one and retry the loop with it as the new tail.
			 */
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			/* Bytes the previous skb holds beyond maxfraglen. */
			fraggap = skb_prev->len - maxfraglen;

			/* NOTE(review): the extra 15 bytes look like slack/alignment room - confirm. */
			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the overhanging bytes, together with
				 * their checksum contribution, from the
				 * previous skb into the new one and trim the
				 * previous skb back to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			/* Extend the last fragment instead of adding one. */
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			/* New fragment: it holds its own page reference. */
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		/* Keep the software checksum current when it is in use. */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* Account for the appended bytes in the skb and the socket. */
		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	/* size now holds only the bytes that were not appended. */
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1251
1252static void ip_cork_release(struct inet_cork *cork)
1253{
1254	cork->flags &= ~IPCORK_OPT;
1255	kfree(cork->opt);
1256	cork->opt = NULL;
1257	dst_release(cork->dst);
1258	cork->dst = NULL;
1259}
1260
/*
 *	Combine all pending IP fragments on @queue into one IP datagram.
 *
 *	The first skb on the queue becomes the head; every following skb is
 *	chained onto the head's frag_list. The IP header is then written into
 *	the head skb using the route and options kept in @cork, the route is
 *	handed over to the skb and the cork is released.
 *
 *	Returns the head skb, or NULL when @queue is empty (in which case
 *	@cork is left untouched).
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/*
	 * Chain the remaining skbs behind the head. Each one is pulled by
	 * the head's network header length, its sizes are added to the
	 * head's accounting, and its socket ownership is cleared.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Multicast routes use the socket's multicast TTL. */
	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	/* Write the IP header at the start of the head skb's data. */
	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		/* optlen is in bytes, ihl counts 32-bit words. */
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount. cork->dst is cleared first, so the
	 * ip_cork_release() call below sees NULL there.
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
1352
1353int ip_send_skb(struct sk_buff *skb)
1354{
1355	struct net *net = sock_net(skb->sk);
1356	int err;
1357
1358	err = ip_local_out(skb);
1359	if (err) {
1360		if (err > 0)
1361			err = net_xmit_errno(err);
1362		if (err)
1363			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1364	}
1365
1366	return err;
1367}
1368
/*
 *	Build the datagram pending on the socket with ip_finish_skb() and
 *	send it. Returns 0 when ip_finish_skb() yields no skb, otherwise the
 *	result of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip_finish_skb(sk);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return pending ? ip_send_skb(pending) : 0;
}
1380
1381/*
1382 *	Throw away all pending data on the socket.
1383 */
1384static void __ip_flush_pending_frames(struct sock *sk,
1385				      struct sk_buff_head *queue,
1386				      struct inet_cork *cork)
1387{
1388	struct sk_buff *skb;
1389
1390	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1391		kfree_skb(skb);
1392
1393	ip_cork_release(cork);
1394}
1395
1396void ip_flush_pending_frames(struct sock *sk)
1397{
1398	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1399}
1400
1401struct sk_buff *ip_make_skb(struct sock *sk,
1402			    int getfrag(void *from, char *to, int offset,
1403					int len, int odd, struct sk_buff *skb),
1404			    void *from, int length, int transhdrlen,
1405			    struct ipcm_cookie *ipc, struct rtable **rtp,
1406			    unsigned int flags)
1407{
1408	struct inet_cork cork = {};
1409	struct sk_buff_head queue;
1410	int err;
1411
1412	if (flags & MSG_PROBE)
1413		return NULL;
1414
1415	__skb_queue_head_init(&queue);
1416
1417	err = ip_setup_cork(sk, &cork, ipc, rtp);
1418	if (err)
1419		return ERR_PTR(err);
1420
1421	err = __ip_append_data(sk, &queue, &cork, getfrag,
1422			       from, length, transhdrlen, flags);
1423	if (err) {
1424		__ip_flush_pending_frames(sk, &queue, &cork);
1425		return ERR_PTR(err);
1426	}
1427
1428	return __ip_make_skb(sk, &queue, &cork);
1429}
1430
1431/*
1432 *	Fetch data from kernel space and fill in checksum if needed.
1433 */
1434static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1435			      int len, int odd, struct sk_buff *skb)
1436{
1437	__wsum csum;
1438
1439	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1440	skb->csum = csum_block_add(skb->csum, csum, odd);
1441	return 0;
1442}
1443
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	@sk:  socket used to build and send the reply; its tos, priority,
 *	      protocol and bound device fields are overwritten here
 *	@skb: the packet being replied to
 *	@arg: reply description: payload (arg->iov), bound device and
 *	      checksum information
 *	@len: number of payload bytes to send from arg->iov->iov_base
 *
 *	Nothing is returned; the function silently gives up when the IP
 *	options cannot be echoed or when no route for the reply is found.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	/* By default reply to rt_src of the route attached to @skb. */
	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* With a source route option, route towards faddr instead. */
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	{
		struct flowi4 fl4;

		/* Look up the output route for the reply. */
		flowi4_init_output(&fl4, arg->bound_dev_if, 0,
				   RT_TOS(ip_hdr(skb)->tos),
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   ip_reply_arg_flowi_flags(arg),
				   daddr, rt->rt_spec_dst,
				   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
		rt = ip_route_output_key(sock_net(sk), &fl4);
		if (IS_ERR(rt))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	/* Pass the reply parameters through the socket; they are not restored here. */
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	/* NOTE(review): the return value of ip_append_data() is not checked. */
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	/* From here on skb refers to the queued reply, not the packet replied to. */
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		/*
		 * Store the folded checksum at 16-bit word arg->csumoffset
		 * of the transport header, if the caller asked for it.
		 */
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	/*
	 * ip_setup_cork() clears the caller's route pointer when it takes
	 * over the route, so rt is only still set here if that did not
	 * happen.
	 */
	ip_rt_put(rt);
}
1515
/*
 *	Initialisation entry point of the IP layer (marked __init). Calls
 *	ip_rt_init() and inet_initpeers(), in that order, and additionally
 *	igmp_mc_proc_init() when both CONFIG_IP_MULTICAST and CONFIG_PROC_FS
 *	are enabled.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}
1525