ip_output.c revision 2721745501a26d0dc3b88c0d2f3aa11471891388
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case a packet is not accepted
27 *					by the output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90	iph->check = 0;
91	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93EXPORT_SYMBOL(ip_send_check);
94
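/*
 * __ip_local_out() finalizes the IP header (total length and checksum)
 * and runs the packet through the NF_INET_LOCAL_OUT netfilter hook;
 * ip_local_out() additionally hands it to dst_output() when the hook
 * verdict (1) lets the packet continue.
 */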
95int __ip_local_out(struct sk_buff *skb)
96{
97	struct iphdr *iph = ip_hdr(skb);
98
99	iph->tot_len = htons(skb->len);
100	ip_send_check(iph);
101	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102		       skb_dst(skb)->dev, dst_output);
103}
104
105int ip_local_out(struct sk_buff *skb)
106{
107	int err;
108
109	err = __ip_local_out(skb);
110	if (likely(err == 1))
111		err = dst_output(skb);
112
113	return err;
114}
115EXPORT_SYMBOL_GPL(ip_local_out);
116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120	skb_reset_mac_header(newskb);
121	__skb_pull(newskb, skb_network_offset(newskb));
122	newskb->pkt_type = PACKET_LOOPBACK;
123	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124	WARN_ON(!skb_dst(newskb));
125	skb_dst_force(newskb);
126	netif_rx_ni(newskb);
127	return 0;
128}
129
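/*
 * Choose the TTL for an outgoing packet: the per-socket unicast TTL if
 * one was set (IP_TTL), otherwise the route's default hop limit.
 */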
130static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131{
132	int ttl = inet->uc_ttl;
133
134	if (ttl < 0)
135		ttl = ip4_dst_hoplimit(dst);
136	return ttl;
137}
138
139/*
140 *		Add an IP header to an skbuff and send it out.
141 *
142 */
143int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
144			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145{
146	struct inet_sock *inet = inet_sk(sk);
147	struct rtable *rt = skb_rtable(skb);
148	struct iphdr *iph;
149
150	/* Build the IP header. */
151	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
152	skb_reset_network_header(skb);
153	iph = ip_hdr(skb);
154	iph->version  = 4;
155	iph->ihl      = 5;
156	iph->tos      = inet->tos;
157	if (ip_dont_fragment(sk, &rt->dst))
158		iph->frag_off = htons(IP_DF);
159	else
160		iph->frag_off = 0;
161	iph->ttl      = ip_select_ttl(inet, &rt->dst);
162	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163	iph->saddr    = saddr;
164	iph->protocol = sk->sk_protocol;
165	ip_select_ident(iph, &rt->dst, sk);
166
167	if (opt && opt->opt.optlen) {
168		iph->ihl += opt->opt.optlen>>2;
169		ip_options_build(skb, &opt->opt, daddr, rt, 0);
170	}
171
172	skb->priority = sk->sk_priority;
173	skb->mark = sk->sk_mark;
174
175	/* Send it out. */
176	return ip_local_out(skb);
177}
178EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
179
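/*
 * Final step of the output path: account multicast/broadcast traffic,
 * make sure there is enough headroom for the link-layer header, and
 * hand the packet to the neighbour layer, which prepends the hardware
 * header and queues the frame on the device.
 */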
180static inline int ip_finish_output2(struct sk_buff *skb)
181{
182	struct dst_entry *dst = skb_dst(skb);
183	struct rtable *rt = (struct rtable *)dst;
184	struct net_device *dev = dst->dev;
185	unsigned int hh_len = LL_RESERVED_SPACE(dev);
186	struct neighbour *neigh;
187
188	if (rt->rt_type == RTN_MULTICAST) {
189		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
190	} else if (rt->rt_type == RTN_BROADCAST)
191		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
192
193	/* Be paranoid, rather than too clever. */
194	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
195		struct sk_buff *skb2;
196
197		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
198		if (skb2 == NULL) {
199			kfree_skb(skb);
200			return -ENOMEM;
201		}
202		if (skb->sk)
203			skb_set_owner_w(skb2, skb->sk);
204		kfree_skb(skb);
205		skb = skb2;
206	}
207
208	rcu_read_lock();
209	neigh = dst_get_neighbour_noref(dst);
210	if (neigh) {
211		int res = neigh_output(neigh, skb);
212
213		rcu_read_unlock();
214		return res;
215	}
216	rcu_read_unlock();
217
218	if (net_ratelimit())
219		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
220	kfree_skb(skb);
221	return -EINVAL;
222}
223
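/*
 * MTU to check the packet against: the interface MTU when the sending
 * socket does its own path MTU probing (IP_PMTUDISC_PROBE), otherwise
 * the path MTU recorded in the route.
 */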
224static inline int ip_skb_dst_mtu(struct sk_buff *skb)
225{
226	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
227
228	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
229	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
230}
231
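/*
 * Runs as the okfn of the POST_ROUTING hook: restart output via
 * dst_output() when policy lookup after SNAT attached an xfrm
 * transform, fragment the packet if it exceeds the path MTU (GSO
 * packets are segmented later), and finally pass it to
 * ip_finish_output2().
 */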
232static int ip_finish_output(struct sk_buff *skb)
233{
234#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
235	/* Policy lookup after SNAT yielded a new policy */
236	if (skb_dst(skb)->xfrm != NULL) {
237		IPCB(skb)->flags |= IPSKB_REROUTED;
238		return dst_output(skb);
239	}
240#endif
241	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
242		return ip_fragment(skb, ip_finish_output2);
243	else
244		return ip_finish_output2(skb);
245}
246
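/*
 * Output method for multicast (and broadcast) routes: loop a copy of
 * the packet back to local listeners when required, drop multicasts
 * with TTL 0 instead of putting them on the wire, then send the
 * original through the POST_ROUTING hook like any other packet.
 */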
247int ip_mc_output(struct sk_buff *skb)
248{
249	struct sock *sk = skb->sk;
250	struct rtable *rt = skb_rtable(skb);
251	struct net_device *dev = rt->dst.dev;
252
253	/*
254	 *	If the indicated interface is up and running, send the packet.
255	 */
256	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
257
258	skb->dev = dev;
259	skb->protocol = htons(ETH_P_IP);
260
261	/*
262	 *	Multicasts are looped back for other local users
263	 */
264
265	if (rt->rt_flags&RTCF_MULTICAST) {
266		if (sk_mc_loop(sk)
267#ifdef CONFIG_IP_MROUTE
268		/* Small optimization: do not loop back non-local frames
269		   that came back after forwarding; they will be dropped
270		   by ip_mr_input in any case.
271		   Note that local frames are looped back to be delivered
272		   to local recipients.
273
274		   This check is duplicated in ip_mr_input at the moment.
275		 */
276		    &&
277		    ((rt->rt_flags & RTCF_LOCAL) ||
278		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
279#endif
280		   ) {
281			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
282			if (newskb)
283				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
284					newskb, NULL, newskb->dev,
285					ip_dev_loopback_xmit);
286		}
287
288		/* Multicasts with ttl 0 must not go beyond the host */
289
290		if (ip_hdr(skb)->ttl == 0) {
291			kfree_skb(skb);
292			return 0;
293		}
294	}
295
296	if (rt->rt_flags&RTCF_BROADCAST) {
297		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
298		if (newskb)
299			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
300				NULL, newskb->dev, ip_dev_loopback_xmit);
301	}
302
303	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
304			    skb->dev, ip_finish_output,
305			    !(IPCB(skb)->flags & IPSKB_REROUTED));
306}
307
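/*
 * Standard output method, normally installed as the route's
 * dst->output for unicast destinations: update the output statistics
 * and run the packet through the POST_ROUTING hook on its way to
 * ip_finish_output().
 */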
308int ip_output(struct sk_buff *skb)
309{
310	struct net_device *dev = skb_dst(skb)->dev;
311
312	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
313
314	skb->dev = dev;
315	skb->protocol = htons(ETH_P_IP);
316
317	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
318			    ip_finish_output,
319			    !(IPCB(skb)->flags & IPSKB_REROUTED));
320}
321
322/*
323 * copy saddr and daddr, possibly using 64bit load/stores
324 * Equivalent to :
325 *   iph->saddr = fl4->saddr;
326 *   iph->daddr = fl4->daddr;
327 */
328static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
329{
330	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
331		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
332	memcpy(&iph->saddr, &fl4->saddr,
333	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
334}
335
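/*
 * Queue a packet for transmission on a connected socket (used e.g. by
 * TCP): reuse the route cached on the socket or look one up, build the
 * IP header in front of the transport header and hand the result to
 * ip_local_out().
 */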
336int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
337{
338	struct sock *sk = skb->sk;
339	struct inet_sock *inet = inet_sk(sk);
340	struct ip_options_rcu *inet_opt;
341	struct flowi4 *fl4;
342	struct rtable *rt;
343	struct iphdr *iph;
344	int res;
345
346	/* Skip all of this if the packet is already routed,
347	 * e.g. by something like SCTP.
348	 */
349	rcu_read_lock();
350	inet_opt = rcu_dereference(inet->inet_opt);
351	fl4 = &fl->u.ip4;
352	rt = skb_rtable(skb);
353	if (rt != NULL)
354		goto packet_routed;
355
356	/* Make sure we can route this packet. */
357	rt = (struct rtable *)__sk_dst_check(sk, 0);
358	if (rt == NULL) {
359		__be32 daddr;
360
361		/* Use correct destination address if we have options. */
362		daddr = inet->inet_daddr;
363		if (inet_opt && inet_opt->opt.srr)
364			daddr = inet_opt->opt.faddr;
365
366		/* If this fails, the retransmit mechanism of the transport
367		 * layer will keep trying until a route appears or the
368		 * connection times itself out.
369		 */
370		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
371					   daddr, inet->inet_saddr,
372					   inet->inet_dport,
373					   inet->inet_sport,
374					   sk->sk_protocol,
375					   RT_CONN_FLAGS(sk),
376					   sk->sk_bound_dev_if);
377		if (IS_ERR(rt))
378			goto no_route;
379		sk_setup_caps(sk, &rt->dst);
380	}
381	skb_dst_set_noref(skb, &rt->dst);
382
383packet_routed:
384	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
385		goto no_route;
386
387	/* OK, we know where to send it, allocate and build IP header. */
388	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
389	skb_reset_network_header(skb);
390	iph = ip_hdr(skb);
391	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
392	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
393		iph->frag_off = htons(IP_DF);
394	else
395		iph->frag_off = 0;
396	iph->ttl      = ip_select_ttl(inet, &rt->dst);
397	iph->protocol = sk->sk_protocol;
398	ip_copy_addrs(iph, fl4);
399
400	/* Transport layer set skb->h.foo itself. */
401
402	if (inet_opt && inet_opt->opt.optlen) {
403		iph->ihl += inet_opt->opt.optlen >> 2;
404		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
405	}
406
407	ip_select_ident_more(iph, &rt->dst, sk,
408			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
409
410	skb->priority = sk->sk_priority;
411	skb->mark = sk->sk_mark;
412
413	res = ip_local_out(skb);
414	rcu_read_unlock();
415	return res;
416
417no_route:
418	rcu_read_unlock();
419	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
420	kfree_skb(skb);
421	return -EHOSTUNREACH;
422}
423EXPORT_SYMBOL(ip_queue_xmit);
424
425
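/*
 * Propagate the per-packet metadata (route, priority, marks, netfilter
 * and scheduling state) from the original skb to a fragment.
 */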
426static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
427{
428	to->pkt_type = from->pkt_type;
429	to->priority = from->priority;
430	to->protocol = from->protocol;
431	skb_dst_drop(to);
432	skb_dst_copy(to, from);
433	to->dev = from->dev;
434	to->mark = from->mark;
435
436	/* Copy the flags to each fragment. */
437	IPCB(to)->flags = IPCB(from)->flags;
438
439#ifdef CONFIG_NET_SCHED
440	to->tc_index = from->tc_index;
441#endif
442	nf_copy(to, from);
443#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
444    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
445	to->nf_trace = from->nf_trace;
446#endif
447#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
448	to->ipvs_property = from->ipvs_property;
449#endif
450	skb_copy_secmark(to, from);
451}
452
453/*
454 *	This IP datagram is too large to be sent in one piece.  Break it up into
455 *	smaller pieces (each consisting of an IP header plus a block of the
456 *	data of the original IP datagram) that will still fit in a single
457 *	device frame, and queue such a frame for sending.
458 */
459
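/*
 * Two strategies are used below: if the skb already carries a suitably
 * sized frag_list, each element of the list is turned into a fragment
 * in place (fast path); otherwise fresh skbs are allocated and the
 * payload is copied out piece by piece (slow path).
 */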
460int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
461{
462	struct iphdr *iph;
463	int ptr;
464	struct net_device *dev;
465	struct sk_buff *skb2;
466	unsigned int mtu, hlen, left, len, ll_rs;
467	int offset;
468	__be16 not_last_frag;
469	struct rtable *rt = skb_rtable(skb);
470	int err = 0;
471
472	dev = rt->dst.dev;
473
474	/*
475	 *	Point into the IP datagram header.
476	 */
477
478	iph = ip_hdr(skb);
479
480	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
481		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
482		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
483			  htonl(ip_skb_dst_mtu(skb)));
484		kfree_skb(skb);
485		return -EMSGSIZE;
486	}
487
488	/*
489	 *	Setup starting values.
490	 */
491
492	hlen = iph->ihl * 4;
493	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
494#ifdef CONFIG_BRIDGE_NETFILTER
495	if (skb->nf_bridge)
496		mtu -= nf_bridge_mtu_reduction(skb);
497#endif
498	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
499
500	/* When a frag_list is given, use it. First, check its validity:
501	 * some transformers could create a wrong frag_list or break an
502	 * existing one; that is not prohibited. In this case fall back to copying.
503	 *
504	 * LATER: this step can be merged into the real generation of fragments;
505	 * we can switch to copying when we see the first bad fragment.
506	 */
507	if (skb_has_frag_list(skb)) {
508		struct sk_buff *frag, *frag2;
509		int first_len = skb_pagelen(skb);
510
511		if (first_len - hlen > mtu ||
512		    ((first_len - hlen) & 7) ||
513		    ip_is_fragment(iph) ||
514		    skb_cloned(skb))
515			goto slow_path;
516
517		skb_walk_frags(skb, frag) {
518			/* Correct geometry. */
519			if (frag->len > mtu ||
520			    ((frag->len & 7) && frag->next) ||
521			    skb_headroom(frag) < hlen)
522				goto slow_path_clean;
523
524			/* Partially cloned skb? */
525			if (skb_shared(frag))
526				goto slow_path_clean;
527
528			BUG_ON(frag->sk);
529			if (skb->sk) {
530				frag->sk = skb->sk;
531				frag->destructor = sock_wfree;
532			}
533			skb->truesize -= frag->truesize;
534		}
535
536		/* Everything is OK. Generate! */
537
538		err = 0;
539		offset = 0;
540		frag = skb_shinfo(skb)->frag_list;
541		skb_frag_list_init(skb);
542		skb->data_len = first_len - skb_headlen(skb);
543		skb->len = first_len;
544		iph->tot_len = htons(first_len);
545		iph->frag_off = htons(IP_MF);
546		ip_send_check(iph);
547
548		for (;;) {
549			/* Prepare header of the next frame,
550			 * before previous one went down. */
551			if (frag) {
552				frag->ip_summed = CHECKSUM_NONE;
553				skb_reset_transport_header(frag);
554				__skb_push(frag, hlen);
555				skb_reset_network_header(frag);
556				memcpy(skb_network_header(frag), iph, hlen);
557				iph = ip_hdr(frag);
558				iph->tot_len = htons(frag->len);
559				ip_copy_metadata(frag, skb);
560				if (offset == 0)
561					ip_options_fragment(frag);
562				offset += skb->len - hlen;
563				iph->frag_off = htons(offset>>3);
564				if (frag->next != NULL)
565					iph->frag_off |= htons(IP_MF);
566				/* Ready, complete checksum */
567				ip_send_check(iph);
568			}
569
570			err = output(skb);
571
572			if (!err)
573				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
574			if (err || !frag)
575				break;
576
577			skb = frag;
578			frag = skb->next;
579			skb->next = NULL;
580		}
581
582		if (err == 0) {
583			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
584			return 0;
585		}
586
587		while (frag) {
588			skb = frag->next;
589			kfree_skb(frag);
590			frag = skb;
591		}
592		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
593		return err;
594
595slow_path_clean:
596		skb_walk_frags(skb, frag2) {
597			if (frag2 == frag)
598				break;
599			frag2->sk = NULL;
600			frag2->destructor = NULL;
601			skb->truesize += frag2->truesize;
602		}
603	}
604
605slow_path:
606	left = skb->len - hlen;		/* Space per frame */
607	ptr = hlen;		/* Where to start from */
608
609	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
610	 * we need to make room for the encapsulating header
611	 */
612	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
613
614	/*
615	 *	Fragment the datagram.
616	 */
617
618	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
619	not_last_frag = iph->frag_off & htons(IP_MF);
620
621	/*
622	 *	Keep copying data until we run out.
623	 */
624
625	while (left > 0) {
626		len = left;
627		/* IF: it doesn't fit, use 'mtu' - the data space left */
628		if (len > mtu)
629			len = mtu;
630		/* IF: we are not sending up to and including the packet end
631		   then align the next start on an eight byte boundary */
632		if (len < left)	{
633			len &= ~7;
634		}
635		/*
636		 *	Allocate buffer.
637		 */
638
639		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
640			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
641			err = -ENOMEM;
642			goto fail;
643		}
644
645		/*
646		 *	Set up data on packet
647		 */
648
649		ip_copy_metadata(skb2, skb);
650		skb_reserve(skb2, ll_rs);
651		skb_put(skb2, len + hlen);
652		skb_reset_network_header(skb2);
653		skb2->transport_header = skb2->network_header + hlen;
654
655		/*
656		 *	Charge the memory for the fragment to any owner
657		 *	it might possess
658		 */
659
660		if (skb->sk)
661			skb_set_owner_w(skb2, skb->sk);
662
663		/*
664		 *	Copy the packet header into the new buffer.
665		 */
666
667		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
668
669		/*
670		 *	Copy a block of the IP datagram.
671		 */
672		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
673			BUG();
674		left -= len;
675
676		/*
677		 *	Fill in the new header fields.
678		 */
679		iph = ip_hdr(skb2);
680		iph->frag_off = htons((offset >> 3));
681
682		/* ANK: dirty, but effective trick. Upgrade options only if
683		 * the segment to be fragmented was THE FIRST (otherwise,
684		 * options are already fixed) and make it ONCE
685		 * on the initial skb, so that all the following fragments
686		 * will inherit fixed options.
687		 */
688		if (offset == 0)
689			ip_options_fragment(skb);
690
691		/*
692		 *	Added AC : If we are fragmenting a fragment that's not the
693		 *		   last fragment then keep the MF bit set on each fragment
694		 */
695		if (left > 0 || not_last_frag)
696			iph->frag_off |= htons(IP_MF);
697		ptr += len;
698		offset += len;
699
700		/*
701		 *	Put this fragment into the sending queue.
702		 */
703		iph->tot_len = htons(len + hlen);
704
705		ip_send_check(iph);
706
707		err = output(skb2);
708		if (err)
709			goto fail;
710
711		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
712	}
713	kfree_skb(skb);
714	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
715	return err;
716
717fail:
718	kfree_skb(skb);
719	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
720	return err;
721}
722EXPORT_SYMBOL(ip_fragment);
723
724int
725ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
726{
727	struct iovec *iov = from;
728
729	if (skb->ip_summed == CHECKSUM_PARTIAL) {
730		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
731			return -EFAULT;
732	} else {
733		__wsum csum = 0;
734		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
735			return -EFAULT;
736		skb->csum = csum_block_add(skb->csum, csum, odd);
737	}
738	return 0;
739}
740EXPORT_SYMBOL(ip_generic_getfrag);
741
742static inline __wsum
743csum_page(struct page *page, int offset, int copy)
744{
745	char *kaddr;
746	__wsum csum;
747	kaddr = kmap(page);
748	csum = csum_partial(kaddr + offset, copy, 0);
749	kunmap(page);
750	return csum;
751}
752
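/*
 * UFO path: instead of building many small skbs, append all data to a
 * single large skb and let the device split it into fragments of
 * gso_size bytes each.
 */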
753static inline int ip_ufo_append_data(struct sock *sk,
754			struct sk_buff_head *queue,
755			int getfrag(void *from, char *to, int offset, int len,
756			       int odd, struct sk_buff *skb),
757			void *from, int length, int hh_len, int fragheaderlen,
758			int transhdrlen, int maxfraglen, unsigned int flags)
759{
760	struct sk_buff *skb;
761	int err;
762
763	/* There is support for UDP fragmentation offload by the network
764	 * device, so create one single skb containing the complete
765	 * UDP datagram.
766	 */
767	if ((skb = skb_peek_tail(queue)) == NULL) {
768		skb = sock_alloc_send_skb(sk,
769			hh_len + fragheaderlen + transhdrlen + 20,
770			(flags & MSG_DONTWAIT), &err);
771
772		if (skb == NULL)
773			return err;
774
775		/* reserve space for Hardware header */
776		skb_reserve(skb, hh_len);
777
778		/* create space for UDP/IP header */
779		skb_put(skb, fragheaderlen + transhdrlen);
780
781		/* initialize network header pointer */
782		skb_reset_network_header(skb);
783
784		/* initialize protocol header pointer */
785		skb->transport_header = skb->network_header + fragheaderlen;
786
787		skb->ip_summed = CHECKSUM_PARTIAL;
788		skb->csum = 0;
789
790		/* specify the length of each IP datagram fragment */
791		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
792		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
793		__skb_queue_tail(queue, skb);
794	}
795
796	return skb_append_datato_frags(sk, skb, getfrag, from,
797				       (length - transhdrlen));
798}
799
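/*
 * Workhorse behind ip_append_data() and ip_make_skb(): append user (or
 * kernel) data to the given queue, splitting it into MTU-sized skbs -
 * or page fragments when the device can do scatter/gather - so that
 * __ip_make_skb() can later turn the queue into IP fragments.
 */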
800static int __ip_append_data(struct sock *sk,
801			    struct flowi4 *fl4,
802			    struct sk_buff_head *queue,
803			    struct inet_cork *cork,
804			    int getfrag(void *from, char *to, int offset,
805					int len, int odd, struct sk_buff *skb),
806			    void *from, int length, int transhdrlen,
807			    unsigned int flags)
808{
809	struct inet_sock *inet = inet_sk(sk);
810	struct sk_buff *skb;
811
812	struct ip_options *opt = cork->opt;
813	int hh_len;
814	int exthdrlen;
815	int mtu;
816	int copy;
817	int err;
818	int offset = 0;
819	unsigned int maxfraglen, fragheaderlen;
820	int csummode = CHECKSUM_NONE;
821	struct rtable *rt = (struct rtable *)cork->dst;
822
823	skb = skb_peek_tail(queue);
824
825	exthdrlen = !skb ? rt->dst.header_len : 0;
826	mtu = cork->fragsize;
827
828	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
829
830	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
831	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
832
833	if (cork->length + length > 0xFFFF - fragheaderlen) {
834		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
835			       mtu-exthdrlen);
836		return -EMSGSIZE;
837	}
838
839	/*
840	 * transhdrlen > 0 means that this is the first fragment and we wish
841	 * it not to be fragmented later.
842	 */
843	if (transhdrlen &&
844	    length + fragheaderlen <= mtu &&
845	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
846	    !exthdrlen)
847		csummode = CHECKSUM_PARTIAL;
848
849	cork->length += length;
850	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
851	    (sk->sk_protocol == IPPROTO_UDP) &&
852	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
853		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
854					 hh_len, fragheaderlen, transhdrlen,
855					 maxfraglen, flags);
856		if (err)
857			goto error;
858		return 0;
859	}
860
861	/* So, what's going on in the loop below?
862	 *
863	 * We use the calculated fragment length to generate a chain of skbs;
864	 * each segment is an IP fragment ready for sending to the network once
865	 * an appropriate IP header has been added.
866	 */
867
868	if (!skb)
869		goto alloc_new_skb;
870
871	while (length > 0) {
872		/* Check if the remaining data fits into current packet. */
873		copy = mtu - skb->len;
874		if (copy < length)
875			copy = maxfraglen - skb->len;
876		if (copy <= 0) {
877			char *data;
878			unsigned int datalen;
879			unsigned int fraglen;
880			unsigned int fraggap;
881			unsigned int alloclen;
882			struct sk_buff *skb_prev;
883alloc_new_skb:
884			skb_prev = skb;
885			if (skb_prev)
886				fraggap = skb_prev->len - maxfraglen;
887			else
888				fraggap = 0;
889
890			/*
891			 * If remaining data exceeds the mtu,
892			 * we know we need more fragment(s).
893			 */
894			datalen = length + fraggap;
895			if (datalen > mtu - fragheaderlen)
896				datalen = maxfraglen - fragheaderlen;
897			fraglen = datalen + fragheaderlen;
898
899			if ((flags & MSG_MORE) &&
900			    !(rt->dst.dev->features&NETIF_F_SG))
901				alloclen = mtu;
902			else
903				alloclen = fraglen;
904
905			alloclen += exthdrlen;
906
907			/* The last fragment gets additional space at the tail.
908			 * Note that with MSG_MORE we overallocate on fragments,
909			 * because we have no idea which fragment will be
910			 * the last.
911			 */
912			if (datalen == length + fraggap)
913				alloclen += rt->dst.trailer_len;
914
915			if (transhdrlen) {
916				skb = sock_alloc_send_skb(sk,
917						alloclen + hh_len + 15,
918						(flags & MSG_DONTWAIT), &err);
919			} else {
920				skb = NULL;
921				if (atomic_read(&sk->sk_wmem_alloc) <=
922				    2 * sk->sk_sndbuf)
923					skb = sock_wmalloc(sk,
924							   alloclen + hh_len + 15, 1,
925							   sk->sk_allocation);
926				if (unlikely(skb == NULL))
927					err = -ENOBUFS;
928				else
929					/* only the initial fragment is
930					   time stamped */
931					cork->tx_flags = 0;
932			}
933			if (skb == NULL)
934				goto error;
935
936			/*
937			 *	Fill in the control structures
938			 */
939			skb->ip_summed = csummode;
940			skb->csum = 0;
941			skb_reserve(skb, hh_len);
942			skb_shinfo(skb)->tx_flags = cork->tx_flags;
943
944			/*
945			 *	Find where to start putting bytes.
946			 */
947			data = skb_put(skb, fraglen + exthdrlen);
948			skb_set_network_header(skb, exthdrlen);
949			skb->transport_header = (skb->network_header +
950						 fragheaderlen);
951			data += fragheaderlen + exthdrlen;
952
953			if (fraggap) {
954				skb->csum = skb_copy_and_csum_bits(
955					skb_prev, maxfraglen,
956					data + transhdrlen, fraggap, 0);
957				skb_prev->csum = csum_sub(skb_prev->csum,
958							  skb->csum);
959				data += fraggap;
960				pskb_trim_unique(skb_prev, maxfraglen);
961			}
962
963			copy = datalen - transhdrlen - fraggap;
964			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
965				err = -EFAULT;
966				kfree_skb(skb);
967				goto error;
968			}
969
970			offset += copy;
971			length -= datalen - fraggap;
972			transhdrlen = 0;
973			exthdrlen = 0;
974			csummode = CHECKSUM_NONE;
975
976			/*
977			 * Put the packet on the pending queue.
978			 */
979			__skb_queue_tail(queue, skb);
980			continue;
981		}
982
983		if (copy > length)
984			copy = length;
985
986		if (!(rt->dst.dev->features&NETIF_F_SG)) {
987			unsigned int off;
988
989			off = skb->len;
990			if (getfrag(from, skb_put(skb, copy),
991					offset, copy, off, skb) < 0) {
992				__skb_trim(skb, off);
993				err = -EFAULT;
994				goto error;
995			}
996		} else {
997			int i = skb_shinfo(skb)->nr_frags;
998			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
999			struct page *page = cork->page;
1000			int off = cork->off;
1001			unsigned int left;
1002
1003			if (page && (left = PAGE_SIZE - off) > 0) {
1004				if (copy >= left)
1005					copy = left;
1006				if (page != skb_frag_page(frag)) {
1007					if (i == MAX_SKB_FRAGS) {
1008						err = -EMSGSIZE;
1009						goto error;
1010					}
1011					skb_fill_page_desc(skb, i, page, off, 0);
1012					skb_frag_ref(skb, i);
1013					frag = &skb_shinfo(skb)->frags[i];
1014				}
1015			} else if (i < MAX_SKB_FRAGS) {
1016				if (copy > PAGE_SIZE)
1017					copy = PAGE_SIZE;
1018				page = alloc_pages(sk->sk_allocation, 0);
1019				if (page == NULL)  {
1020					err = -ENOMEM;
1021					goto error;
1022				}
1023				cork->page = page;
1024				cork->off = 0;
1025
1026				skb_fill_page_desc(skb, i, page, 0, 0);
1027				frag = &skb_shinfo(skb)->frags[i];
1028			} else {
1029				err = -EMSGSIZE;
1030				goto error;
1031			}
1032			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1033				    offset, copy, skb->len, skb) < 0) {
1034				err = -EFAULT;
1035				goto error;
1036			}
1037			cork->off += copy;
1038			skb_frag_size_add(frag, copy);
1039			skb->len += copy;
1040			skb->data_len += copy;
1041			skb->truesize += copy;
1042			atomic_add(copy, &sk->sk_wmem_alloc);
1043		}
1044		offset += copy;
1045		length -= copy;
1046	}
1047
1048	return 0;
1049
1050error:
1051	cork->length -= length;
1052	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1053	return err;
1054}
1055
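/*
 * Initialise the cork for a new datagram: remember the IP options, the
 * fragment size, the timestamping flags and the route (whose reference
 * is stolen from the caller).
 */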
1056static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1057			 struct ipcm_cookie *ipc, struct rtable **rtp)
1058{
1059	struct inet_sock *inet = inet_sk(sk);
1060	struct ip_options_rcu *opt;
1061	struct rtable *rt;
1062
1063	/*
1064	 * setup for corking.
1065	 */
1066	opt = ipc->opt;
1067	if (opt) {
1068		if (cork->opt == NULL) {
1069			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1070					    sk->sk_allocation);
1071			if (unlikely(cork->opt == NULL))
1072				return -ENOBUFS;
1073		}
1074		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1075		cork->flags |= IPCORK_OPT;
1076		cork->addr = ipc->addr;
1077	}
1078	rt = *rtp;
1079	if (unlikely(!rt))
1080		return -EFAULT;
1081	/*
1082	 * We steal reference to this route, caller should not release it
1083	 */
1084	*rtp = NULL;
1085	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1086			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1087	cork->dst = &rt->dst;
1088	cork->length = 0;
1089	cork->tx_flags = ipc->tx_flags;
1090	cork->page = NULL;
1091	cork->off = 0;
1092
1093	return 0;
1094}
1095
1096/*
1097 *	ip_append_data() and ip_append_page() can make one large IP datagram
1098 *	from many pieces of data. Each piece is held on the socket
1099 *	until ip_push_pending_frames() is called. Each piece can be a page
1100 *	or non-page data.
1101 *
1102 *	Not only UDP but other transport protocols - e.g. raw sockets - can
1103 *	potentially use this interface.
1104 *
1105 *	LATER: length must be adjusted by the pad at the tail, when it is required.
1106 */
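/*
 * A rough usage sketch (simplified, not taken from a particular caller):
 * a datagram protocol appends data under the socket lock and pushes the
 * pending queue once no more data is expected, e.g.
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, &fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, 0, &ipc, &rt, msg->msg_flags);
 *	if (!err && !(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk, &fl4);
 *	release_sock(sk);
 */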
1107int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1108		   int getfrag(void *from, char *to, int offset, int len,
1109			       int odd, struct sk_buff *skb),
1110		   void *from, int length, int transhdrlen,
1111		   struct ipcm_cookie *ipc, struct rtable **rtp,
1112		   unsigned int flags)
1113{
1114	struct inet_sock *inet = inet_sk(sk);
1115	int err;
1116
1117	if (flags&MSG_PROBE)
1118		return 0;
1119
1120	if (skb_queue_empty(&sk->sk_write_queue)) {
1121		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1122		if (err)
1123			return err;
1124	} else {
1125		transhdrlen = 0;
1126	}
1127
1128	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1129				from, length, transhdrlen, flags);
1130}
1131
1132ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1133		       int offset, size_t size, int flags)
1134{
1135	struct inet_sock *inet = inet_sk(sk);
1136	struct sk_buff *skb;
1137	struct rtable *rt;
1138	struct ip_options *opt = NULL;
1139	struct inet_cork *cork;
1140	int hh_len;
1141	int mtu;
1142	int len;
1143	int err;
1144	unsigned int maxfraglen, fragheaderlen, fraggap;
1145
1146	if (inet->hdrincl)
1147		return -EPERM;
1148
1149	if (flags&MSG_PROBE)
1150		return 0;
1151
1152	if (skb_queue_empty(&sk->sk_write_queue))
1153		return -EINVAL;
1154
1155	cork = &inet->cork.base;
1156	rt = (struct rtable *)cork->dst;
1157	if (cork->flags & IPCORK_OPT)
1158		opt = cork->opt;
1159
1160	if (!(rt->dst.dev->features&NETIF_F_SG))
1161		return -EOPNOTSUPP;
1162
1163	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1164	mtu = cork->fragsize;
1165
1166	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1167	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1168
1169	if (cork->length + size > 0xFFFF - fragheaderlen) {
1170		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1171		return -EMSGSIZE;
1172	}
1173
1174	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1175		return -EINVAL;
1176
1177	cork->length += size;
1178	if ((size + skb->len > mtu) &&
1179	    (sk->sk_protocol == IPPROTO_UDP) &&
1180	    (rt->dst.dev->features & NETIF_F_UFO)) {
1181		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1182		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1183	}
1184
1185
1186	while (size > 0) {
1187		int i;
1188
1189		if (skb_is_gso(skb))
1190			len = size;
1191		else {
1192
1193			/* Check if the remaining data fits into current packet. */
1194			len = mtu - skb->len;
1195			if (len < size)
1196				len = maxfraglen - skb->len;
1197		}
1198		if (len <= 0) {
1199			struct sk_buff *skb_prev;
1200			int alloclen;
1201
1202			skb_prev = skb;
1203			fraggap = skb_prev->len - maxfraglen;
1204
1205			alloclen = fragheaderlen + hh_len + fraggap + 15;
1206			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1207			if (unlikely(!skb)) {
1208				err = -ENOBUFS;
1209				goto error;
1210			}
1211
1212			/*
1213			 *	Fill in the control structures
1214			 */
1215			skb->ip_summed = CHECKSUM_NONE;
1216			skb->csum = 0;
1217			skb_reserve(skb, hh_len);
1218
1219			/*
1220			 *	Find where to start putting bytes.
1221			 */
1222			skb_put(skb, fragheaderlen + fraggap);
1223			skb_reset_network_header(skb);
1224			skb->transport_header = (skb->network_header +
1225						 fragheaderlen);
1226			if (fraggap) {
1227				skb->csum = skb_copy_and_csum_bits(skb_prev,
1228								   maxfraglen,
1229						    skb_transport_header(skb),
1230								   fraggap, 0);
1231				skb_prev->csum = csum_sub(skb_prev->csum,
1232							  skb->csum);
1233				pskb_trim_unique(skb_prev, maxfraglen);
1234			}
1235
1236			/*
1237			 * Put the packet on the pending queue.
1238			 */
1239			__skb_queue_tail(&sk->sk_write_queue, skb);
1240			continue;
1241		}
1242
1243		i = skb_shinfo(skb)->nr_frags;
1244		if (len > size)
1245			len = size;
1246		if (skb_can_coalesce(skb, i, page, offset)) {
1247			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1248		} else if (i < MAX_SKB_FRAGS) {
1249			get_page(page);
1250			skb_fill_page_desc(skb, i, page, offset, len);
1251		} else {
1252			err = -EMSGSIZE;
1253			goto error;
1254		}
1255
1256		if (skb->ip_summed == CHECKSUM_NONE) {
1257			__wsum csum;
1258			csum = csum_page(page, offset, len);
1259			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1260		}
1261
1262		skb->len += len;
1263		skb->data_len += len;
1264		skb->truesize += len;
1265		atomic_add(len, &sk->sk_wmem_alloc);
1266		offset += len;
1267		size -= len;
1268	}
1269	return 0;
1270
1271error:
1272	cork->length -= size;
1273	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1274	return err;
1275}
1276
1277static void ip_cork_release(struct inet_cork *cork)
1278{
1279	cork->flags &= ~IPCORK_OPT;
1280	kfree(cork->opt);
1281	cork->opt = NULL;
1282	dst_release(cork->dst);
1283	cork->dst = NULL;
1284}
1285
1286/*
1287 *	Combine all pending IP fragments on the socket into one IP datagram
1288 *	and push them out.
1289 */
1290struct sk_buff *__ip_make_skb(struct sock *sk,
1291			      struct flowi4 *fl4,
1292			      struct sk_buff_head *queue,
1293			      struct inet_cork *cork)
1294{
1295	struct sk_buff *skb, *tmp_skb;
1296	struct sk_buff **tail_skb;
1297	struct inet_sock *inet = inet_sk(sk);
1298	struct net *net = sock_net(sk);
1299	struct ip_options *opt = NULL;
1300	struct rtable *rt = (struct rtable *)cork->dst;
1301	struct iphdr *iph;
1302	__be16 df = 0;
1303	__u8 ttl;
1304
1305	if ((skb = __skb_dequeue(queue)) == NULL)
1306		goto out;
1307	tail_skb = &(skb_shinfo(skb)->frag_list);
1308
1309	/* move skb->data to ip header from ext header */
1310	if (skb->data < skb_network_header(skb))
1311		__skb_pull(skb, skb_network_offset(skb));
1312	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1313		__skb_pull(tmp_skb, skb_network_header_len(skb));
1314		*tail_skb = tmp_skb;
1315		tail_skb = &(tmp_skb->next);
1316		skb->len += tmp_skb->len;
1317		skb->data_len += tmp_skb->len;
1318		skb->truesize += tmp_skb->truesize;
1319		tmp_skb->destructor = NULL;
1320		tmp_skb->sk = NULL;
1321	}
1322
1323	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1324	 * the frame generated here to be fragmented. No matter how transforms
1325	 * change the size of the packet, it will come out.
1326	 */
1327	if (inet->pmtudisc < IP_PMTUDISC_DO)
1328		skb->local_df = 1;
1329
1330	/* The DF bit is set when we want to see DF on outgoing frames.
1331	 * If local_df is set too, we still allow this frame to be
1332	 * fragmented locally. */
1333	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1334	    (skb->len <= dst_mtu(&rt->dst) &&
1335	     ip_dont_fragment(sk, &rt->dst)))
1336		df = htons(IP_DF);
1337
1338	if (cork->flags & IPCORK_OPT)
1339		opt = cork->opt;
1340
1341	if (rt->rt_type == RTN_MULTICAST)
1342		ttl = inet->mc_ttl;
1343	else
1344		ttl = ip_select_ttl(inet, &rt->dst);
1345
1346	iph = (struct iphdr *)skb->data;
1347	iph->version = 4;
1348	iph->ihl = 5;
1349	iph->tos = inet->tos;
1350	iph->frag_off = df;
1351	ip_select_ident(iph, &rt->dst, sk);
1352	iph->ttl = ttl;
1353	iph->protocol = sk->sk_protocol;
1354	ip_copy_addrs(iph, fl4);
1355
1356	if (opt) {
1357		iph->ihl += opt->optlen>>2;
1358		ip_options_build(skb, opt, cork->addr, rt, 0);
1359	}
1360
1361	skb->priority = sk->sk_priority;
1362	skb->mark = sk->sk_mark;
1363	/*
1364	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1365	 * on dst refcount
1366	 */
1367	cork->dst = NULL;
1368	skb_dst_set(skb, &rt->dst);
1369
1370	if (iph->protocol == IPPROTO_ICMP)
1371		icmp_out_count(net, ((struct icmphdr *)
1372			skb_transport_header(skb))->type);
1373
1374	ip_cork_release(cork);
1375out:
1376	return skb;
1377}
1378
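/*
 * Hand a finished datagram to ip_local_out() and translate positive
 * NET_XMIT_* return codes into errno values (congestion, NET_XMIT_CN,
 * is not reported as an error).
 */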
1379int ip_send_skb(struct sk_buff *skb)
1380{
1381	struct net *net = sock_net(skb->sk);
1382	int err;
1383
1384	err = ip_local_out(skb);
1385	if (err) {
1386		if (err > 0)
1387			err = net_xmit_errno(err);
1388		if (err)
1389			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1390	}
1391
1392	return err;
1393}
1394
1395int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1396{
1397	struct sk_buff *skb;
1398
1399	skb = ip_finish_skb(sk, fl4);
1400	if (!skb)
1401		return 0;
1402
1403	/* Netfilter gets the whole, not yet fragmented skb. */
1404	return ip_send_skb(skb);
1405}
1406
1407/*
1408 *	Throw away all pending data on the socket.
1409 */
1410static void __ip_flush_pending_frames(struct sock *sk,
1411				      struct sk_buff_head *queue,
1412				      struct inet_cork *cork)
1413{
1414	struct sk_buff *skb;
1415
1416	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1417		kfree_skb(skb);
1418
1419	ip_cork_release(cork);
1420}
1421
1422void ip_flush_pending_frames(struct sock *sk)
1423{
1424	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1425}
1426
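/*
 * Corkless counterpart of ip_append_data() + ip_push_pending_frames():
 * the datagram is built on a private queue with a local cork, so
 * nothing is left pending on the socket. Returns the finished skb
 * (NULL for MSG_PROBE, or an ERR_PTR on error) for the caller to send.
 */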
1427struct sk_buff *ip_make_skb(struct sock *sk,
1428			    struct flowi4 *fl4,
1429			    int getfrag(void *from, char *to, int offset,
1430					int len, int odd, struct sk_buff *skb),
1431			    void *from, int length, int transhdrlen,
1432			    struct ipcm_cookie *ipc, struct rtable **rtp,
1433			    unsigned int flags)
1434{
1435	struct inet_cork cork;
1436	struct sk_buff_head queue;
1437	int err;
1438
1439	if (flags & MSG_PROBE)
1440		return NULL;
1441
1442	__skb_queue_head_init(&queue);
1443
1444	cork.flags = 0;
1445	cork.addr = 0;
1446	cork.opt = NULL;
1447	err = ip_setup_cork(sk, &cork, ipc, rtp);
1448	if (err)
1449		return ERR_PTR(err);
1450
1451	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1452			       from, length, transhdrlen, flags);
1453	if (err) {
1454		__ip_flush_pending_frames(sk, &queue, &cork);
1455		return ERR_PTR(err);
1456	}
1457
1458	return __ip_make_skb(sk, fl4, &queue, &cork);
1459}
1460
1461/*
1462 *	Fetch data from kernel space and fill in checksum if needed.
1463 */
1464static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1465			      int len, int odd, struct sk_buff *skb)
1466{
1467	__wsum csum;
1468
1469	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1470	skb->csum = csum_block_add(skb->csum, csum, odd);
1471	return 0;
1472}
1473
1474/*
1475 *	Generic function to send a packet in reply to another packet.
1476 *	Used to send TCP resets so far. ICMP should use this function too.
1477 *
1478 *	Should run single-threaded per socket because it uses the sock
1479 *	structure to pass arguments.
1480 */
1481void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1482		   const struct ip_reply_arg *arg, unsigned int len)
1483{
1484	struct inet_sock *inet = inet_sk(sk);
1485	struct ip_options_data replyopts;
1486	struct ipcm_cookie ipc;
1487	struct flowi4 fl4;
1488	struct rtable *rt = skb_rtable(skb);
1489
1490	if (ip_options_echo(&replyopts.opt.opt, skb))
1491		return;
1492
1493	ipc.addr = daddr;
1494	ipc.opt = NULL;
1495	ipc.tx_flags = 0;
1496
1497	if (replyopts.opt.opt.optlen) {
1498		ipc.opt = &replyopts.opt;
1499
1500		if (replyopts.opt.opt.srr)
1501			daddr = replyopts.opt.opt.faddr;
1502	}
1503
1504	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1505			   RT_TOS(arg->tos),
1506			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1507			   ip_reply_arg_flowi_flags(arg),
1508			   daddr, rt->rt_spec_dst,
1509			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1510	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1511	rt = ip_route_output_key(sock_net(sk), &fl4);
1512	if (IS_ERR(rt))
1513		return;
1514
1515	/* And let IP do all the hard work.
1516
1517	   This chunk is not reentrant, hence the spinlock.
1518	   Note that it relies on the fact that this function is called
1519	   with BHs disabled locally and that sk cannot already be locked.
1520	 */
1521	bh_lock_sock(sk);
1522	inet->tos = arg->tos;
1523	sk->sk_priority = skb->priority;
1524	sk->sk_protocol = ip_hdr(skb)->protocol;
1525	sk->sk_bound_dev_if = arg->bound_dev_if;
1526	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1527		       &ipc, &rt, MSG_DONTWAIT);
1528	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1529		if (arg->csumoffset >= 0)
1530			*((__sum16 *)skb_transport_header(skb) +
1531			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1532								arg->csum));
1533		skb->ip_summed = CHECKSUM_NONE;
1534		ip_push_pending_frames(sk, &fl4);
1535	}
1536
1537	bh_unlock_sock(sk);
1538
1539	ip_rt_put(rt);
1540}
1541
1542void __init ip_init(void)
1543{
1544	ip_rt_init();
1545	inet_initpeers();
1546
1547#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1548	igmp_mc_proc_init();
1549#endif
1550}
1551