ip_output.c revision f2c31e32b378a6653f8de606149d963baf11d7d3
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case a packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

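/*
 * Finalise the IP header (total length and checksum) and pass the packet
 * to the NF_INET_LOCAL_OUT netfilter hook, which normally continues into
 * dst_output().
 */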
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

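/*
 * Final step of transmission: make sure there is enough headroom for the
 * link-layer header, resolve the neighbour entry for the route and hand
 * the packet to the neighbour output function.
 */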
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

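/*
 * Called via the POST_ROUTING hook.  Re-route through dst_output() if a
 * new xfrm policy applies after NAT, fragment oversized non-GSO packets,
 * and otherwise hand the packet to ip_finish_output2().
 */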
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

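/*
 * Output path for multicast (and broadcast) packets: loop a copy back to
 * local listeners where required, then send the packet out through the
 * POST_ROUTING hook.
 */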
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

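/*
 * Standard output routine for locally generated unicast packets, entered
 * via dst_output().
 */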
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

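/*
 * Queue a packet for transmission on a connected socket (used by TCP):
 * route the packet if it is not routed already, build the IP header and
 * push it out through ip_local_out().
 */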
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* The transport layer has set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


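/* Copy the per-packet metadata of the original skb to a fragment. */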
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each piece being an IP header plus a block of
 *	the data of the original IP datagram) so that each piece fits in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes out. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

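/*
 * Generic getfrag callback for ip_append_data(): copy user data from an
 * iovec, computing the checksum on the fly unless the hardware will do
 * the checksumming (CHECKSUM_PARTIAL).
 */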
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so create
	 * one single skb packet containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

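/*
 * Append data to the pending queue, building MTU-sized buffers so that
 * each queued skb can later be turned into an IP fragment (or handed to
 * UFO-capable hardware as one large datagram).
 */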
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * that it won't be fragmented further.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each of its segments is an IP fragment ready for sending to the
	 * network after adding the appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

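/* Initialise the cork with the options, route and fragment size for this datagram. */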
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal a reference to this route; the caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, but other transport protocols - e.g. raw sockets -
 *	can potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow fragmenting the frame generated here. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on the dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

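/*
 * Send a datagram built by __ip_make_skb(), converting congestion
 * notifications from the queueing layer into an error code and counting
 * discards.
 */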
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, unfragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

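/*
 * Build a complete IP datagram in one call, without using the socket's
 * pending-frames queue: cork a private queue, append the data and turn
 * it into a single skb.
 */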
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far; ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BH disabled locally and that sk cannot already be locked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}
1536