ip_output.c revision 538de0e01f1ca3568ad03877ff297c646dd8ad23
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		The Internet Protocol (IP) output module.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Donald Becker, <becker@super.org>
11 *		Alan Cox, <Alan.Cox@linux.org>
12 *		Richard Underwood
13 *		Stefan Becker, <stefanb@yello.ping.de>
14 *		Jorge Cwik, <jorge@laser.satlink.net>
15 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 *		Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 *	See ip_input.c for original log
19 *
20 *	Fixes:
21 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23 *		Bradford Johnson:	Fix faulty handling of some frames when
24 *					no route is found.
25 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26 *					(in case a packet is not accepted by
27 *					the output firewall rules)
28 *		Mike McLagan	:	Routing by source
29 *		Alexey Kuznetsov:	use new route cache
30 *		Andi Kleen:		Fix broken PMTU recovery and remove
31 *					some redundant tests.
32 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
33 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35 *					for decreased register pressure on x86
36 *					and more readability.
37 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38 *					silently drop skb instead of failing with -EPERM.
39 *		Detlev Wengorz	:	Copy protocol for fragments.
40 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41 *					datagrams.
42 *		Hirokazu Takahashi:	sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90	iph->check = 0;
91	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93EXPORT_SYMBOL(ip_send_check);
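
/*
 * Usage sketch (illustrative; iph and skb stand for whatever header and
 * buffer the caller has built): tot_len is typically set last, then
 * ip_send_check() fills in the header checksum:
 *
 *	iph->tot_len = htons(skb->len);
 *	ip_send_check(iph);
 *
 * This is exactly what __ip_local_out() below does before handing the
 * packet to the LOCAL_OUT netfilter hook.
 */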
94
95int __ip_local_out(struct sk_buff *skb)
96{
97	struct iphdr *iph = ip_hdr(skb);
98
99	iph->tot_len = htons(skb->len);
100	ip_send_check(iph);
101	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102		       skb_dst(skb)->dev, dst_output);
103}
104
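/*
 * __ip_local_out() returns the netfilter verdict: a value of 1 means the
 * LOCAL_OUT hook accepted the packet without queueing or stealing it, in
 * which case ip_local_out() still has to invoke dst_output() itself.
 */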
105int ip_local_out(struct sk_buff *skb)
106{
107	int err;
108
109	err = __ip_local_out(skb);
110	if (likely(err == 1))
111		err = dst_output(skb);
112
113	return err;
114}
115EXPORT_SYMBOL_GPL(ip_local_out);
116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120	skb_reset_mac_header(newskb);
121	__skb_pull(newskb, skb_network_offset(newskb));
122	newskb->pkt_type = PACKET_LOOPBACK;
123	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124	WARN_ON(!skb_dst(newskb));
125	netif_rx_ni(newskb);
126	return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131	int ttl = inet->uc_ttl;
132
133	if (ttl < 0)
134		ttl = ip4_dst_hoplimit(dst);
135	return ttl;
136}
137
138/*
139 *		Add an IP header to an skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143			  __be32 saddr, __be32 daddr, struct ip_options *opt)
144{
145	struct inet_sock *inet = inet_sk(sk);
146	struct rtable *rt = skb_rtable(skb);
147	struct iphdr *iph;
148
149	/* Build the IP header. */
150	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
151	skb_reset_network_header(skb);
152	iph = ip_hdr(skb);
153	iph->version  = 4;
154	iph->ihl      = 5;
155	iph->tos      = inet->tos;
156	if (ip_dont_fragment(sk, &rt->dst))
157		iph->frag_off = htons(IP_DF);
158	else
159		iph->frag_off = 0;
160	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161	iph->daddr    = rt->rt_dst;
162	iph->saddr    = rt->rt_src;
163	iph->protocol = sk->sk_protocol;
164	ip_select_ident(iph, &rt->dst, sk);
165
166	if (opt && opt->optlen) {
167		iph->ihl += opt->optlen>>2;
168		ip_options_build(skb, opt, daddr, rt, 0);
169	}
170
171	skb->priority = sk->sk_priority;
172	skb->mark = sk->sk_mark;
173
174	/* Send it out. */
175	return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
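
/*
 * Usage sketch (illustrative; saddr, daddr and opt are placeholders for
 * whatever addresses and IP options the caller has prepared): a caller
 * whose skb already carries a route (skb_rtable(skb) != NULL) emits the
 * packet in one call:
 *
 *	err = ip_build_and_send_pkt(skb, sk, saddr, daddr, opt);
 *
 * The source and destination written into the header come from the
 * attached route (rt_src/rt_dst); the daddr argument is only used when
 * building source-route options, and saddr is currently unused.
 */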
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181	struct dst_entry *dst = skb_dst(skb);
182	struct rtable *rt = (struct rtable *)dst;
183	struct net_device *dev = dst->dev;
184	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185
186	if (rt->rt_type == RTN_MULTICAST) {
187		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188	} else if (rt->rt_type == RTN_BROADCAST)
189		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
190
191	/* Be paranoid, rather than too clever. */
192	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193		struct sk_buff *skb2;
194
195		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196		if (skb2 == NULL) {
197			kfree_skb(skb);
198			return -ENOMEM;
199		}
200		if (skb->sk)
201			skb_set_owner_w(skb2, skb->sk);
202		kfree_skb(skb);
203		skb = skb2;
204	}
205
206	if (dst->hh)
207		return neigh_hh_output(dst->hh, skb);
208	else if (dst->neighbour)
209		return dst->neighbour->output(skb);
210
211	if (net_ratelimit())
212		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213	kfree_skb(skb);
214	return -EINVAL;
215}
216
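/*
 * The MTU to check the packet against: with IP_PMTUDISC_PROBE the socket
 * wants to probe, so the interface MTU is used instead of the (possibly
 * smaller) cached route MTU.
 */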
217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218{
219	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
221	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223}
224
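/*
 * Last step before handing the packet to the device layer: if an xfrm
 * policy matched only after NAT rewrote the addresses, send the packet
 * back through dst_output() to be transformed; otherwise fragment it when
 * it exceeds the path MTU (and is not GSO) and pass it to
 * ip_finish_output2() for neighbour resolution and transmission.
 */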
225static int ip_finish_output(struct sk_buff *skb)
226{
227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228	/* Policy lookup after SNAT yielded a new policy */
229	if (skb_dst(skb)->xfrm != NULL) {
230		IPCB(skb)->flags |= IPSKB_REROUTED;
231		return dst_output(skb);
232	}
233#endif
234	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235		return ip_fragment(skb, ip_finish_output2);
236	else
237		return ip_finish_output2(skb);
238}
239
240int ip_mc_output(struct sk_buff *skb)
241{
242	struct sock *sk = skb->sk;
243	struct rtable *rt = skb_rtable(skb);
244	struct net_device *dev = rt->dst.dev;
245
246	/*
247	 *	If the indicated interface is up and running, send the packet.
248	 */
249	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250
251	skb->dev = dev;
252	skb->protocol = htons(ETH_P_IP);
253
254	/*
255	 *	Multicasts are looped back for other local users
256	 */
257
258	if (rt->rt_flags&RTCF_MULTICAST) {
259		if (sk_mc_loop(sk)
260#ifdef CONFIG_IP_MROUTE
261		/* Small optimization: do not loop back non-local frames
262		   that have returned after forwarding; they will be dropped
263		   by ip_mr_input in any case.
264		   Note that local frames are looped back so that they are
265		   delivered to local recipients.
266
267		   This check is duplicated in ip_mr_input at the moment.
268		 */
269		    &&
270		    ((rt->rt_flags & RTCF_LOCAL) ||
271		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
272#endif
273		   ) {
274			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275			if (newskb)
276				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277					newskb, NULL, newskb->dev,
278					ip_dev_loopback_xmit);
279		}
280
281		/* Multicasts with ttl 0 must not go beyond the host */
282
283		if (ip_hdr(skb)->ttl == 0) {
284			kfree_skb(skb);
285			return 0;
286		}
287	}
288
289	if (rt->rt_flags&RTCF_BROADCAST) {
290		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291		if (newskb)
292			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293				NULL, newskb->dev, ip_dev_loopback_xmit);
294	}
295
296	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297			    skb->dev, ip_finish_output,
298			    !(IPCB(skb)->flags & IPSKB_REROUTED));
299}
300
301int ip_output(struct sk_buff *skb)
302{
303	struct net_device *dev = skb_dst(skb)->dev;
304
305	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306
307	skb->dev = dev;
308	skb->protocol = htons(ETH_P_IP);
309
310	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311			    ip_finish_output,
312			    !(IPCB(skb)->flags & IPSKB_REROUTED));
313}
314
315int ip_queue_xmit(struct sk_buff *skb)
316{
317	struct sock *sk = skb->sk;
318	struct inet_sock *inet = inet_sk(sk);
319	struct ip_options *opt = inet->opt;
320	struct rtable *rt;
321	struct iphdr *iph;
322	int res;
323
324	/* Skip all of this if the packet is already routed,
325	 * e.g. by something like SCTP.
326	 */
327	rcu_read_lock();
328	rt = skb_rtable(skb);
329	if (rt != NULL)
330		goto packet_routed;
331
332	/* Make sure we can route this packet. */
333	rt = (struct rtable *)__sk_dst_check(sk, 0);
334	if (rt == NULL) {
335		__be32 daddr;
336
337		/* Use correct destination address if we have options. */
338		daddr = inet->inet_daddr;
339		if(opt && opt->srr)
340			daddr = opt->faddr;
341
342		/* If this fails, the retransmit mechanism of the transport
343		 * layer will keep trying until a route appears or the
344		 * connection times out.
345		 */
346		rt = ip_route_output_ports(sock_net(sk), sk,
347					   daddr, inet->inet_saddr,
348					   inet->inet_dport,
349					   inet->inet_sport,
350					   sk->sk_protocol,
351					   RT_CONN_FLAGS(sk),
352					   sk->sk_bound_dev_if);
353		if (IS_ERR(rt))
354			goto no_route;
355		sk_setup_caps(sk, &rt->dst);
356	}
357	skb_dst_set_noref(skb, &rt->dst);
358
359packet_routed:
360	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
361		goto no_route;
362
363	/* OK, we know where to send it, allocate and build IP header. */
364	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
365	skb_reset_network_header(skb);
366	iph = ip_hdr(skb);
367	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
368	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
369		iph->frag_off = htons(IP_DF);
370	else
371		iph->frag_off = 0;
372	iph->ttl      = ip_select_ttl(inet, &rt->dst);
373	iph->protocol = sk->sk_protocol;
374	iph->saddr    = rt->rt_src;
375	iph->daddr    = rt->rt_dst;
376	/* The transport layer has already set the transport header itself. */
377
378	if (opt && opt->optlen) {
379		iph->ihl += opt->optlen >> 2;
380		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
381	}
382
383	ip_select_ident_more(iph, &rt->dst, sk,
384			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
385
386	skb->priority = sk->sk_priority;
387	skb->mark = sk->sk_mark;
388
389	res = ip_local_out(skb);
390	rcu_read_unlock();
391	return res;
392
393no_route:
394	rcu_read_unlock();
395	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
396	kfree_skb(skb);
397	return -EHOSTUNREACH;
398}
399EXPORT_SYMBOL(ip_queue_xmit);
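
/*
 * Usage sketch (illustrative): a connection-oriented transport that has
 * charged the skb to its socket hands the segment to IP in one call; the
 * addresses and ports come from the socket itself:
 *
 *	skb_set_owner_w(skb, sk);
 *	err = ip_queue_xmit(skb);
 *
 * If the skb already carries a route (as SCTP attaches one), the lookup
 * above is skipped; otherwise the socket's cached route is validated or a
 * new one is obtained via ip_route_output_ports().
 */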
400
401
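/*
 * Copy per-packet metadata (packet type, priority, mark, dst, netfilter
 * and traffic-control state) from the original skb to a freshly
 * allocated fragment.
 */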
402static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
403{
404	to->pkt_type = from->pkt_type;
405	to->priority = from->priority;
406	to->protocol = from->protocol;
407	skb_dst_drop(to);
408	skb_dst_copy(to, from);
409	to->dev = from->dev;
410	to->mark = from->mark;
411
412	/* Copy the flags to each fragment. */
413	IPCB(to)->flags = IPCB(from)->flags;
414
415#ifdef CONFIG_NET_SCHED
416	to->tc_index = from->tc_index;
417#endif
418	nf_copy(to, from);
419#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
420    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
421	to->nf_trace = from->nf_trace;
422#endif
423#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
424	to->ipvs_property = from->ipvs_property;
425#endif
426	skb_copy_secmark(to, from);
427}
428
429/*
430 *	This IP datagram is too large to be sent in one piece.  Break it up into
431 *	smaller pieces (each consisting of an IP header plus a block of the
432 *	data of the original IP datagram) that will still fit in a single
433 *	device frame, and queue each such frame for sending.
434 */
435
436int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
437{
438	struct iphdr *iph;
439	int ptr;
440	struct net_device *dev;
441	struct sk_buff *skb2;
442	unsigned int mtu, hlen, left, len, ll_rs;
443	int offset;
444	__be16 not_last_frag;
445	struct rtable *rt = skb_rtable(skb);
446	int err = 0;
447
448	dev = rt->dst.dev;
449
450	/*
451	 *	Point into the IP datagram header.
452	 */
453
454	iph = ip_hdr(skb);
455
456	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
457		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
458		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
459			  htonl(ip_skb_dst_mtu(skb)));
460		kfree_skb(skb);
461		return -EMSGSIZE;
462	}
463
464	/*
465	 *	Setup starting values.
466	 */
467
468	hlen = iph->ihl * 4;
469	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
470#ifdef CONFIG_BRIDGE_NETFILTER
471	if (skb->nf_bridge)
472		mtu -= nf_bridge_mtu_reduction(skb);
473#endif
474	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
475
476	/* When a frag_list is given, use it. First, check its validity:
477	 * some transformers may create a bad frag_list or break an existing
478	 * one; that is not prohibited. In such a case fall back to copying.
479	 *
480	 * LATER: this step could be merged into the real generation of
481	 * fragments; we could switch to copying when we see the first bad one.
482	 */
483	if (skb_has_frag_list(skb)) {
484		struct sk_buff *frag, *frag2;
485		int first_len = skb_pagelen(skb);
486
487		if (first_len - hlen > mtu ||
488		    ((first_len - hlen) & 7) ||
489		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
490		    skb_cloned(skb))
491			goto slow_path;
492
493		skb_walk_frags(skb, frag) {
494			/* Correct geometry. */
495			if (frag->len > mtu ||
496			    ((frag->len & 7) && frag->next) ||
497			    skb_headroom(frag) < hlen)
498				goto slow_path_clean;
499
500			/* Partially cloned skb? */
501			if (skb_shared(frag))
502				goto slow_path_clean;
503
504			BUG_ON(frag->sk);
505			if (skb->sk) {
506				frag->sk = skb->sk;
507				frag->destructor = sock_wfree;
508			}
509			skb->truesize -= frag->truesize;
510		}
511
512		/* Everything is OK. Generate! */
513
514		err = 0;
515		offset = 0;
516		frag = skb_shinfo(skb)->frag_list;
517		skb_frag_list_init(skb);
518		skb->data_len = first_len - skb_headlen(skb);
519		skb->len = first_len;
520		iph->tot_len = htons(first_len);
521		iph->frag_off = htons(IP_MF);
522		ip_send_check(iph);
523
524		for (;;) {
525			/* Prepare the header of the next frame
526			 * before the previous one goes down. */
527			if (frag) {
528				frag->ip_summed = CHECKSUM_NONE;
529				skb_reset_transport_header(frag);
530				__skb_push(frag, hlen);
531				skb_reset_network_header(frag);
532				memcpy(skb_network_header(frag), iph, hlen);
533				iph = ip_hdr(frag);
534				iph->tot_len = htons(frag->len);
535				ip_copy_metadata(frag, skb);
536				if (offset == 0)
537					ip_options_fragment(frag);
538				offset += skb->len - hlen;
539				iph->frag_off = htons(offset>>3);
540				if (frag->next != NULL)
541					iph->frag_off |= htons(IP_MF);
542				/* Ready, complete checksum */
543				ip_send_check(iph);
544			}
545
546			err = output(skb);
547
548			if (!err)
549				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
550			if (err || !frag)
551				break;
552
553			skb = frag;
554			frag = skb->next;
555			skb->next = NULL;
556		}
557
558		if (err == 0) {
559			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
560			return 0;
561		}
562
563		while (frag) {
564			skb = frag->next;
565			kfree_skb(frag);
566			frag = skb;
567		}
568		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
569		return err;
570
571slow_path_clean:
572		skb_walk_frags(skb, frag2) {
573			if (frag2 == frag)
574				break;
575			frag2->sk = NULL;
576			frag2->destructor = NULL;
577			skb->truesize += frag2->truesize;
578		}
579	}
580
581slow_path:
582	left = skb->len - hlen;		/* Space per frame */
583	ptr = hlen;		/* Where to start from */
584
585	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
586	 * we need to make room for the encapsulating header.
587	 */
588	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
589
590	/*
591	 *	Fragment the datagram.
592	 */
593
594	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
595	not_last_frag = iph->frag_off & htons(IP_MF);
596
597	/*
598	 *	Keep copying data until we run out.
599	 */
600
601	while (left > 0) {
602		len = left;
603		/* IF: it doesn't fit, use 'mtu' - the data space left */
604		if (len > mtu)
605			len = mtu;
606		/* IF: we are not sending up to and including the packet end
607		   then align the next start on an eight-byte boundary */
608		if (len < left)	{
609			len &= ~7;
610		}
611		/*
612		 *	Allocate buffer.
613		 */
614
615		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
616			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
617			err = -ENOMEM;
618			goto fail;
619		}
620
621		/*
622		 *	Set up data on packet
623		 */
624
625		ip_copy_metadata(skb2, skb);
626		skb_reserve(skb2, ll_rs);
627		skb_put(skb2, len + hlen);
628		skb_reset_network_header(skb2);
629		skb2->transport_header = skb2->network_header + hlen;
630
631		/*
632		 *	Charge the memory for the fragment to any owner
633		 *	it might possess
634		 */
635
636		if (skb->sk)
637			skb_set_owner_w(skb2, skb->sk);
638
639		/*
640		 *	Copy the packet header into the new buffer.
641		 */
642
643		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
644
645		/*
646		 *	Copy a block of the IP datagram.
647		 */
648		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
649			BUG();
650		left -= len;
651
652		/*
653		 *	Fill in the new header fields.
654		 */
655		iph = ip_hdr(skb2);
656		iph->frag_off = htons((offset >> 3));
657
658		/* ANK: a dirty but effective trick. Upgrade the options only if
659		 * the segment to be fragmented was THE FIRST (otherwise the
660		 * options are already fixed), and do it ONCE on the initial skb,
661		 * so that all the following fragments will inherit the fixed
662		 * options.
663		 */
664		if (offset == 0)
665			ip_options_fragment(skb);
666
667		/*
668		 *	Added AC : If we are fragmenting a fragment that's not the
669		 *		   last fragment then keep the MF bit set on each fragment
670		 */
671		if (left > 0 || not_last_frag)
672			iph->frag_off |= htons(IP_MF);
673		ptr += len;
674		offset += len;
675
676		/*
677		 *	Put this fragment into the sending queue.
678		 */
679		iph->tot_len = htons(len + hlen);
680
681		ip_send_check(iph);
682
683		err = output(skb2);
684		if (err)
685			goto fail;
686
687		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
688	}
689	kfree_skb(skb);
690	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
691	return err;
692
693fail:
694	kfree_skb(skb);
695	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
696	return err;
697}
698EXPORT_SYMBOL(ip_fragment);
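
/*
 * Note: on the local output path this is invoked as
 * ip_fragment(skb, ip_finish_output2), as seen in ip_finish_output()
 * above. The output callback runs once per generated fragment and is
 * expected to consume the skb it is handed.
 */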
699
700int
701ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
702{
703	struct iovec *iov = from;
704
705	if (skb->ip_summed == CHECKSUM_PARTIAL) {
706		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
707			return -EFAULT;
708	} else {
709		__wsum csum = 0;
710		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
711			return -EFAULT;
712		skb->csum = csum_block_add(skb->csum, csum, odd);
713	}
714	return 0;
715}
716EXPORT_SYMBOL(ip_generic_getfrag);
717
718static inline __wsum
719csum_page(struct page *page, int offset, int copy)
720{
721	char *kaddr;
722	__wsum csum;
723	kaddr = kmap(page);
724	csum = csum_partial(kaddr + offset, copy, 0);
725	kunmap(page);
726	return csum;
727}
728
729static inline int ip_ufo_append_data(struct sock *sk,
730			struct sk_buff_head *queue,
731			int getfrag(void *from, char *to, int offset, int len,
732			       int odd, struct sk_buff *skb),
733			void *from, int length, int hh_len, int fragheaderlen,
734			int transhdrlen, int mtu, unsigned int flags)
735{
736	struct sk_buff *skb;
737	int err;
738
739	/* The network device supports UDP fragmentation offload (UFO),
740	 * so create a single skb containing the complete UDP
741	 * datagram.
742	 */
743	if ((skb = skb_peek_tail(queue)) == NULL) {
744		skb = sock_alloc_send_skb(sk,
745			hh_len + fragheaderlen + transhdrlen + 20,
746			(flags & MSG_DONTWAIT), &err);
747
748		if (skb == NULL)
749			return err;
750
751		/* reserve space for Hardware header */
752		skb_reserve(skb, hh_len);
753
754		/* create space for UDP/IP header */
755		skb_put(skb, fragheaderlen + transhdrlen);
756
757		/* initialize network header pointer */
758		skb_reset_network_header(skb);
759
760		/* initialize protocol header pointer */
761		skb->transport_header = skb->network_header + fragheaderlen;
762
763		skb->ip_summed = CHECKSUM_PARTIAL;
764		skb->csum = 0;
765
766		/* specify the length of each IP datagram fragment */
767		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
768		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
769		__skb_queue_tail(queue, skb);
770	}
771
772	return skb_append_datato_frags(sk, skb, getfrag, from,
773				       (length - transhdrlen));
774}
775
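/*
 * Workhorse behind ip_append_data() and ip_make_skb(). Data supplied
 * through getfrag() is appended to the tail skb of 'queue' while it fits;
 * when it does not, a new skb sized to one IP fragment is allocated, the
 * 8-byte-alignment overhang (fraggap) is moved over from the previous
 * skb, and the new skb is queued. Devices with NETIF_F_SG get the data in
 * page fragments, others get it copied into the linear area.
 */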
776static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
777			    struct inet_cork *cork,
778			    int getfrag(void *from, char *to, int offset,
779					int len, int odd, struct sk_buff *skb),
780			    void *from, int length, int transhdrlen,
781			    unsigned int flags)
782{
783	struct inet_sock *inet = inet_sk(sk);
784	struct sk_buff *skb;
785
786	struct ip_options *opt = cork->opt;
787	int hh_len;
788	int exthdrlen;
789	int mtu;
790	int copy;
791	int err;
792	int offset = 0;
793	unsigned int maxfraglen, fragheaderlen;
794	int csummode = CHECKSUM_NONE;
795	struct rtable *rt = (struct rtable *)cork->dst;
796
797	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
798	length += exthdrlen;
799	transhdrlen += exthdrlen;
800	mtu = cork->fragsize;
801
802	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
803
804	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
805	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
806
807	if (cork->length + length > 0xFFFF - fragheaderlen) {
808		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
809			       mtu-exthdrlen);
810		return -EMSGSIZE;
811	}
812
813	/*
814	 * transhdrlen > 0 means that this is the first fragment and we wish
815	 * it not to be fragmented later on.
816	 */
817	if (transhdrlen &&
818	    length + fragheaderlen <= mtu &&
819	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
820	    !exthdrlen)
821		csummode = CHECKSUM_PARTIAL;
822
823	skb = skb_peek_tail(queue);
824
825	cork->length += length;
826	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
827	    (sk->sk_protocol == IPPROTO_UDP) &&
828	    (rt->dst.dev->features & NETIF_F_UFO)) {
829		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
830					 hh_len, fragheaderlen, transhdrlen,
831					 mtu, flags);
832		if (err)
833			goto error;
834		return 0;
835	}
836
837	/* So, what's going on in the loop below?
838	 *
839	 * We use the calculated fragment length to generate a chain of skbs;
840	 * each segment is an IP fragment, ready to be sent to the network once
841	 * the appropriate IP header has been added.
842	 */
843
844	if (!skb)
845		goto alloc_new_skb;
846
847	while (length > 0) {
848		/* Check if the remaining data fits into current packet. */
849		copy = mtu - skb->len;
850		if (copy < length)
851			copy = maxfraglen - skb->len;
852		if (copy <= 0) {
853			char *data;
854			unsigned int datalen;
855			unsigned int fraglen;
856			unsigned int fraggap;
857			unsigned int alloclen;
858			struct sk_buff *skb_prev;
859alloc_new_skb:
860			skb_prev = skb;
861			if (skb_prev)
862				fraggap = skb_prev->len - maxfraglen;
863			else
864				fraggap = 0;
865
866			/*
867			 * If remaining data exceeds the mtu,
868			 * we know we need more fragment(s).
869			 */
870			datalen = length + fraggap;
871			if (datalen > mtu - fragheaderlen)
872				datalen = maxfraglen - fragheaderlen;
873			fraglen = datalen + fragheaderlen;
874
875			if ((flags & MSG_MORE) &&
876			    !(rt->dst.dev->features&NETIF_F_SG))
877				alloclen = mtu;
878			else
879				alloclen = fraglen;
880
881			/* The last fragment gets additional space at tail.
882			 * Note that with MSG_MORE we overallocate on fragments,
883			 * because we have no idea which fragment will be
884			 * the last.
885			 */
886			if (datalen == length + fraggap) {
887				alloclen += rt->dst.trailer_len;
888				/* make sure the mtu is not exceeded */
889				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
890					datalen -= ALIGN(rt->dst.trailer_len, 8);
891			}
892			if (transhdrlen) {
893				skb = sock_alloc_send_skb(sk,
894						alloclen + hh_len + 15,
895						(flags & MSG_DONTWAIT), &err);
896			} else {
897				skb = NULL;
898				if (atomic_read(&sk->sk_wmem_alloc) <=
899				    2 * sk->sk_sndbuf)
900					skb = sock_wmalloc(sk,
901							   alloclen + hh_len + 15, 1,
902							   sk->sk_allocation);
903				if (unlikely(skb == NULL))
904					err = -ENOBUFS;
905				else
906					/* only the initial fragment is
907					   time stamped */
908					cork->tx_flags = 0;
909			}
910			if (skb == NULL)
911				goto error;
912
913			/*
914			 *	Fill in the control structures
915			 */
916			skb->ip_summed = csummode;
917			skb->csum = 0;
918			skb_reserve(skb, hh_len);
919			skb_shinfo(skb)->tx_flags = cork->tx_flags;
920
921			/*
922			 *	Find where to start putting bytes.
923			 */
924			data = skb_put(skb, fraglen);
925			skb_set_network_header(skb, exthdrlen);
926			skb->transport_header = (skb->network_header +
927						 fragheaderlen);
928			data += fragheaderlen;
929
930			if (fraggap) {
931				skb->csum = skb_copy_and_csum_bits(
932					skb_prev, maxfraglen,
933					data + transhdrlen, fraggap, 0);
934				skb_prev->csum = csum_sub(skb_prev->csum,
935							  skb->csum);
936				data += fraggap;
937				pskb_trim_unique(skb_prev, maxfraglen);
938			}
939
940			copy = datalen - transhdrlen - fraggap;
941			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
942				err = -EFAULT;
943				kfree_skb(skb);
944				goto error;
945			}
946
947			offset += copy;
948			length -= datalen - fraggap;
949			transhdrlen = 0;
950			exthdrlen = 0;
951			csummode = CHECKSUM_NONE;
952
953			/*
954			 * Put the packet on the pending queue.
955			 */
956			__skb_queue_tail(queue, skb);
957			continue;
958		}
959
960		if (copy > length)
961			copy = length;
962
963		if (!(rt->dst.dev->features&NETIF_F_SG)) {
964			unsigned int off;
965
966			off = skb->len;
967			if (getfrag(from, skb_put(skb, copy),
968					offset, copy, off, skb) < 0) {
969				__skb_trim(skb, off);
970				err = -EFAULT;
971				goto error;
972			}
973		} else {
974			int i = skb_shinfo(skb)->nr_frags;
975			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
976			struct page *page = cork->page;
977			int off = cork->off;
978			unsigned int left;
979
980			if (page && (left = PAGE_SIZE - off) > 0) {
981				if (copy >= left)
982					copy = left;
983				if (page != frag->page) {
984					if (i == MAX_SKB_FRAGS) {
985						err = -EMSGSIZE;
986						goto error;
987					}
988					get_page(page);
989					skb_fill_page_desc(skb, i, page, off, 0);
990					frag = &skb_shinfo(skb)->frags[i];
991				}
992			} else if (i < MAX_SKB_FRAGS) {
993				if (copy > PAGE_SIZE)
994					copy = PAGE_SIZE;
995				page = alloc_pages(sk->sk_allocation, 0);
996				if (page == NULL)  {
997					err = -ENOMEM;
998					goto error;
999				}
1000				cork->page = page;
1001				cork->off = 0;
1002
1003				skb_fill_page_desc(skb, i, page, 0, 0);
1004				frag = &skb_shinfo(skb)->frags[i];
1005			} else {
1006				err = -EMSGSIZE;
1007				goto error;
1008			}
1009			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1010				err = -EFAULT;
1011				goto error;
1012			}
1013			cork->off += copy;
1014			frag->size += copy;
1015			skb->len += copy;
1016			skb->data_len += copy;
1017			skb->truesize += copy;
1018			atomic_add(copy, &sk->sk_wmem_alloc);
1019		}
1020		offset += copy;
1021		length -= copy;
1022	}
1023
1024	return 0;
1025
1026error:
1027	cork->length -= length;
1028	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1029	return err;
1030}
1031
1032static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033			 struct ipcm_cookie *ipc, struct rtable **rtp)
1034{
1035	struct inet_sock *inet = inet_sk(sk);
1036	struct ip_options *opt;
1037	struct rtable *rt;
1038
1039	/*
1040	 * setup for corking.
1041	 */
1042	opt = ipc->opt;
1043	if (opt) {
1044		if (cork->opt == NULL) {
1045			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046					    sk->sk_allocation);
1047			if (unlikely(cork->opt == NULL))
1048				return -ENOBUFS;
1049		}
1050		memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051		cork->flags |= IPCORK_OPT;
1052		cork->addr = ipc->addr;
1053	}
1054	rt = *rtp;
1055	if (unlikely(!rt))
1056		return -EFAULT;
1057	/*
1058	 * We steal the reference to this route; the caller must not release it.
1059	 */
1060	*rtp = NULL;
1061	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063	cork->dst = &rt->dst;
1064	cork->length = 0;
1065	cork->tx_flags = ipc->tx_flags;
1066	cork->page = NULL;
1067	cork->off = 0;
1068
1069	return 0;
1070}
1071
1072/*
1073 *	ip_append_data() and ip_append_page() can make one large IP datagram
1074 *	from many pieces of data. Each piece will be held on the socket
1075 *	until ip_push_pending_frames() is called. Each piece can be a page
1076 *	or non-page data.
1077 *
1078 *	Besides UDP, other transport protocols (e.g. raw sockets) can
1079 *	potentially use this interface.
1080 *
1081 *	LATER: the length must be adjusted by the tail padding, when required.
1082 */
1083int ip_append_data(struct sock *sk,
1084		   int getfrag(void *from, char *to, int offset, int len,
1085			       int odd, struct sk_buff *skb),
1086		   void *from, int length, int transhdrlen,
1087		   struct ipcm_cookie *ipc, struct rtable **rtp,
1088		   unsigned int flags)
1089{
1090	struct inet_sock *inet = inet_sk(sk);
1091	int err;
1092
1093	if (flags&MSG_PROBE)
1094		return 0;
1095
1096	if (skb_queue_empty(&sk->sk_write_queue)) {
1097		err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098		if (err)
1099			return err;
1100	} else {
1101		transhdrlen = 0;
1102	}
1103
1104	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105				from, length, transhdrlen, flags);
1106}
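
/*
 * Usage sketch (illustrative; msg, len, transhdrlen, ipc and rt stand for
 * the caller's message, payload length, transport header length, cookie
 * and route): a datagram-style sender queues its payload first and then
 * pushes the pending fragments out as one IP datagram, e.g. with the
 * iovec helper ip_generic_getfrag():
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     transhdrlen, &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *
 * MSG_MORE typically keeps the cork in place so that further
 * ip_append_data() calls can extend the same datagram.
 */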
1107
1108ssize_t	ip_append_page(struct sock *sk, struct page *page,
1109		       int offset, size_t size, int flags)
1110{
1111	struct inet_sock *inet = inet_sk(sk);
1112	struct sk_buff *skb;
1113	struct rtable *rt;
1114	struct ip_options *opt = NULL;
1115	int hh_len;
1116	int mtu;
1117	int len;
1118	int err;
1119	unsigned int maxfraglen, fragheaderlen, fraggap;
1120
1121	if (inet->hdrincl)
1122		return -EPERM;
1123
1124	if (flags&MSG_PROBE)
1125		return 0;
1126
1127	if (skb_queue_empty(&sk->sk_write_queue))
1128		return -EINVAL;
1129
1130	rt = (struct rtable *)inet->cork.dst;
1131	if (inet->cork.flags & IPCORK_OPT)
1132		opt = inet->cork.opt;
1133
1134	if (!(rt->dst.dev->features&NETIF_F_SG))
1135		return -EOPNOTSUPP;
1136
1137	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1138	mtu = inet->cork.fragsize;
1139
1140	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1141	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1142
1143	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1144		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1145		return -EMSGSIZE;
1146	}
1147
1148	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1149		return -EINVAL;
1150
1151	inet->cork.length += size;
1152	if ((size + skb->len > mtu) &&
1153	    (sk->sk_protocol == IPPROTO_UDP) &&
1154	    (rt->dst.dev->features & NETIF_F_UFO)) {
1155		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1156		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157	}
1158
1159
1160	while (size > 0) {
1161		int i;
1162
1163		if (skb_is_gso(skb))
1164			len = size;
1165		else {
1166
1167			/* Check if the remaining data fits into current packet. */
1168			len = mtu - skb->len;
1169			if (len < size)
1170				len = maxfraglen - skb->len;
1171		}
1172		if (len <= 0) {
1173			struct sk_buff *skb_prev;
1174			int alloclen;
1175
1176			skb_prev = skb;
1177			fraggap = skb_prev->len - maxfraglen;
1178
1179			alloclen = fragheaderlen + hh_len + fraggap + 15;
1180			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1181			if (unlikely(!skb)) {
1182				err = -ENOBUFS;
1183				goto error;
1184			}
1185
1186			/*
1187			 *	Fill in the control structures
1188			 */
1189			skb->ip_summed = CHECKSUM_NONE;
1190			skb->csum = 0;
1191			skb_reserve(skb, hh_len);
1192
1193			/*
1194			 *	Find where to start putting bytes.
1195			 */
1196			skb_put(skb, fragheaderlen + fraggap);
1197			skb_reset_network_header(skb);
1198			skb->transport_header = (skb->network_header +
1199						 fragheaderlen);
1200			if (fraggap) {
1201				skb->csum = skb_copy_and_csum_bits(skb_prev,
1202								   maxfraglen,
1203						    skb_transport_header(skb),
1204								   fraggap, 0);
1205				skb_prev->csum = csum_sub(skb_prev->csum,
1206							  skb->csum);
1207				pskb_trim_unique(skb_prev, maxfraglen);
1208			}
1209
1210			/*
1211			 * Put the packet on the pending queue.
1212			 */
1213			__skb_queue_tail(&sk->sk_write_queue, skb);
1214			continue;
1215		}
1216
1217		i = skb_shinfo(skb)->nr_frags;
1218		if (len > size)
1219			len = size;
1220		if (skb_can_coalesce(skb, i, page, offset)) {
1221			skb_shinfo(skb)->frags[i-1].size += len;
1222		} else if (i < MAX_SKB_FRAGS) {
1223			get_page(page);
1224			skb_fill_page_desc(skb, i, page, offset, len);
1225		} else {
1226			err = -EMSGSIZE;
1227			goto error;
1228		}
1229
1230		if (skb->ip_summed == CHECKSUM_NONE) {
1231			__wsum csum;
1232			csum = csum_page(page, offset, len);
1233			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1234		}
1235
1236		skb->len += len;
1237		skb->data_len += len;
1238		skb->truesize += len;
1239		atomic_add(len, &sk->sk_wmem_alloc);
1240		offset += len;
1241		size -= len;
1242	}
1243	return 0;
1244
1245error:
1246	inet->cork.length -= size;
1247	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1248	return err;
1249}
1250
1251static void ip_cork_release(struct inet_cork *cork)
1252{
1253	cork->flags &= ~IPCORK_OPT;
1254	kfree(cork->opt);
1255	cork->opt = NULL;
1256	dst_release(cork->dst);
1257	cork->dst = NULL;
1258}
1259
1260/*
1261 *	Combine all pending IP fragments on the socket into one IP datagram
1262 *	and push it out.
1263 */
1264struct sk_buff *__ip_make_skb(struct sock *sk,
1265			      struct sk_buff_head *queue,
1266			      struct inet_cork *cork)
1267{
1268	struct sk_buff *skb, *tmp_skb;
1269	struct sk_buff **tail_skb;
1270	struct inet_sock *inet = inet_sk(sk);
1271	struct net *net = sock_net(sk);
1272	struct ip_options *opt = NULL;
1273	struct rtable *rt = (struct rtable *)cork->dst;
1274	struct iphdr *iph;
1275	__be16 df = 0;
1276	__u8 ttl;
1277
1278	if ((skb = __skb_dequeue(queue)) == NULL)
1279		goto out;
1280	tail_skb = &(skb_shinfo(skb)->frag_list);
1281
1282	/* move skb->data to ip header from ext header */
1283	if (skb->data < skb_network_header(skb))
1284		__skb_pull(skb, skb_network_offset(skb));
1285	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1286		__skb_pull(tmp_skb, skb_network_header_len(skb));
1287		*tail_skb = tmp_skb;
1288		tail_skb = &(tmp_skb->next);
1289		skb->len += tmp_skb->len;
1290		skb->data_len += tmp_skb->len;
1291		skb->truesize += tmp_skb->truesize;
1292		tmp_skb->destructor = NULL;
1293		tmp_skb->sk = NULL;
1294	}
1295
1296	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
1297	 * allow the frame generated here to be fragmented. No matter how the
1298	 * transforms change the size of the packet, it will go out.
1299	 */
1300	if (inet->pmtudisc < IP_PMTUDISC_DO)
1301		skb->local_df = 1;
1302
1303	/* DF bit is set when we want to see DF on outgoing frames.
1304	 * If local_df is also set, we still allow this frame to be fragmented
1305	 * locally. */
1306	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1307	    (skb->len <= dst_mtu(&rt->dst) &&
1308	     ip_dont_fragment(sk, &rt->dst)))
1309		df = htons(IP_DF);
1310
1311	if (cork->flags & IPCORK_OPT)
1312		opt = cork->opt;
1313
1314	if (rt->rt_type == RTN_MULTICAST)
1315		ttl = inet->mc_ttl;
1316	else
1317		ttl = ip_select_ttl(inet, &rt->dst);
1318
1319	iph = (struct iphdr *)skb->data;
1320	iph->version = 4;
1321	iph->ihl = 5;
1322	if (opt) {
1323		iph->ihl += opt->optlen>>2;
1324		ip_options_build(skb, opt, cork->addr, rt, 0);
1325	}
1326	iph->tos = inet->tos;
1327	iph->frag_off = df;
1328	ip_select_ident(iph, &rt->dst, sk);
1329	iph->ttl = ttl;
1330	iph->protocol = sk->sk_protocol;
1331	iph->saddr = rt->rt_src;
1332	iph->daddr = rt->rt_dst;
1333
1334	skb->priority = sk->sk_priority;
1335	skb->mark = sk->sk_mark;
1336	/*
1337	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1338	 * on dst refcount
1339	 */
1340	cork->dst = NULL;
1341	skb_dst_set(skb, &rt->dst);
1342
1343	if (iph->protocol == IPPROTO_ICMP)
1344		icmp_out_count(net, ((struct icmphdr *)
1345			skb_transport_header(skb))->type);
1346
1347	ip_cork_release(cork);
1348out:
1349	return skb;
1350}
1351
1352int ip_send_skb(struct sk_buff *skb)
1353{
1354	struct net *net = sock_net(skb->sk);
1355	int err;
1356
1357	err = ip_local_out(skb);
1358	if (err) {
1359		if (err > 0)
1360			err = net_xmit_errno(err);
1361		if (err)
1362			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1363	}
1364
1365	return err;
1366}
1367
1368int ip_push_pending_frames(struct sock *sk)
1369{
1370	struct sk_buff *skb;
1371
1372	skb = ip_finish_skb(sk);
1373	if (!skb)
1374		return 0;
1375
1376	/* Netfilter gets the whole, not yet fragmented skb. */
1377	return ip_send_skb(skb);
1378}
1379
1380/*
1381 *	Throw away all pending data on the socket.
1382 */
1383static void __ip_flush_pending_frames(struct sock *sk,
1384				      struct sk_buff_head *queue,
1385				      struct inet_cork *cork)
1386{
1387	struct sk_buff *skb;
1388
1389	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1390		kfree_skb(skb);
1391
1392	ip_cork_release(cork);
1393}
1394
1395void ip_flush_pending_frames(struct sock *sk)
1396{
1397	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1398}
1399
1400struct sk_buff *ip_make_skb(struct sock *sk,
1401			    int getfrag(void *from, char *to, int offset,
1402					int len, int odd, struct sk_buff *skb),
1403			    void *from, int length, int transhdrlen,
1404			    struct ipcm_cookie *ipc, struct rtable **rtp,
1405			    unsigned int flags)
1406{
1407	struct inet_cork cork = {};
1408	struct sk_buff_head queue;
1409	int err;
1410
1411	if (flags & MSG_PROBE)
1412		return NULL;
1413
1414	__skb_queue_head_init(&queue);
1415
1416	err = ip_setup_cork(sk, &cork, ipc, rtp);
1417	if (err)
1418		return ERR_PTR(err);
1419
1420	err = __ip_append_data(sk, &queue, &cork, getfrag,
1421			       from, length, transhdrlen, flags);
1422	if (err) {
1423		__ip_flush_pending_frames(sk, &queue, &cork);
1424		return ERR_PTR(err);
1425	}
1426
1427	return __ip_make_skb(sk, &queue, &cork);
1428}
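
/*
 * Usage sketch (illustrative; getfrag, from, ipc and rt stand for the
 * caller's copy helper, data source, cookie and route): ip_make_skb()
 * assembles the datagram on a private queue with its own cork, so it does
 * not touch sk->sk_write_queue; the caller fills in the transport header
 * reserved via transhdrlen and then transmits:
 *
 *	skb = ip_make_skb(sk, getfrag, from, length, transhdrlen,
 *			  &ipc, &rt, flags);
 *	if (skb && !IS_ERR(skb))
 *		err = ip_send_skb(skb);
 */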
1429
1430/*
1431 *	Fetch data from kernel space and fill in checksum if needed.
1432 */
1433static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1434			      int len, int odd, struct sk_buff *skb)
1435{
1436	__wsum csum;
1437
1438	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1439	skb->csum = csum_block_add(skb->csum, csum, odd);
1440	return 0;
1441}
1442
1443/*
1444 *	Generic function to send a packet as a reply to another packet.
1445 *	Used to send TCP resets so far. ICMP should use this function too.
1446 *
1447 *	Should run single-threaded per socket because it uses the sock
1448 *	structure to pass arguments.
1449 */
1450void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1451		   unsigned int len)
1452{
1453	struct inet_sock *inet = inet_sk(sk);
1454	struct {
1455		struct ip_options	opt;
1456		char			data[40];
1457	} replyopts;
1458	struct ipcm_cookie ipc;
1459	__be32 daddr;
1460	struct rtable *rt = skb_rtable(skb);
1461
1462	if (ip_options_echo(&replyopts.opt, skb))
1463		return;
1464
1465	daddr = ipc.addr = rt->rt_src;
1466	ipc.opt = NULL;
1467	ipc.tx_flags = 0;
1468
1469	if (replyopts.opt.optlen) {
1470		ipc.opt = &replyopts.opt;
1471
1472		if (ipc.opt->srr)
1473			daddr = replyopts.opt.faddr;
1474	}
1475
1476	{
1477		struct flowi4 fl4;
1478
1479		flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1480				   RT_TOS(ip_hdr(skb)->tos),
1481				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1482				   ip_reply_arg_flowi_flags(arg),
1483				   daddr, rt->rt_spec_dst,
1484				   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1485		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1486		rt = ip_route_output_key(sock_net(sk), &fl4);
1487		if (IS_ERR(rt))
1488			return;
1489	}
1490
1491	/* And let IP do all the hard work.
1492
1493	   This chunk is not reentrant, hence the spinlock.
1494	   Note that it relies on the fact that this function is called
1495	   with BH disabled locally and that sk cannot already be spinlocked.
1496	 */
1497	bh_lock_sock(sk);
1498	inet->tos = ip_hdr(skb)->tos;
1499	sk->sk_priority = skb->priority;
1500	sk->sk_protocol = ip_hdr(skb)->protocol;
1501	sk->sk_bound_dev_if = arg->bound_dev_if;
1502	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1503		       &ipc, &rt, MSG_DONTWAIT);
1504	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1505		if (arg->csumoffset >= 0)
1506			*((__sum16 *)skb_transport_header(skb) +
1507			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1508								arg->csum));
1509		skb->ip_summed = CHECKSUM_NONE;
1510		ip_push_pending_frames(sk);
1511	}
1512
1513	bh_unlock_sock(sk);
1514
1515	ip_rt_put(rt);
1516}
1517
1518void __init ip_init(void)
1519{
1520	ip_rt_init();
1521	inet_initpeers();
1522
1523#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1524	igmp_mc_proc_init();
1525#endif
1526}
1527