ip_output.c revision 64ce207306debd7157f47282be94770407bec01c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case a packet is not accepted by
 *					the output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
46
47#include <asm/uaccess.h>
48#include <asm/system.h>
49#include <linux/module.h>
50#include <linux/types.h>
51#include <linux/kernel.h>
52#include <linux/sched.h>
53#include <linux/mm.h>
54#include <linux/string.h>
55#include <linux/errno.h>
56#include <linux/config.h>
57
58#include <linux/socket.h>
59#include <linux/sockios.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/etherdevice.h>
64#include <linux/proc_fs.h>
65#include <linux/stat.h>
66#include <linux/init.h>
67
68#include <net/snmp.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <net/arp.h>
75#include <net/icmp.h>
76#include <net/checksum.h>
77#include <net/inetpeer.h>
78#include <net/checksum.h>
79#include <linux/igmp.h>
80#include <linux/netfilter_ipv4.h>
81#include <linux/netfilter_bridge.h>
82#include <linux/mroute.h>
83#include <linux/netlink.h>
84#include <linux/tcp.h>
85
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}

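/* Use the socket's unicast TTL if it has been set; otherwise fall back to
 * the hop limit metric of the route. */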
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an IP header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

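/* Final step of the output path: make sure the skb has enough headroom for
 * the link-layer header, then emit it either through the cached hardware
 * header (hh) or via the neighbour's output function. */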
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

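/* Attach the output device and protocol, then run the POST_ROUTING
 * netfilter hook before handing the packet to ip_finish_output2(). */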
static int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

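/* Output path for packets routed to a multicast or broadcast destination:
 * loop a copy back to local listeners where appropriate, honour a TTL of
 * zero for multicasts, and fragment if the frame exceeds the path MTU. */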
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that were returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	if (skb->len > dst_mtu(&rt->u.dst))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

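/* Standard output path for unicast packets: fragment if the frame is larger
 * than the path MTU and will not be segmented by TSO, then finish output. */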
int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

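/* Queue a fully built transport-layer packet (e.g. from TCP) for
 * transmission: route it if it has not been routed already, build the IP
 * header in the headroom reserved by the caller, and hand the result to
 * the LOCAL_OUT netfilter hook. */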
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *) skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if(opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, the retransmit mechanism of the
			 * transport layer will keep trying until the route
			 * appears or the connection times out.
			 */
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}


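/* Copy per-packet metadata (priority, dst, netfilter and traffic-control
 * state) from the original skb to a freshly created fragment. */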
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is the same as for the pre-fragmentation packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each one consisting of an IP header plus a block
 *	of the original data) so that each piece fits in a single device
 *	frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */

	/* When a frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In that case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, iph, hlen);
				iph = frag->nh.iph;
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
	mtu -= nf_bridge_pad(skb);
#else
	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight-byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

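/* Generic getfrag callback for ip_append_data(): copy the next chunk of a
 * user iovec into the skb, computing the checksum on the fly unless the
 * hardware will checksum the packet. */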
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_HW) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		unsigned int csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

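/* Checksum a region of a page that is about to be attached to an skb as
 * paged data. */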
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	unsigned int csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece is held on the socket's write
 *	queue until ip_push_pending_frames() is called.  Each piece can be a
 *	page or non-page data.
 *
 *	Transport protocols other than UDP - e.g. raw sockets - can
 *	potentially use this interface as well.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
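/*
 *	Illustrative sketch (not code from this file): a datagram transport
 *	might use the corking interface roughly as follows, assuming "rt",
 *	"ipc" and the iovec "iov" have already been set up by the caller:
 *
 *		err = ip_append_data(sk, ip_generic_getfrag, iov, len, 0,
 *				     &ipc, rt, msg_flags);
 *		if (err)
 *			ip_flush_pending_frames(sk);
 *		else if (!(msg_flags & MSG_MORE))
 *			err = ip_push_pending_frames(sk);
 */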
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
	    !exthdrlen)
		csummode = CHECKSUM_HW;

	inet->cork.length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each of them is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at the tail.
			 * Note that with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

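/*
 *	Zero-copy variant of ip_append_data(): append a reference to an
 *	existing page (e.g. one handed in by sendfile() on a UDP socket) to
 *	the pending queue.  Requires scatter-gather support on the output
 *	device and a preceding ip_append_data() call to set up the cork state.
 */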
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;

	while (size > 0) {
		int i;

		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;
		if (len <= 0) {
			struct sk_buff *skb_prev;
			char *data;
			struct iphdr *iph;
			int alloclen;

			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fragheaderlen + fraggap);
			skb->nh.iph = iph = (struct iphdr *)data;
			data += fragheaderlen;
			skb->h.raw = data;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				skb_trim(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			unsigned int csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	int df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented.  No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc != IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	if (!df) {
		__ip_select_ident(iph, &rt->u.dst, 0);
	} else {
		iph->id = htons(inet->id++);
	}
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	unsigned int csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far.  ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
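/*
 *	The caller fills in an ip_reply_arg with the data to send (arg->iov),
 *	a precomputed partial checksum (arg->csum) and, when the transport
 *	wants its checksum completed, the offset of the checksum field in the
 *	transport header (arg->csumoffset, in 16-bit words; negative to skip).
 */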
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(skb->nh.iph->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = skb->h.th->dest,
					         .dport = skb->h.th->source } },
				    .proto = sk->sk_protocol };
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.
	 *
	 * This chunk is not reentrant, hence the spinlock.  Note that it
	 * relies on the fact that this function is called with BHs locally
	 * disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = skb->nh.iph->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = skb->nh.iph->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);