ip_output.c revision 6cbb0df788b90777a7ed0f9d8261260353f48076
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
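
/*
 * Background note: the IP header checksum is the 16-bit one's complement of
 * the one's complement sum of all 16-bit words in the header (RFC 791), with
 * the checksum field itself taken as zero.  That is why the check field is
 * zeroed above before ip_fast_csum() is run over the iph->ihl 32-bit words
 * of the header.
 */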

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an IP header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
	else
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that were returned after forwarding; ip_mr_input will
		   drop them in any case.
		   Note that local frames are looped back so that they are
		   delivered to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	if (skb->len > dst_mtu(&rt->u.dst))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *) skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if(opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, the retransmission mechanism of the
			 * transport layer will keep trying until the route
			 * appears or the connection times out.
			 */
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer has set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to the IP header plus a block of the
 *	data of the original IP data part) that will still fit in a single device
 *	frame, and queue each such frame for sending.
 */

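/*
 * A rough worked example of the arithmetic below (the numbers are purely
 * illustrative): with a 1500-byte path MTU and a 20-byte header, each
 * fragment can carry mtu = 1480 bytes of payload.  A 4000-byte payload is
 * then emitted as fragments carrying 1480, 1480 and 1040 data bytes, whose
 * frag_off fields encode the 8-byte-block offsets 0, 185 and 370, with
 * IP_MF set on every fragment except the last.
 */
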
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */

	/* When a frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In such a case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged with the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
			    goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one has gone down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, iph, hlen);
				iph = frag->nh.iph;
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
	mtu -= nf_bridge_pad(skb);
#else
	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight-byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each
		 *		   fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_HW) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		unsigned int csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	unsigned int csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, but other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
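/*
 * A rough usage sketch of this corking interface, assuming a datagram-style
 * caller (the transport header length "thlen" and the msg/ipc/rt variables
 * here are hypothetical, not taken from this file):
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     thlen, &ipc, rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * A real caller such as UDP also fills in its transport header before the
 * pending frames are pushed.
 */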
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we hope
	 * it will not be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
	    !exthdrlen)
		csummode = CHECKSUM_HW;

	inet->cork.length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;

	while (size > 0) {
		int i;

		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;
		if (len <= 0) {
			struct sk_buff *skb_prev;
			char *data;
			struct iphdr *iph;
			int alloclen;

			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fragheaderlen + fraggap);
			skb->nh.iph = iph = (struct iphdr *)data;
			data += fragheaderlen;
			skb->h.raw = data;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				skb_trim(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			unsigned int csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	int df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc != IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	if (!df) {
		__ip_select_ident(iph, &rt->u.dst, 0);
	} else {
		iph->id = htons(inet->id++);
	}
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	unsigned int csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	So far used to send TCP resets; ICMP should use this function too.
 *
 *	Should run single-threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(skb->nh.iph->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = skb->h.th->dest,
					         .dport = skb->h.th->source } },
				    .proto = sk->sk_protocol };
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = skb->nh.iph->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = skb->nh.iph->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);