/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28
29#include <linux/errno.h>
30#include <linux/kernel.h>
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
39#include <linux/module.h>
40#include <linux/slab.h>
41
42#include <linux/netfilter.h>
43#include <linux/netfilter_ipv6.h>
44
45#include <net/sock.h>
46#include <net/snmp.h>
47
48#include <net/ipv6.h>
49#include <net/ndisc.h>
50#include <net/protocol.h>
51#include <net/ip6_route.h>
52#include <net/addrconf.h>
53#include <net/rawv6.h>
54#include <net/icmp.h>
55#include <net/xfrm.h>
56#include <net/checksum.h>
57#include <linux/mroute6.h>
58
59int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
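/*
 * Note: the payload_len clamp below follows the RFC 2675 jumbogram
 * convention: a payload larger than IPV6_MAXPLEN (65535) is announced
 * with payload_len == 0, the real length being carried in a Hop-by-Hop
 * Jumbo Payload option.  nf_hook() returns 1 when the
 * NF_INET_LOCAL_OUT hooks accept the packet without consuming it,
 * which is why ip6_local_out() below invokes dst_output() itself in
 * that case.
 */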
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

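/*
 * Fragment when the packet exceeds the path MTU and is not a GSO skb
 * (GSO skbs are segmented later, at the device layer), or when the
 * route has the allfrag feature set.  allfrag is recorded when a
 * Packet Too Big message reported an MTU below IPV6_MIN_MTU; every
 * packet to such a destination must then carry a fragment header.
 */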
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

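/*
 * Hypothetical caller sketch (not part of this file): a connected
 * transport protocol fills a struct flowi describing the packet and
 * hands the skb to ip6_xmit(), which prepends any extension headers
 * plus the IPv6 header and passes the result through netfilter to
 * dst_output():
 *
 *	struct flowi fl;
 *
 *	memset(&fl, 0, sizeof(fl));
 *	fl.proto = IPPROTO_TCP;
 *	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 *	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 *	fl.oif = sk->sk_bound_dev_if;
 *	err = ip6_xmit(sk, skb, &fl, np->opt);
 */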
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* version (4 bits), traffic class (8 bits), flow label (20 bits) */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

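/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered on the ip6_ra_chain whose selector matches.  The skb is
 * cloned for all but the last matching socket; returning 1 tells the
 * caller (ip6_forward) that the packet has been consumed.
 */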
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address are passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

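/*
 * Forward-path checks, in order: forwarding enabled, no LRO skb,
 * XFRM forward policy, packet addressed to this host's link layer,
 * Router Alert delivery, hop limit, proxy NDP, XFRM routing, possible
 * redirect, source address sanity, and path MTU.  Only then is the
 * hop limit decremented (after skb_cow) and the packet handed to the
 * NF_INET_FORWARD hook.
 */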
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do no processing on RA packets, pushing them to
	 *	user level AS IS without any guarantee that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to the point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

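/*
 * Return the length of the unfragmentable part of the packet: the
 * IPv6 header plus any extension headers that must be processed by
 * intermediate nodes (Hop-by-Hop, Routing, and Destination Options
 * preceding a Routing header, per RFC 2460).  *nexthdr is left
 * pointing at the nexthdr byte that must be rewritten to
 * NEXTHDR_FRAGMENT.  Worked example: for IPv6 + an 8-byte Hop-by-Hop
 * header + TCP, this returns 48 (40 + 8) and *nexthdr points at the
 * Hop-by-Hop header's nexthdr field.
 */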
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

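/*
 * Two strategies: if the skb already carries a well-formed frag_list
 * (every fragment fits the MTU, is a multiple of 8 bytes except the
 * last, and nothing is shared or cloned), the fast path inserts a
 * fragment header in front of each existing fragment in place.
 * Otherwise the slow path allocates a fresh skb per fragment and
 * copies the payload in MTU-sized, 8-byte-aligned chunks.
 */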
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

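/*
 * Returns nonzero when the cached route no longer matches the flow:
 * a host route (plen == 128) must match the flow address exactly;
 * otherwise the socket's cached peer address (daddr_cache or
 * saddr_cache) is compared instead, which only works on connected
 * sockets.
 */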
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi *fl,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl->fl6_dst, final_dst);
	if (can_sleep) {
		fl->flags |= FLOWI_FLAG_CAN_SLEEP;
		err = __xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
		if (err == -EREMOTE)
			return ip6_dst_blackhole(sock_net(sk), dst);
		if (err)
			return ERR_PTR(err);
	} else {
		err = xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
		if (err)
			return ERR_PTR(err);
	}
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
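
/*
 * Hypothetical usage sketch (not part of this file): since the flow
 * lookup helpers above return a pointer-encoded error, callers test
 * the result with IS_ERR() rather than for NULL:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl, NULL, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */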

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi *fl,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl);

	err = ip6_dst_lookup_tail(sk, &dst, fl);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl->fl6_dst, final_dst);
	if (can_sleep) {
		fl->flags |= FLOWI_FLAG_CAN_SLEEP;
		err = __xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
		if (err == -EREMOTE)
			return ip6_dst_blackhole(sock_net(sk), dst);
		if (err)
			return ERR_PTR(err);
	} else {
		err = xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
		if (err)
			return ERR_PTR(err);
	}
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

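/*
 * Extension headers encode their length in 8-octet units, excluding
 * the first 8 octets, so the byte size duplicated below is
 * (hdrlen + 1) * 8.
 */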
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

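/*
 * ip6_append_data() implements the corking model: the first call on
 * an empty write queue sets up the cork (duplicates the tx options,
 * holds the route, records hop limit, traffic class and MTU);
 * subsequent calls ignore those arguments and keep appending to the
 * queued skbs.  Nothing is transmitted until the caller invokes
 * ip6_push_pending_frames().
 */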
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->dst);
		inet->cork.dst = &rt->dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
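
	/*
	 * Worked example (assuming a 1500-byte MTU and no extension
	 * headers): fragheaderlen = 40, so maxfraglen =
	 * ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. the largest
	 * non-final fragment leaves room for the fragment header and
	 * keeps the fragmentable part a multiple of 8 octets.
	 */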

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

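/*
 * Coalesce the corked write queue into a single skb (queued skbs
 * become the frag_list of the first one), prepend the extension
 * headers and IPv6 header recorded at cork time, and hand the result
 * to ip6_local_out().  The cork is released on both the success and
 * error paths.
 */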
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}