ip6_output.c revision e550dfb0c2c31b6363aa463a035fc9f8dcaa3c9b
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28
29#include <linux/errno.h>
30#include <linux/kernel.h>
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
39#include <linux/module.h>
40
41#include <linux/netfilter.h>
42#include <linux/netfilter_ipv6.h>
43
44#include <net/sock.h>
45#include <net/snmp.h>
46
47#include <net/ipv6.h>
48#include <net/ndisc.h>
49#include <net/protocol.h>
50#include <net/ip6_route.h>
51#include <net/addrconf.h>
52#include <net/rawv6.h>
53#include <net/icmp.h>
54#include <net/xfrm.h>
55#include <net/checksum.h>
56#include <linux/mroute6.h>
57
58static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59
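/* Fragment identification values are drawn from a single global
 * counter, serialized by a spinlock. Zero is skipped on wrap so that
 * the slow path of ip6_fragment() can use frag_id == 0 to mean
 * "no id assigned yet".
 */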
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

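/* Fix up payload_len and run the LOCAL_OUT netfilter hook.
 * payload_len counts everything after the fixed 40-byte IPv6 header;
 * if the packet is larger than IPV6_MAXPLEN it is set to 0, the
 * encoding used by jumbograms (RFC 2675). nf_hook() returns 1 when
 * the hook verdict lets the packet continue, which is why
 * ip6_local_out() below only calls dst_output() in that case.
 */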
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}

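/* ip6_output2() implements multicast loopback: when the destination
 * is a multicast group that the local machine listens to (or that a
 * multicast routing socket may want to see), a clone is looped back
 * through netif_rx() via ip6_dev_loopback_xmit() before the original
 * is passed to the POST_ROUTING hook and on to the device.
 */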
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

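/* With IPV6_PMTUDISC_PROBE the socket asks us to ignore the cached
 * path MTU, so the raw device MTU is used instead of dst_mtu().
 */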
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

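/* The first 32-bit word of the IPv6 header, written above in
 * ip6_xmit() (and again in ip6_push_pending_frames()), packs version
 * (4 bits), traffic class (8 bits) and flow label (20 bits):
 *
 *	htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel
 *
 * 0x60000000 sets version 6; fl6_flowlabel is already in network byte
 * order, so it is OR-ed in outside the htonl().
 */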
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It is code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

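/* Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (ip6_ra_chain is
 * populated via the IPV6_ROUTER_ALERT socket option). Every matching
 * socket but the last gets a clone; the last one consumes the
 * original. Returns 1 if the packet was delivered and consumed.
 */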
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbor discovery message destined
			 * to the proxied address is passed to the input
			 * function for local processing.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

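/* Forwarding path, in order: check that forwarding is enabled, warn
 * on LRO-merged skbs, consult the xfrm forward policy, divert Router
 * Alert packets to interested raw sockets, enforce the hop limit,
 * honour proxy NDP, emit redirects where permitted, check the path
 * MTU, and finally decrement hop_limit (only after skb_cow()) and run
 * the NF_INET_FORWARD hook.
 */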
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets;
	 *	we push them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop count is delayed to this point, after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

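/* Find the end of the unfragmentable part, i.e. the offset at which a
 * Fragment header must be inserted, leaving *nexthdr pointing at the
 * nexthdr byte to be overwritten with NEXTHDR_FRAGMENT. Per RFC 2460
 * the unfragmentable part covers Hop-by-Hop and Routing headers plus
 * any Destination Options header that precedes the Routing header;
 * with Mobile IPv6, a Destination Options header carrying a Home
 * Address option is kept unfragmentable as well.
 */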
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

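/* Two strategies: the fast path re-uses an existing frag_list (each
 * member already has fragment geometry and only needs a Fragment
 * header pushed on), the slow path allocates fresh skbs and copies.
 * In both cases the per-fragment payload budget is
 *
 *	mtu - hlen - sizeof(struct frag_hdr)
 *
 * rounded down to a multiple of 8 for every fragment but the last.
 * For example, with a 1500-byte MTU and a bare 40-byte IPv6 header
 * the budget is 1452, rounded down to 1448, so each full fragment is
 * 1496 bytes on the wire.
 */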
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

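/* Returns true when the cached route can NOT be shown to match the
 * flow: it is neither a host route for exactly this address nor does
 * the saved last-used address (daddr_cache/saddr_cache) match.
 */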
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

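/* Common tail for the lookup helpers below: route the flow if no dst
 * was supplied, then perform source address selection if the flow
 * still carries the unspecified address. With optimistic DAD, an
 * INCOMPLETE neighbour combined with an optimistic source address
 * forces a second lookup through the default router (see the block
 * inside).
 */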
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it with the dst entry of the nexthop router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

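/* ip6_append_data() implements corking: the first call on an empty
 * write queue latches options, route, hop limit and MTU into the
 * cork state; subsequent calls only append data. The largest
 * 8-byte-aligned fragment payload is
 *
 *	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
 *		     - sizeof(struct frag_hdr)
 *
 * e.g. 1488 for an mtu of 1500 and a fragheaderlen of 40, so that
 * after ip6_fragment() inserts the 8-byte Fragment header every
 * fragment still fits the MTU.
 */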
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

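/* Collapse the queued skbs into one packet (the tail skbs become the
 * frag_list of the first), build the IPv6 header from the cork state
 * and hand the result to ip6_local_out(). The cork state is released
 * on both the success and the error path.
 */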
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}