ip6_output.c revision 5110effee8fde2edfacac9cd12a9960ab2dc39ea
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
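
/*
 * Note on __ip6_local_out() above: payload_len is a 16-bit field, so
 * when the payload exceeds IPV6_MAXPLEN (65535, e.g. for oversized GSO
 * skbs) the field is set to 0; per RFC 2675 a zero Payload Length is
 * what a jumbogram carries, with the real length in a Hop-by-Hop
 * option.  A minimal sketch of the same computation (hypothetical
 * helper, not used elsewhere in this file):
 *
 *	static inline __be16 ip6_encode_payload_len(const struct sk_buff *skb)
 *	{
 *		unsigned int len = skb->len - sizeof(struct ipv6hdr);
 *
 *		return htons(len > IPV6_MAXPLEN ? 0 : len);
 *	}
 */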

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = dst_neigh_output(dst, neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
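
/*
 * Note: NF_HOOK_COND() only traverses the POST_ROUTING chain when its
 * condition evaluates true, so skbs flagged IP6SKB_REROUTED (already
 * rerouted by netfilter) bypass POST_ROUTING and go straight to
 * ip6_finish_output().
 */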

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
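
/*
 * Illustrative caller sketch for ip6_xmit() (names such as np and the
 * socket state are assumptions, error handling elided): a connected
 * transport fills in a flow description and hands over a fully built
 * segment.
 *
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = IPPROTO_TCP;
 *	fl6.daddr = np->daddr;
 *	fl6.saddr = np->saddr;
 *	fl6.flowi6_oif = sk->sk_bound_dev_if;
 *
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 */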

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
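
/*
 * Note on ip6_call_ra_chain(): the skb is cloned for every matching
 * Router Alert socket except the last one, which receives the original
 * skb, so the common single-listener case needs no clone at all.
 */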

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA (Router Alert) packets;
	 *	we push them to user level AS IS, with no guarantee that
	 *	the application will be able to interpret them, because
	 *	we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains AH/ESP
	 *	we cannot do anything with it. Defragmentation would also
	 *	be a mistake: RA packets must not be fragmented, because
	 *	there is no guarantee that different fragments will follow
	 *	the same path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement the hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	   We also don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = rt6_get_peer_create(rt);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
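
/*
 * Example for ip6_find_1stfragopt() (illustrative layout): given
 *
 *	IPv6 | Hop-by-Hop | Routing | Dest Opts | TCP
 *
 * the unfragmentable part (RFC 2460) ends with the Routing header, so
 * the function returns the offset of the Dest Opts header and leaves
 * *nexthdr pointing at the Routing header's nexthdr byte, which
 * ip6_fragment() then overwrites with NEXTHDR_FRAGMENT.
 */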

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer = rt6_get_peer_create(rt);

		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}
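
/*
 * Note on the fallback above: it is a lock-free increment that never
 * hands out 0, which matters because ip6_fragment()'s slow path uses
 * frag_id == 0 to mean "no identification chosen yet".
 */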

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one is sent. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
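
/*
 * Worked example for the sizing arithmetic above (illustrative
 * numbers): with a 1500-byte MTU and a bare IPv6 header (hlen = 40),
 * the per-fragment data budget is 1500 - 40 - 8 = 1452 bytes; the slow
 * path rounds this down to 1448 (len &= ~7) for every fragment except
 * the last, since fragment offsets are expressed in 8-byte units.
 */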

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected case
	 * is not very simple. Take into account that we do not
	 * support routing by source, TOS, or MSG_DONTROUTE.
	 *						--ANK (980726)
	 *
	 * 1. ip6_rt_check(): if the route is a host route, check
	 *    that the cached destination is current. If it is a
	 *    network route, we can still check its validity using
	 *    a saved pointer to the last-used address: daddr_cache.
	 *    We do not want to save the whole address here (the
	 *    main consumer of this service is TCP, which does not
	 *    have this problem), so this trick works only on
	 *    connected sockets.
	 * 2. The oif must also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up has a neighbour
	 * entry that is not in the VALID state and the source
	 * address from the flow is marked as OPTIMISTIC, we release
	 * the found dst entry and replace it with the dst entry of
	 * the nexthop router instead.
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
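
/*
 * Typical caller pattern for ip6_dst_lookup_flow() (sketch; variable
 * names are assumptions):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 */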

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
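
/*
 * Worked example for the computation above (illustrative numbers):
 * with *mtu = 1500 and fragheaderlen = 40 (a bare IPv6 header),
 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488.  Each non-final
 * fragment then carries 1488 - 40 = 1448 payload bytes (a multiple
 * of 8), and 40 + 8 + 1448 = 1496 bytes go on the wire once the
 * fragment header is inserted.
 */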

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
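
/*
 * Usage sketch for the corking API (modelled loosely on a datagram
 * sendmsg path; names and surrounding setup are assumptions):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */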

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data from the extension headers to the IP header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1679