ip6_output.c revision c547dbf55d5f8cf615ccc0e7265e98db27d3fb8b
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

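/* Final output step before the neighbour layer: loop multicast copies
 * back to local listeners when required, drop packets whose multicast
 * scope is too narrow to leave the node, then resolve (or create) the
 * neighbour entry for the next hop and queue the packet to it.
 */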
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

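/* Called once the POST_ROUTING hook has accepted the packet: fragment
 * it if it exceeds the dst MTU and is not GSO, or if the destination
 * requires all packets to carry a fragment header (dst_allfrag);
 * otherwise hand it straight to ip6_finish_output2().
 */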
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

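/* Output entry point for locally generated packets that already carry
 * a dst: discard everything if IPv6 is administratively disabled on
 * the egress device, otherwise run the netfilter POST_ROUTING hook
 * (skipped for packets flagged IP6SKB_REROUTED) before finishing
 * output.
 */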
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

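/* The caller must have attached a dst to the skb and filled in @fl6;
 * headroom for the IPv6 header, the extension headers in @opt and the
 * link-layer header is grown here when the skb is too small.  A
 * transport protocol would typically call it roughly like this
 * (hypothetical caller, for illustration only):
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 *
 * Oversized packets that may not be fragmented locally (no local_df,
 * not GSO) are refused with -EMSGSIZE and reported through
 * ipv6_local_error().
 */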
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

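/* Deliver a Router Alert packet to every raw socket that registered
 * for this alert value (via the IPV6_ROUTER_ALERT socket option):
 * each matching socket but the last gets its own clone, and the final
 * one consumes the original skb.  Returns 1 if the packet was
 * delivered to at least one socket.
 */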
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

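/* Decide what to do with a packet destined to a proxied neighbour:
 * returns 1 if it is an NDISC message that must be handed to local
 * input, 0 if it may be forwarded, and -1 if it targets a link-local
 * address that cannot be proxy-forwarded (the caller drops it).
 */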
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address are passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

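/* The forwarding path proper: verify that forwarding is enabled and
 * that the packet may legitimately transit this node, emit ICMPv6
 * errors for exceeded hop limits and oversized packets, send a
 * redirect when the packet leaves on the interface it arrived on,
 * then decrement the hop limit and pass the packet to the netfilter
 * FORWARD hook.
 */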
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on router alert packets;
	 *	they are pushed to user level AS IS, with no warranty
	 *	that the application will be able to interpret them.
	 *	The reason is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will follow the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

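/* Fragment towards @output using one of two strategies: if the skb
 * already carries a suitably shaped frag list (every fragment 8-byte
 * aligned except the last, each fitting the MTU, with headroom for
 * the headers), the existing buffers are reused and only fragment
 * headers are spliced in; otherwise the slow path copies the payload
 * into freshly allocated skbs, at most 'mtu' bytes of it at a time.
 */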
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one is sent. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

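/* Helper for ip6_sk_dst_check() below: returns true when the cached
 * route cannot be revalidated for this flow, i.e. it is neither a
 * host route matching the destination nor confirmed by the saved
 * daddr_cache/saddr_cache pointer of a connected socket.
 */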
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

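/* Common tail of the dst lookup helpers below: run the routing lookup
 * if no dst was supplied, choose a source address when the flow left
 * it unspecified, and (with optimistic DAD) fall back to the default
 * router's dst entry while the next hop is unresolved and our chosen
 * source address is still optimistic.
 */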
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

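/* UFO path of ip6_append_data(): rather than building sub-MTU
 * fragments in software, queue a single oversized skb, mark it
 * SKB_GSO_UDP with a gso_size rounded down to a multiple of 8, and
 * let the device (or the software GSO layer) split it on transmit.
 */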
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(&sk->sk_write_queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(&fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

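/* Refresh the corked mtu/maxfraglen pair while appending: outside an
 * XFRM tunnel, the first fragment must additionally reserve
 * dst->header_len, and later fragments track the current path MTU
 * (or the device MTU when probing).
 */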
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				bool pmtuprobe)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = min(*mtu, pmtuprobe ?
				   rt->dst.dev->mtu :
				   dst_mtu(rt->dst.path));
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

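/* Datagram corking workhorse.  A sendmsg() implementation typically
 * drives it together with the two functions at the end of this file,
 * roughly like this (hypothetical caller, shown only to illustrate
 * the contract):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * The first call on an empty write queue sets up the cork (copies the
 * options, takes a route reference, fixes the MTU); later calls keep
 * appending to the queued skbs.
 */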
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM)
		sock_tx_timestamp(sk, &tx_flags);

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    np->pmtudisc ==
						    IPV6_PMTUDISC_PROBE);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

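/* Undo the cork state set up by ip6_append_data(): free the copied
 * extension headers, drop the route reference and clear the cached
 * flow.
 */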
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

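/* Collapse the socket write queue into a single skb chain, prepend
 * the IPv6 header built from the corked flow and options, and send
 * the result with ip6_local_out().  The cork is released whether or
 * not transmission succeeds.
 */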
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

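/* Abort a corked send: discard everything queued on the socket write
 * queue and release the cork state.
 */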
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);