/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO (network byte order)
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

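/* Final transmit step: resolve the next-hop neighbour for the dst and hand
 * the skb to the neighbour output path. Multicast packets are looped back
 * to local listeners when required, and dropped when their scope does not
 * permit transmission on this device.
 */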
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

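/* Fragment the packet when it exceeds the path MTU and cannot be handled
 * by GSO, when the route requires fragmentation on all output (allfrag),
 * or when conntrack defrag recorded a smaller frag_max_size; otherwise go
 * straight to ip6_finish_output2().
 */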
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

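/* Output entry point invoked via dst_output(): drop the packet if IPv6 is
 * disabled on the egress device, otherwise run the netfilter POST_ROUTING
 * hook and continue with ip6_finish_output().
 */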
int ip6_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

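/* Deliver a packet carrying a Router Alert option to every raw socket that
 * registered for this alert value (and, if bound, only to the matching
 * device). Returns 1 if the packet was consumed by at least one socket.
 */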
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

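/* Decide what to do with a forwarded packet destined to a proxied address:
 * 1 means deliver it locally (NDISC messages), -1 means drop it (link-local
 * destination, link failure signalled), 0 means forward it normally.
 */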
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

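/* MTU used on the forwarding path: a locked route metric wins, otherwise
 * fall back to the egress device's IPv6 MTU.
 */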
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

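/* Returns true when the packet must trigger an ICMPV6_PKT_TOOBIG instead
 * of being forwarded, taking conntrack defrag state, ignore_df and the GSO
 * segment size into account.
 */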
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

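/* Forwarding path: validate the packet (hop limit, source address, xfrm
 * policy), honour Router Alert and NDISC proxying, emit redirects when the
 * packet leaves on the interface it arrived on, and finally decrement the
 * hop limit and hand the packet to the netfilter FORWARD hook.
 */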
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will follow the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	   We also don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

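/* Propagate per-packet metadata (type, priority, dst, mark, netfilter and
 * security state) from the original skb to a freshly built fragment.
 */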
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

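/* Pick a fragmentation identification value for this flow, hashed from the
 * route's source and destination prefixes with a boot-time random seed.
 */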
static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static u32 ip6_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));

	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

	id = ip_idents_reserve(hash, 1);
	fhdr->identification = htonl(id);
}

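/* Split an oversized skb into fragments and feed each one to @output. The
 * fast path reuses an existing frag list when its geometry already fits;
 * otherwise the slow path copies the payload into newly allocated skbs.
 */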
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

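/* Helper for ip6_sk_dst_check(): returns non-zero when neither the route's
 * (host) key nor the socket's cached address matches the flow address,
 * i.e. the cached dst can no longer be trusted for this flow.
 */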
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

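/* Validate a dst cached on the socket against the flow: release it and
 * return NULL when it is stale (wrong family, address or oif), otherwise
 * hand it back.
 */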
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

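/* Common tail of the dst lookup helpers: perform the route lookup if no
 * dst was supplied, select a source address when the flow has none, and
 * (with optimistic DAD) retry via the default router when the next-hop
 * neighbour is not yet valid.
 */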
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry
	 * that is not in a valid state, and the source address from
	 * the flow is marked as OPTIMISTIC, we release the found
	 * dst entry and replace it with the dst entry of the
	 * nexthop router instead.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

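/* Append data for UDP fragmentation offload: build (or extend) one large
 * GSO skb and let the device segment it, recording the gso size and the
 * fragment id in the shared info.
 */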
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb packet containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(&sk->sk_write_queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(&fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

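/* Recompute mtu/maxfraglen while appending (only when the route is not an
 * XFRM tunnel): the first fragment must leave room for rt->dst.header_len,
 * later fragments may use that space as data space.
 */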
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

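/* Queue data on the socket's write queue as a chain of packets that
 * ip6_push_pending_frames() will later stitch together and transmit.
 * Handles corking, UFO, timestamping and per-fragment allocation.
 */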
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

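/* Drop everything stashed for a corked socket: the duplicated extension
 * headers, the held dst and the saved flow.
 */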
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

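/* Collapse the socket's write queue into one packet (later fragments
 * chained on frag_list), prepend the IPv6 header and push the result out
 * via ip6_local_out().
 */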
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, past any extension headers */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel));
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

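/* Throw away everything queued by ip6_append_data() and release the cork
 * state, e.g. after an error.
 */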
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);