tcp_ipv4.c revision 0a5ebb8000c5362be368df9d197943deb06b6916
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45 *					year-long coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90#ifdef CONFIG_TCP_MD5SIG
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92						   __be32 addr);
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99	return NULL;
100}
101#endif
102
103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
105
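/* Derive an initial sequence number from the incoming segment's
 * address/port 4-tuple via secure_tcp_sequence_number().
 */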
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, it is safe from the viewpoint
120	   of data integrity. Even without PAWS it is safe provided sequence
121	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123	   Actually, the idea is close to VJ's, only the timestamp cache is
124	   held not per host but per port pair, and the TW bucket is used as
125	   the state holder.
126
127	   If the TW bucket has already been destroyed we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
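		/* Start the new connection's write_seq beyond the old
		 * incarnation's sequence space (its snd_nxt plus roughly one
		 * maximum window) so the two sequence spaces do not overlap.
		 */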
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	__be16 orig_sport, orig_dport;
153	__be32 daddr, nexthop;
154	struct flowi4 *fl4;
155	struct rtable *rt;
156	int err;
157	struct ip_options_rcu *inet_opt;
158
159	if (addr_len < sizeof(struct sockaddr_in))
160		return -EINVAL;
161
162	if (usin->sin_family != AF_INET)
163		return -EAFNOSUPPORT;
164
165	nexthop = daddr = usin->sin_addr.s_addr;
166	inet_opt = rcu_dereference_protected(inet->inet_opt,
167					     sock_owned_by_user(sk));
168	if (inet_opt && inet_opt->opt.srr) {
169		if (!daddr)
170			return -EINVAL;
171		nexthop = inet_opt->opt.faddr;
172	}
173
174	orig_sport = inet->inet_sport;
175	orig_dport = usin->sin_port;
176	fl4 = &inet->cork.fl.u.ip4;
177	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
178			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
179			      IPPROTO_TCP,
180			      orig_sport, orig_dport, sk, true);
181	if (IS_ERR(rt)) {
182		err = PTR_ERR(rt);
183		if (err == -ENETUNREACH)
184			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
185		return err;
186	}
187
188	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189		ip_rt_put(rt);
190		return -ENETUNREACH;
191	}
192
193	if (!inet_opt || !inet_opt->opt.srr)
194		daddr = fl4->daddr;
195
196	if (!inet->inet_saddr)
197		inet->inet_saddr = fl4->saddr;
198	inet->inet_rcv_saddr = inet->inet_saddr;
199
200	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
201		/* Reset inherited state */
202		tp->rx_opt.ts_recent	   = 0;
203		tp->rx_opt.ts_recent_stamp = 0;
204		tp->write_seq		   = 0;
205	}
206
207	if (tcp_death_row.sysctl_tw_recycle &&
208	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
209		struct inet_peer *peer = rt_get_peer(rt);
210		/*
211		 * VJ's idea. We save the last timestamp seen from
212		 * the destination in the peer table when entering
213		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
214		 * when trying a new connection.
215		 */
216		if (peer) {
217			inet_peer_refcheck(peer);
218			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
219				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
220				tp->rx_opt.ts_recent = peer->tcp_ts;
221			}
222		}
223	}
224
225	inet->inet_dport = usin->sin_port;
226	inet->inet_daddr = daddr;
227
228	inet_csk(sk)->icsk_ext_hdr_len = 0;
229	if (inet_opt)
230		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
231
232	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
233
234	/* Socket identity is still unknown (sport may be zero).
235	 * However we set state to SYN-SENT and, without releasing the socket
236	 * lock, select a source port, enter ourselves into the hash tables and
237	 * complete initialization after this.
238	 */
239	tcp_set_state(sk, TCP_SYN_SENT);
240	err = inet_hash_connect(&tcp_death_row, sk);
241	if (err)
242		goto failure;
243
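	/* inet_hash_connect() may have just picked a source port, so
	 * re-validate the route with the finally chosen port pair.
	 */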
244	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
245			       inet->inet_sport, inet->inet_dport, sk);
246	if (IS_ERR(rt)) {
247		err = PTR_ERR(rt);
248		rt = NULL;
249		goto failure;
250	}
251	/* OK, now commit destination to socket.  */
252	sk->sk_gso_type = SKB_GSO_TCPV4;
253	sk_setup_caps(sk, &rt->dst);
254
255	if (!tp->write_seq)
256		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
257							   inet->inet_daddr,
258							   inet->inet_sport,
259							   usin->sin_port);
260
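	/* Seed the IP identification counter from the initial sequence
	 * number and the current jiffies value.
	 */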
261	inet->inet_id = tp->write_seq ^ jiffies;
262
263	err = tcp_connect(sk);
264	rt = NULL;
265	if (err)
266		goto failure;
267
268	return 0;
269
270failure:
271	/*
272	 * This unhashes the socket and releases the local port,
273	 * if necessary.
274	 */
275	tcp_set_state(sk, TCP_CLOSE);
276	ip_rt_put(rt);
277	sk->sk_route_caps = 0;
278	inet->inet_dport = 0;
279	return err;
280}
281EXPORT_SYMBOL(tcp_v4_connect);
282
283/*
284 * This routine does path mtu discovery as defined in RFC1191.
285 */
286static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
287{
288	struct dst_entry *dst;
289	struct inet_sock *inet = inet_sk(sk);
290
291	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
292	 * sent out by Linux are always < 576 bytes so they should go through
293	 * unfragmented).
294	 */
295	if (sk->sk_state == TCP_LISTEN)
296		return;
297
298	/* We don't check in the dst entry if pmtu discovery is forbidden
299	 * on this route. We just assume that no packet-too-big packets
300	 * are sent back when pmtu discovery is not active.
301	 * There is a small race when the user changes this flag in the
302	 * route, but I think that's acceptable.
303	 */
304	if ((dst = __sk_dst_check(sk, 0)) == NULL)
305		return;
306
307	dst->ops->update_pmtu(dst, mtu);
308
309	/* Something is about to go wrong... Remember the soft error
310	 * in case this connection is not able to recover.
311	 */
312	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
313		sk->sk_err_soft = EMSGSIZE;
314
315	mtu = dst_mtu(dst);
316
317	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
318	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
319		tcp_sync_mss(sk, mtu);
320
321		/* Resend the TCP packet because it's
322		 * clear that the old packet has been
323		 * dropped. This is the new "fast" path mtu
324		 * discovery.
325		 */
326		tcp_simple_retransmit(sk);
327	} /* else let the usual retransmit timer handle it */
328}
329
330/*
331 * This routine is called by the ICMP module when it gets some
332 * sort of error condition.  If err < 0 then the socket should
333 * be closed and the error returned to the user.  If err > 0
334 * it's just the icmp type << 8 | icmp code.  After adjustment
335 * header points to the first 8 bytes of the tcp header.  We need
336 * to find the appropriate port.
337 *
338 * The locking strategy used here is very "optimistic". When
339 * someone else accesses the socket the ICMP is just dropped
340 * and for some paths there is no check at all.
341 * A more general error queue to queue errors for later handling
342 * is probably better.
343 *
344 */
345
346void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
347{
348	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
349	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
350	struct inet_connection_sock *icsk;
351	struct tcp_sock *tp;
352	struct inet_sock *inet;
353	const int type = icmp_hdr(icmp_skb)->type;
354	const int code = icmp_hdr(icmp_skb)->code;
355	struct sock *sk;
356	struct sk_buff *skb;
357	__u32 seq;
358	__u32 remaining;
359	int err;
360	struct net *net = dev_net(icmp_skb->dev);
361
362	if (icmp_skb->len < (iph->ihl << 2) + 8) {
363		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
364		return;
365	}
366
367	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
368			iph->saddr, th->source, inet_iif(icmp_skb));
369	if (!sk) {
370		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371		return;
372	}
373	if (sk->sk_state == TCP_TIME_WAIT) {
374		inet_twsk_put(inet_twsk(sk));
375		return;
376	}
377
378	bh_lock_sock(sk);
379	/* If too many ICMPs get dropped on busy
380	 * servers this needs to be solved differently.
381	 */
382	if (sock_owned_by_user(sk))
383		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
384
385	if (sk->sk_state == TCP_CLOSE)
386		goto out;
387
388	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
389		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
390		goto out;
391	}
392
393	icsk = inet_csk(sk);
394	tp = tcp_sk(sk);
395	seq = ntohl(th->seq);
396	if (sk->sk_state != TCP_LISTEN &&
397	    !between(seq, tp->snd_una, tp->snd_nxt)) {
398		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
399		goto out;
400	}
401
402	switch (type) {
403	case ICMP_SOURCE_QUENCH:
404		/* Just silently ignore these. */
405		goto out;
406	case ICMP_PARAMETERPROB:
407		err = EPROTO;
408		break;
409	case ICMP_DEST_UNREACH:
410		if (code > NR_ICMP_UNREACH)
411			goto out;
412
413		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
414			if (!sock_owned_by_user(sk))
415				do_pmtu_discovery(sk, iph, info);
416			goto out;
417		}
418
419		err = icmp_err_convert[code].errno;
420		/* check if icmp_skb allows revert of backoff
421		 * (see draft-zimmermann-tcp-lcd) */
422		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423			break;
424		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425		    !icsk->icsk_backoff)
426			break;
427
428		if (sock_owned_by_user(sk))
429			break;
430
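		/* Revert one step of exponential backoff and re-arm the
		 * retransmit timer with whatever remains of the shortened
		 * RTO (or retransmit immediately if it has already elapsed).
		 */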
431		icsk->icsk_backoff--;
432		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
433					 icsk->icsk_backoff;
434		tcp_bound_rto(sk);
435
436		skb = tcp_write_queue_head(sk);
437		BUG_ON(!skb);
438
439		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
440				tcp_time_stamp - TCP_SKB_CB(skb)->when);
441
442		if (remaining) {
443			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
444						  remaining, TCP_RTO_MAX);
445		} else {
446			/* The reverted RTO has already expired,
447			 * so retransmit now. */
448			tcp_retransmit_timer(sk);
449		}
450
451		break;
452	case ICMP_TIME_EXCEEDED:
453		err = EHOSTUNREACH;
454		break;
455	default:
456		goto out;
457	}
458
459	switch (sk->sk_state) {
460		struct request_sock *req, **prev;
461	case TCP_LISTEN:
462		if (sock_owned_by_user(sk))
463			goto out;
464
465		req = inet_csk_search_req(sk, &prev, th->dest,
466					  iph->daddr, iph->saddr);
467		if (!req)
468			goto out;
469
470		/* ICMPs are not backlogged, hence we cannot get
471		   an established socket here.
472		 */
473		WARN_ON(req->sk);
474
475		if (seq != tcp_rsk(req)->snt_isn) {
476			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
477			goto out;
478		}
479
480		/*
481		 * Still in SYN_RECV, just remove it silently.
482		 * There is no good way to pass the error to the newly
483		 * created socket, and POSIX does not want network
484		 * errors returned from accept().
485		 */
486		inet_csk_reqsk_queue_drop(sk, req, prev);
487		goto out;
488
489	case TCP_SYN_SENT:
490	case TCP_SYN_RECV:  /* Cannot happen normally,
491			       but it can e.g. if SYNs crossed.
492			     */
493		if (!sock_owned_by_user(sk)) {
494			sk->sk_err = err;
495
496			sk->sk_error_report(sk);
497
498			tcp_done(sk);
499		} else {
500			sk->sk_err_soft = err;
501		}
502		goto out;
503	}
504
505	/* If we've already connected we will keep trying
506	 * until we time out, or the user gives up.
507	 *
508	 * rfc1122 4.2.3.9 allows considering as hard errors
509	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
510	 * but it is obsoleted by pmtu discovery).
511	 *
512	 * Note that in the modern internet, where routing is unreliable
513	 * and broken firewalls sit in every dark corner sending random
514	 * errors ordered by their masters, even these two messages finally
515	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
516	 *
517	 * Now we are in compliance with RFCs.
518	 *							--ANK (980905)
519	 */
520
521	inet = inet_sk(sk);
522	if (!sock_owned_by_user(sk) && inet->recverr) {
523		sk->sk_err = err;
524		sk->sk_error_report(sk);
525	} else	{ /* Only an error on timeout */
526		sk->sk_err_soft = err;
527	}
528
529out:
530	bh_unlock_sock(sk);
531	sock_put(sk);
532}
533
534static void __tcp_v4_send_check(struct sk_buff *skb,
535				__be32 saddr, __be32 daddr)
536{
537	struct tcphdr *th = tcp_hdr(skb);
538
539	if (skb->ip_summed == CHECKSUM_PARTIAL) {
540		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
541		skb->csum_start = skb_transport_header(skb) - skb->head;
542		skb->csum_offset = offsetof(struct tcphdr, check);
543	} else {
544		th->check = tcp_v4_check(skb->len, saddr, daddr,
545					 csum_partial(th,
546						      th->doff << 2,
547						      skb->csum));
548	}
549}
550
551/* This routine computes an IPv4 TCP checksum. */
552void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
553{
554	struct inet_sock *inet = inet_sk(sk);
555
556	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
557}
558EXPORT_SYMBOL(tcp_v4_send_check);
559
560int tcp_v4_gso_send_check(struct sk_buff *skb)
561{
562	const struct iphdr *iph;
563	struct tcphdr *th;
564
565	if (!pskb_may_pull(skb, sizeof(*th)))
566		return -EINVAL;
567
568	iph = ip_hdr(skb);
569	th = tcp_hdr(skb);
570
571	th->check = 0;
572	skb->ip_summed = CHECKSUM_PARTIAL;
573	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
574	return 0;
575}
576
577/*
578 *	This routine will send an RST to the other tcp.
579 *
580 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
581 *		      for reset.
582 *	Answer: if a packet caused the RST, it is not for a socket
583 *		existing in our system; if it is matched to a socket,
584 *		it is just a duplicate segment or a bug in the other side's TCP.
585 *		So we build the reply based only on the parameters
586 *		that arrived with the segment.
587 *	Exception: precedence violation. We do not implement it in any case.
588 */
589
590static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
591{
592	struct tcphdr *th = tcp_hdr(skb);
593	struct {
594		struct tcphdr th;
595#ifdef CONFIG_TCP_MD5SIG
596		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597#endif
598	} rep;
599	struct ip_reply_arg arg;
600#ifdef CONFIG_TCP_MD5SIG
601	struct tcp_md5sig_key *key;
602#endif
603	struct net *net;
604
605	/* Never send a reset in response to a reset. */
606	if (th->rst)
607		return;
608
609	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
610		return;
611
612	/* Swap the send and the receive. */
613	memset(&rep, 0, sizeof(rep));
614	rep.th.dest   = th->source;
615	rep.th.source = th->dest;
616	rep.th.doff   = sizeof(struct tcphdr) / 4;
617	rep.th.rst    = 1;
618
619	if (th->ack) {
620		rep.th.seq = th->ack_seq;
621	} else {
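		/* No ACK to echo: acknowledge everything the offending
		 * segment occupied in sequence space (SYN and FIN each
		 * count as one, plus the payload length).
		 */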
622		rep.th.ack = 1;
623		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624				       skb->len - (th->doff << 2));
625	}
626
627	memset(&arg, 0, sizeof(arg));
628	arg.iov[0].iov_base = (unsigned char *)&rep;
629	arg.iov[0].iov_len  = sizeof(rep.th);
630
631#ifdef CONFIG_TCP_MD5SIG
632	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
633	if (key) {
634		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
635				   (TCPOPT_NOP << 16) |
636				   (TCPOPT_MD5SIG << 8) |
637				   TCPOLEN_MD5SIG);
638		/* Update length and the length the header thinks exists */
639		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
640		rep.th.doff = arg.iov[0].iov_len / 4;
641
642		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
643				     key, ip_hdr(skb)->saddr,
644				     ip_hdr(skb)->daddr, &rep.th);
645	}
646#endif
647	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
648				      ip_hdr(skb)->saddr, /* XXX */
649				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
650	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
651	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
652
653	net = dev_net(skb_dst(skb)->dev);
654	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
655		      &arg, arg.iov[0].iov_len);
656
657	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
658	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
659}
660
661/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
662   outside socket context, is certainly ugly. What can I do?
663 */
664
665static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
666			    u32 win, u32 ts, int oif,
667			    struct tcp_md5sig_key *key,
668			    int reply_flags)
669{
670	struct tcphdr *th = tcp_hdr(skb);
671	struct {
672		struct tcphdr th;
673		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
674#ifdef CONFIG_TCP_MD5SIG
675			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
676#endif
677			];
678	} rep;
679	struct ip_reply_arg arg;
680	struct net *net = dev_net(skb_dst(skb)->dev);
681
682	memset(&rep.th, 0, sizeof(struct tcphdr));
683	memset(&arg, 0, sizeof(arg));
684
685	arg.iov[0].iov_base = (unsigned char *)&rep;
686	arg.iov[0].iov_len  = sizeof(rep.th);
687	if (ts) {
688		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
689				   (TCPOPT_TIMESTAMP << 8) |
690				   TCPOLEN_TIMESTAMP);
691		rep.opt[1] = htonl(tcp_time_stamp);
692		rep.opt[2] = htonl(ts);
693		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
694	}
695
696	/* Swap the send and the receive. */
697	rep.th.dest    = th->source;
698	rep.th.source  = th->dest;
699	rep.th.doff    = arg.iov[0].iov_len / 4;
700	rep.th.seq     = htonl(seq);
701	rep.th.ack_seq = htonl(ack);
702	rep.th.ack     = 1;
703	rep.th.window  = htons(win);
704
705#ifdef CONFIG_TCP_MD5SIG
706	if (key) {
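		/* If a timestamp option was written above, it occupies
		 * opt[0..2], so the MD5 option is placed at opt[3].
		 */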
707		int offset = (ts) ? 3 : 0;
708
709		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
710					  (TCPOPT_NOP << 16) |
711					  (TCPOPT_MD5SIG << 8) |
712					  TCPOLEN_MD5SIG);
713		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
714		rep.th.doff = arg.iov[0].iov_len/4;
715
716		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
717				    key, ip_hdr(skb)->saddr,
718				    ip_hdr(skb)->daddr, &rep.th);
719	}
720#endif
721	arg.flags = reply_flags;
722	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
723				      ip_hdr(skb)->saddr, /* XXX */
724				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
725	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
726	if (oif)
727		arg.bound_dev_if = oif;
728
729	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
730		      &arg, arg.iov[0].iov_len);
731
732	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
733}
734
735static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
736{
737	struct inet_timewait_sock *tw = inet_twsk(sk);
738	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
739
740	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
741			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
742			tcptw->tw_ts_recent,
743			tw->tw_bound_dev_if,
744			tcp_twsk_md5_key(tcptw),
745			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
746			);
747
748	inet_twsk_put(tw);
749}
750
751static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
752				  struct request_sock *req)
753{
754	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
755			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
756			req->ts_recent,
757			0,
758			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
759			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
760}
761
762/*
763 *	Send a SYN-ACK after having received a SYN.
764 *	This still operates on a request_sock only, not on a big
765 *	socket.
766 */
767static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768			      struct request_sock *req,
769			      struct request_values *rvp)
770{
771	const struct inet_request_sock *ireq = inet_rsk(req);
772	int err = -1;
773	struct sk_buff * skb;
774
775	/* First, grab a route. */
776	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
777		return -1;
778
779	skb = tcp_make_synack(sk, dst, req, rvp);
780
781	if (skb) {
782		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
783
784		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
785					    ireq->rmt_addr,
786					    ireq->opt);
787		err = net_xmit_eval(err);
788	}
789
790	dst_release(dst);
791	return err;
792}
793
794static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
795			      struct request_values *rvp)
796{
797	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
798	return tcp_v4_send_synack(sk, NULL, req, rvp);
799}
800
801/*
802 *	IPv4 request_sock destructor.
803 */
804static void tcp_v4_reqsk_destructor(struct request_sock *req)
805{
806	kfree(inet_rsk(req)->opt);
807}
808
809static void syn_flood_warning(const struct sk_buff *skb)
810{
811	const char *msg;
812
813#ifdef CONFIG_SYN_COOKIES
814	if (sysctl_tcp_syncookies)
815		msg = "Sending cookies";
816	else
817#endif
818		msg = "Dropping request";
819
820	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
821				ntohs(tcp_hdr(skb)->dest), msg);
822}
823
824/*
825 * Save and compile IPv4 options into the request_sock if needed.
826 */
827static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
828						  struct sk_buff *skb)
829{
830	const struct ip_options *opt = &(IPCB(skb)->opt);
831	struct ip_options_rcu *dopt = NULL;
832
833	if (opt && opt->optlen) {
834		int opt_size = sizeof(*dopt) + opt->optlen;
835
836		dopt = kmalloc(opt_size, GFP_ATOMIC);
837		if (dopt) {
838			if (ip_options_echo(&dopt->opt, skb)) {
839				kfree(dopt);
840				dopt = NULL;
841			}
842		}
843	}
844	return dopt;
845}
846
847#ifdef CONFIG_TCP_MD5SIG
848/*
849 * RFC2385 MD5 checksumming requires a mapping of
850 * IP address->MD5 Key.
851 * We need to maintain these in the sk structure.
852 */
853
854/* Find the Key structure for an address.  */
855static struct tcp_md5sig_key *
856			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
857{
858	struct tcp_sock *tp = tcp_sk(sk);
859	int i;
860
861	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
862		return NULL;
863	for (i = 0; i < tp->md5sig_info->entries4; i++) {
864		if (tp->md5sig_info->keys4[i].addr == addr)
865			return &tp->md5sig_info->keys4[i].base;
866	}
867	return NULL;
868}
869
870struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
871					 struct sock *addr_sk)
872{
873	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
874}
875EXPORT_SYMBOL(tcp_v4_md5_lookup);
876
877static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
878						      struct request_sock *req)
879{
880	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
881}
882
883/* This can be called on a newly created socket, from other files */
884int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
885		      u8 *newkey, u8 newkeylen)
886{
887	/* Add Key to the list */
888	struct tcp_md5sig_key *key;
889	struct tcp_sock *tp = tcp_sk(sk);
890	struct tcp4_md5sig_key *keys;
891
892	key = tcp_v4_md5_do_lookup(sk, addr);
893	if (key) {
894		/* Pre-existing entry - just update that one. */
895		kfree(key->key);
896		key->key = newkey;
897		key->keylen = newkeylen;
898	} else {
899		struct tcp_md5sig_info *md5sig;
900
901		if (!tp->md5sig_info) {
902			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
903						  GFP_ATOMIC);
904			if (!tp->md5sig_info) {
905				kfree(newkey);
906				return -ENOMEM;
907			}
908			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
909		}
910		if (tcp_alloc_md5sig_pool(sk) == NULL) {
911			kfree(newkey);
912			return -ENOMEM;
913		}
914		md5sig = tp->md5sig_info;
915
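		/* No free slot left: grow the keys4 array by one entry. */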
916		if (md5sig->alloced4 == md5sig->entries4) {
917			keys = kmalloc((sizeof(*keys) *
918					(md5sig->entries4 + 1)), GFP_ATOMIC);
919			if (!keys) {
920				kfree(newkey);
921				tcp_free_md5sig_pool();
922				return -ENOMEM;
923			}
924
925			if (md5sig->entries4)
926				memcpy(keys, md5sig->keys4,
927				       sizeof(*keys) * md5sig->entries4);
928
929			/* Free old key list, and reference new one */
930			kfree(md5sig->keys4);
931			md5sig->keys4 = keys;
932			md5sig->alloced4++;
933		}
934		md5sig->entries4++;
935		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
936		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
937		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
938	}
939	return 0;
940}
941EXPORT_SYMBOL(tcp_v4_md5_do_add);
942
943static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
944			       u8 *newkey, u8 newkeylen)
945{
946	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
947				 newkey, newkeylen);
948}
949
950int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
951{
952	struct tcp_sock *tp = tcp_sk(sk);
953	int i;
954
955	for (i = 0; i < tp->md5sig_info->entries4; i++) {
956		if (tp->md5sig_info->keys4[i].addr == addr) {
957			/* Free the key */
958			kfree(tp->md5sig_info->keys4[i].base.key);
959			tp->md5sig_info->entries4--;
960
961			if (tp->md5sig_info->entries4 == 0) {
962				kfree(tp->md5sig_info->keys4);
963				tp->md5sig_info->keys4 = NULL;
964				tp->md5sig_info->alloced4 = 0;
965			} else if (tp->md5sig_info->entries4 != i) {
966				/* Need to do some manipulation */
967				memmove(&tp->md5sig_info->keys4[i],
968					&tp->md5sig_info->keys4[i+1],
969					(tp->md5sig_info->entries4 - i) *
970					 sizeof(struct tcp4_md5sig_key));
971			}
972			tcp_free_md5sig_pool();
973			return 0;
974		}
975	}
976	return -ENOENT;
977}
978EXPORT_SYMBOL(tcp_v4_md5_do_del);
979
980static void tcp_v4_clear_md5_list(struct sock *sk)
981{
982	struct tcp_sock *tp = tcp_sk(sk);
983
984	/* Free each key, then the set of keys,
985	 * the crypto element, and then decrement our
986	 * hold on the last resort crypto.
987	 */
988	if (tp->md5sig_info->entries4) {
989		int i;
990		for (i = 0; i < tp->md5sig_info->entries4; i++)
991			kfree(tp->md5sig_info->keys4[i].base.key);
992		tp->md5sig_info->entries4 = 0;
993		tcp_free_md5sig_pool();
994	}
995	if (tp->md5sig_info->keys4) {
996		kfree(tp->md5sig_info->keys4);
997		tp->md5sig_info->keys4 = NULL;
998		tp->md5sig_info->alloced4  = 0;
999	}
1000}
1001
1002static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1003				 int optlen)
1004{
1005	struct tcp_md5sig cmd;
1006	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1007	u8 *newkey;
1008
1009	if (optlen < sizeof(cmd))
1010		return -EINVAL;
1011
1012	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1013		return -EFAULT;
1014
1015	if (sin->sin_family != AF_INET)
1016		return -EINVAL;
1017
1018	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1019		if (!tcp_sk(sk)->md5sig_info)
1020			return -ENOENT;
1021		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1022	}
1023
1024	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1025		return -EINVAL;
1026
1027	if (!tcp_sk(sk)->md5sig_info) {
1028		struct tcp_sock *tp = tcp_sk(sk);
1029		struct tcp_md5sig_info *p;
1030
1031		p = kzalloc(sizeof(*p), sk->sk_allocation);
1032		if (!p)
1033			return -EINVAL;
1034
1035		tp->md5sig_info = p;
1036		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1037	}
1038
1039	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1040	if (!newkey)
1041		return -ENOMEM;
1042	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1043				 newkey, cmd.tcpm_keylen);
1044}
1045
1046static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1047					__be32 daddr, __be32 saddr, int nbytes)
1048{
1049	struct tcp4_pseudohdr *bp;
1050	struct scatterlist sg;
1051
1052	bp = &hp->md5_blk.ip4;
1053
1054	/*
1055	 * 1. the TCP pseudo-header (in the order: source IP address,
1056	 * destination IP address, zero-padded protocol number, and
1057	 * segment length)
1058	 */
1059	bp->saddr = saddr;
1060	bp->daddr = daddr;
1061	bp->pad = 0;
1062	bp->protocol = IPPROTO_TCP;
1063	bp->len = cpu_to_be16(nbytes);
1064
1065	sg_init_one(&sg, bp, sizeof(*bp));
1066	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1067}
1068
1069static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1070			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1071{
1072	struct tcp_md5sig_pool *hp;
1073	struct hash_desc *desc;
1074
1075	hp = tcp_get_md5sig_pool();
1076	if (!hp)
1077		goto clear_hash_noput;
1078	desc = &hp->md5_desc;
1079
1080	if (crypto_hash_init(desc))
1081		goto clear_hash;
1082	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1083		goto clear_hash;
1084	if (tcp_md5_hash_header(hp, th))
1085		goto clear_hash;
1086	if (tcp_md5_hash_key(hp, key))
1087		goto clear_hash;
1088	if (crypto_hash_final(desc, md5_hash))
1089		goto clear_hash;
1090
1091	tcp_put_md5sig_pool();
1092	return 0;
1093
1094clear_hash:
1095	tcp_put_md5sig_pool();
1096clear_hash_noput:
1097	memset(md5_hash, 0, 16);
1098	return 1;
1099}
1100
1101int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1102			struct sock *sk, struct request_sock *req,
1103			struct sk_buff *skb)
1104{
1105	struct tcp_md5sig_pool *hp;
1106	struct hash_desc *desc;
1107	struct tcphdr *th = tcp_hdr(skb);
1108	__be32 saddr, daddr;
1109
1110	if (sk) {
1111		saddr = inet_sk(sk)->inet_saddr;
1112		daddr = inet_sk(sk)->inet_daddr;
1113	} else if (req) {
1114		saddr = inet_rsk(req)->loc_addr;
1115		daddr = inet_rsk(req)->rmt_addr;
1116	} else {
1117		const struct iphdr *iph = ip_hdr(skb);
1118		saddr = iph->saddr;
1119		daddr = iph->daddr;
1120	}
1121
1122	hp = tcp_get_md5sig_pool();
1123	if (!hp)
1124		goto clear_hash_noput;
1125	desc = &hp->md5_desc;
1126
1127	if (crypto_hash_init(desc))
1128		goto clear_hash;
1129
1130	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1131		goto clear_hash;
1132	if (tcp_md5_hash_header(hp, th))
1133		goto clear_hash;
1134	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1135		goto clear_hash;
1136	if (tcp_md5_hash_key(hp, key))
1137		goto clear_hash;
1138	if (crypto_hash_final(desc, md5_hash))
1139		goto clear_hash;
1140
1141	tcp_put_md5sig_pool();
1142	return 0;
1143
1144clear_hash:
1145	tcp_put_md5sig_pool();
1146clear_hash_noput:
1147	memset(md5_hash, 0, 16);
1148	return 1;
1149}
1150EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1151
1152static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1153{
1154	/*
1155	 * This gets called for each TCP segment that arrives
1156	 * so we want to be efficient.
1157	 * We have 3 drop cases:
1158	 * o No MD5 hash and one expected.
1159	 * o MD5 hash and we're not expecting one.
1160	 * o MD5 hash and its wrong.
1161	 * o MD5 hash and it's wrong.
1162	__u8 *hash_location = NULL;
1163	struct tcp_md5sig_key *hash_expected;
1164	const struct iphdr *iph = ip_hdr(skb);
1165	struct tcphdr *th = tcp_hdr(skb);
1166	int genhash;
1167	unsigned char newhash[16];
1168
1169	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1170	hash_location = tcp_parse_md5sig_option(th);
1171
1172	/* We've parsed the options - do we have a hash? */
1173	if (!hash_expected && !hash_location)
1174		return 0;
1175
1176	if (hash_expected && !hash_location) {
1177		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1178		return 1;
1179	}
1180
1181	if (!hash_expected && hash_location) {
1182		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1183		return 1;
1184	}
1185
1186	/* Okay, so this is hash_expected and hash_location -
1187	 * so we need to calculate the checksum.
1188	 */
1189	genhash = tcp_v4_md5_hash_skb(newhash,
1190				      hash_expected,
1191				      NULL, NULL, skb);
1192
1193	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1194		if (net_ratelimit()) {
1195			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1196			       &iph->saddr, ntohs(th->source),
1197			       &iph->daddr, ntohs(th->dest),
1198			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1199		}
1200		return 1;
1201	}
1202	return 0;
1203}
1204
1205#endif
1206
1207struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1208	.family		=	PF_INET,
1209	.obj_size	=	sizeof(struct tcp_request_sock),
1210	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1211	.send_ack	=	tcp_v4_reqsk_send_ack,
1212	.destructor	=	tcp_v4_reqsk_destructor,
1213	.send_reset	=	tcp_v4_send_reset,
1214	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1215};
1216
1217#ifdef CONFIG_TCP_MD5SIG
1218static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1219	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1220	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1221};
1222#endif
1223
1224int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1225{
1226	struct tcp_extend_values tmp_ext;
1227	struct tcp_options_received tmp_opt;
1228	u8 *hash_location;
1229	struct request_sock *req;
1230	struct inet_request_sock *ireq;
1231	struct tcp_sock *tp = tcp_sk(sk);
1232	struct dst_entry *dst = NULL;
1233	__be32 saddr = ip_hdr(skb)->saddr;
1234	__be32 daddr = ip_hdr(skb)->daddr;
1235	__u32 isn = TCP_SKB_CB(skb)->when;
1236#ifdef CONFIG_SYN_COOKIES
1237	int want_cookie = 0;
1238#else
1239#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1240#endif
1241
1242	/* Never answer SYNs sent to broadcast or multicast */
1243	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1244		goto drop;
1245
1246	/* TW buckets are converted to open requests without
1247	 * limitations, they conserve resources and the peer is
1248	 * evidently a real one.
1249	 */
1250	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1251		if (net_ratelimit())
1252			syn_flood_warning(skb);
1253#ifdef CONFIG_SYN_COOKIES
1254		if (sysctl_tcp_syncookies) {
1255			want_cookie = 1;
1256		} else
1257#endif
1258		goto drop;
1259	}
1260
1261	/* Accept backlog is full. If we have already queued enough
1262	 * warm entries in the syn queue, drop the request. It is better than
1263	 * clogging the syn queue with openreqs with exponentially increasing
1264	 * timeout.
1265	 */
1266	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1267		goto drop;
1268
1269	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1270	if (!req)
1271		goto drop;
1272
1273#ifdef CONFIG_TCP_MD5SIG
1274	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1275#endif
1276
1277	tcp_clear_options(&tmp_opt);
1278	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1279	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1280	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1281
1282	if (tmp_opt.cookie_plus > 0 &&
1283	    tmp_opt.saw_tstamp &&
1284	    !tp->rx_opt.cookie_out_never &&
1285	    (sysctl_tcp_cookie_size > 0 ||
1286	     (tp->cookie_values != NULL &&
1287	      tp->cookie_values->cookie_desired > 0))) {
1288		u8 *c;
1289		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1290		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1291
1292		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1293			goto drop_and_release;
1294
1295		/* Secret recipe starts with IP addresses */
1296		*mess++ ^= (__force u32)daddr;
1297		*mess++ ^= (__force u32)saddr;
1298
1299		/* plus variable length Initiator Cookie */
1300		c = (u8 *)mess;
1301		while (l-- > 0)
1302			*c++ ^= *hash_location++;
1303
1304#ifdef CONFIG_SYN_COOKIES
1305		want_cookie = 0;	/* not our kind of cookie */
1306#endif
1307		tmp_ext.cookie_out_never = 0; /* false */
1308		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1309	} else if (!tp->rx_opt.cookie_in_always) {
1310		/* redundant indications, but ensure initialization. */
1311		tmp_ext.cookie_out_never = 1; /* true */
1312		tmp_ext.cookie_plus = 0;
1313	} else {
1314		goto drop_and_release;
1315	}
1316	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1317
1318	if (want_cookie && !tmp_opt.saw_tstamp)
1319		tcp_clear_options(&tmp_opt);
1320
1321	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1322	tcp_openreq_init(req, &tmp_opt, skb);
1323
1324	ireq = inet_rsk(req);
1325	ireq->loc_addr = daddr;
1326	ireq->rmt_addr = saddr;
1327	ireq->no_srccheck = inet_sk(sk)->transparent;
1328	ireq->opt = tcp_v4_save_options(sk, skb);
1329
1330	if (security_inet_conn_request(sk, skb, req))
1331		goto drop_and_free;
1332
1333	if (!want_cookie || tmp_opt.tstamp_ok)
1334		TCP_ECN_create_request(req, tcp_hdr(skb));
1335
1336	if (want_cookie) {
1337		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1338		req->cookie_ts = tmp_opt.tstamp_ok;
1339	} else if (!isn) {
1340		struct inet_peer *peer = NULL;
1341
1342		/* VJ's idea. We save the last timestamp seen
1343		 * from the destination in the peer table when entering
1344		 * state TIME-WAIT, and check against it before
1345		 * accepting a new connection request.
1346		 *
1347		 * If "isn" is not zero, this request hit an alive
1348		 * timewait bucket, so that all the necessary checks
1349		 * are made in the function processing timewait state.
1350		 */
1351		if (tmp_opt.saw_tstamp &&
1352		    tcp_death_row.sysctl_tw_recycle &&
1353		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1354		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1355		    peer->daddr.addr.a4 == saddr) {
1356			inet_peer_refcheck(peer);
1357			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1358			    (s32)(peer->tcp_ts - req->ts_recent) >
1359							TCP_PAWS_WINDOW) {
1360				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1361				goto drop_and_release;
1362			}
1363		}
1364		/* Kill the following clause, if you dislike this way. */
1365		else if (!sysctl_tcp_syncookies &&
1366			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1367			  (sysctl_max_syn_backlog >> 2)) &&
1368			 (!peer || !peer->tcp_ts_stamp) &&
1369			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1370			/* Without syncookies the last quarter of the
1371			 * backlog is filled with destinations
1372			 * proven to be alive.
1373			 * It means that we continue to communicate
1374			 * with destinations already remembered
1375			 * at the moment of the synflood.
1376			 */
1377			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1378				       &saddr, ntohs(tcp_hdr(skb)->source));
1379			goto drop_and_release;
1380		}
1381
1382		isn = tcp_v4_init_sequence(skb);
1383	}
1384	tcp_rsk(req)->snt_isn = isn;
1385
1386	if (tcp_v4_send_synack(sk, dst, req,
1387			       (struct request_values *)&tmp_ext) ||
1388	    want_cookie)
1389		goto drop_and_free;
1390
1391	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1392	return 0;
1393
1394drop_and_release:
1395	dst_release(dst);
1396drop_and_free:
1397	reqsk_free(req);
1398drop:
1399	return 0;
1400}
1401EXPORT_SYMBOL(tcp_v4_conn_request);
1402
1403
1404/*
1405 * The three way handshake has completed - we got a valid synack -
1406 * now create the new socket.
1407 */
1408struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409				  struct request_sock *req,
1410				  struct dst_entry *dst)
1411{
1412	struct inet_request_sock *ireq;
1413	struct inet_sock *newinet;
1414	struct tcp_sock *newtp;
1415	struct sock *newsk;
1416#ifdef CONFIG_TCP_MD5SIG
1417	struct tcp_md5sig_key *key;
1418#endif
1419	struct ip_options_rcu *inet_opt;
1420
1421	if (sk_acceptq_is_full(sk))
1422		goto exit_overflow;
1423
1424	newsk = tcp_create_openreq_child(sk, req, skb);
1425	if (!newsk)
1426		goto exit_nonewsk;
1427
1428	newsk->sk_gso_type = SKB_GSO_TCPV4;
1429
1430	newtp		      = tcp_sk(newsk);
1431	newinet		      = inet_sk(newsk);
1432	ireq		      = inet_rsk(req);
1433	newinet->inet_daddr   = ireq->rmt_addr;
1434	newinet->inet_rcv_saddr = ireq->loc_addr;
1435	newinet->inet_saddr	      = ireq->loc_addr;
1436	inet_opt	      = ireq->opt;
1437	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1438	ireq->opt	      = NULL;
1439	newinet->mc_index     = inet_iif(skb);
1440	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1441	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1442	if (inet_opt)
1443		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1444	newinet->inet_id = newtp->write_seq ^ jiffies;
1445
1446	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1447		goto put_and_exit;
1448
1449	sk_setup_caps(newsk, dst);
1450
1451	tcp_mtup_init(newsk);
1452	tcp_sync_mss(newsk, dst_mtu(dst));
1453	newtp->advmss = dst_metric_advmss(dst);
1454	if (tcp_sk(sk)->rx_opt.user_mss &&
1455	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1456		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1457
1458	tcp_initialize_rcv_mss(newsk);
1459
1460#ifdef CONFIG_TCP_MD5SIG
1461	/* Copy over the MD5 key from the original socket */
1462	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1463	if (key != NULL) {
1464		/*
1465		 * We're using one, so create a matching key
1466		 * on the newsk structure. If we fail to get
1467		 * memory, then we end up not copying the key
1468		 * across. Shucks.
1469		 */
1470		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1471		if (newkey != NULL)
1472			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1473					  newkey, key->keylen);
1474		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1475	}
1476#endif
1477
1478	if (__inet_inherit_port(sk, newsk) < 0)
1479		goto put_and_exit;
1480	__inet_hash_nolisten(newsk, NULL);
1481
1482	return newsk;
1483
1484exit_overflow:
1485	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486exit_nonewsk:
1487	dst_release(dst);
1488exit:
1489	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1490	return NULL;
1491put_and_exit:
1492	sock_put(newsk);
1493	goto exit;
1494}
1495EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1496
1497static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1498{
1499	struct tcphdr *th = tcp_hdr(skb);
1500	const struct iphdr *iph = ip_hdr(skb);
1501	struct sock *nsk;
1502	struct request_sock **prev;
1503	/* Find possible connection requests. */
1504	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1505						       iph->saddr, iph->daddr);
1506	if (req)
1507		return tcp_check_req(sk, skb, req, prev);
1508
1509	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1510			th->source, iph->daddr, th->dest, inet_iif(skb));
1511
1512	if (nsk) {
1513		if (nsk->sk_state != TCP_TIME_WAIT) {
1514			bh_lock_sock(nsk);
1515			return nsk;
1516		}
1517		inet_twsk_put(inet_twsk(nsk));
1518		return NULL;
1519	}
1520
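	/* No matching request and no established socket: a non-SYN segment
	 * may be the ACK that completes a handshake we answered with a SYN
	 * cookie, so try to validate it as one.
	 */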
1521#ifdef CONFIG_SYN_COOKIES
1522	if (!th->syn)
1523		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1524#endif
1525	return sk;
1526}
1527
1528static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1529{
1530	const struct iphdr *iph = ip_hdr(skb);
1531
1532	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1533		if (!tcp_v4_check(skb->len, iph->saddr,
1534				  iph->daddr, skb->csum)) {
1535			skb->ip_summed = CHECKSUM_UNNECESSARY;
1536			return 0;
1537		}
1538	}
1539
1540	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1541				       skb->len, IPPROTO_TCP, 0);
1542
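	/* Small packets are cheap to verify immediately; larger ones keep
	 * the pseudo-header sum so the check can be completed later (for
	 * example while the data is copied to user space).
	 */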
1543	if (skb->len <= 76) {
1544		return __skb_checksum_complete(skb);
1545	}
1546	return 0;
1547}
1548
1549
1550/* The socket must have its spinlock held when we get
1551 * here.
1552 *
1553 * We have a potential double-lock case here, so even when
1554 * doing backlog processing we use the BH locking scheme.
1555 * This is because we cannot sleep with the original spinlock
1556 * held.
1557 */
1558int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1559{
1560	struct sock *rsk;
1561#ifdef CONFIG_TCP_MD5SIG
1562	/*
1563	 * We really want to reject the packet as early as possible
1564	 * if:
1565	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1566	 *  o There is an MD5 option and we're not expecting one
1567	 */
1568	if (tcp_v4_inbound_md5_hash(sk, skb))
1569		goto discard;
1570#endif
1571
1572	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1573		sock_rps_save_rxhash(sk, skb->rxhash);
1574		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1575			rsk = sk;
1576			goto reset;
1577		}
1578		return 0;
1579	}
1580
1581	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1582		goto csum_err;
1583
1584	if (sk->sk_state == TCP_LISTEN) {
1585		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1586		if (!nsk)
1587			goto discard;
1588
1589		if (nsk != sk) {
1590			if (tcp_child_process(sk, nsk, skb)) {
1591				rsk = nsk;
1592				goto reset;
1593			}
1594			return 0;
1595		}
1596	} else
1597		sock_rps_save_rxhash(sk, skb->rxhash);
1598
1599	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1600		rsk = sk;
1601		goto reset;
1602	}
1603	return 0;
1604
1605reset:
1606	tcp_v4_send_reset(rsk, skb);
1607discard:
1608	kfree_skb(skb);
1609	/* Be careful here. If this function gets more complicated and
1610	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1611	 * might be destroyed here. This current version compiles correctly,
1612	 * but you have been warned.
1613	 */
1614	return 0;
1615
1616csum_err:
1617	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1618	goto discard;
1619}
1620EXPORT_SYMBOL(tcp_v4_do_rcv);
1621
1622/*
1623 *	From tcp_input.c
1624 */
1625
1626int tcp_v4_rcv(struct sk_buff *skb)
1627{
1628	const struct iphdr *iph;
1629	struct tcphdr *th;
1630	struct sock *sk;
1631	int ret;
1632	struct net *net = dev_net(skb->dev);
1633
1634	if (skb->pkt_type != PACKET_HOST)
1635		goto discard_it;
1636
1637	/* Count it even if it's bad */
1638	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1639
1640	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1641		goto discard_it;
1642
1643	th = tcp_hdr(skb);
1644
1645	if (th->doff < sizeof(struct tcphdr) / 4)
1646		goto bad_packet;
1647	if (!pskb_may_pull(skb, th->doff * 4))
1648		goto discard_it;
1649
1650	/* An explanation is required here, I think.
1651	 * Packet length and doff are validated by header prediction,
1652	 * provided the case of th->doff==0 is eliminated.
1653	 * So, we defer the checks. */
1654	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1655		goto bad_packet;
1656
1657	th = tcp_hdr(skb);
1658	iph = ip_hdr(skb);
1659	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
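	/* end_seq counts the payload plus one sequence number each for
	 * SYN and FIN.
	 */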
1660	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1661				    skb->len - th->doff * 4);
1662	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1663	TCP_SKB_CB(skb)->when	 = 0;
1664	TCP_SKB_CB(skb)->flags	 = iph->tos;
1665	TCP_SKB_CB(skb)->sacked	 = 0;
1666
1667	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1668	if (!sk)
1669		goto no_tcp_socket;
1670
1671process:
1672	if (sk->sk_state == TCP_TIME_WAIT)
1673		goto do_time_wait;
1674
1675	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1676		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1677		goto discard_and_relse;
1678	}
1679
1680	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1681		goto discard_and_relse;
1682	nf_reset(skb);
1683
1684	if (sk_filter(sk, skb))
1685		goto discard_and_relse;
1686
1687	skb->dev = NULL;
1688
1689	bh_lock_sock_nested(sk);
1690	ret = 0;
1691	if (!sock_owned_by_user(sk)) {
1692#ifdef CONFIG_NET_DMA
1693		struct tcp_sock *tp = tcp_sk(sk);
1694		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1695			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1696		if (tp->ucopy.dma_chan)
1697			ret = tcp_v4_do_rcv(sk, skb);
1698		else
1699#endif
1700		{
1701			if (!tcp_prequeue(sk, skb))
1702				ret = tcp_v4_do_rcv(sk, skb);
1703		}
1704	} else if (unlikely(sk_add_backlog(sk, skb))) {
1705		bh_unlock_sock(sk);
1706		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1707		goto discard_and_relse;
1708	}
1709	bh_unlock_sock(sk);
1710
1711	sock_put(sk);
1712
1713	return ret;
1714
1715no_tcp_socket:
1716	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1717		goto discard_it;
1718
1719	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1720bad_packet:
1721		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1722	} else {
1723		tcp_v4_send_reset(NULL, skb);
1724	}
1725
1726discard_it:
1727	/* Discard frame. */
1728	kfree_skb(skb);
1729	return 0;
1730
1731discard_and_relse:
1732	sock_put(sk);
1733	goto discard_it;
1734
1735do_time_wait:
1736	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1737		inet_twsk_put(inet_twsk(sk));
1738		goto discard_it;
1739	}
1740
1741	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1742		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1743		inet_twsk_put(inet_twsk(sk));
1744		goto discard_it;
1745	}
1746	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1747	case TCP_TW_SYN: {
1748		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1749							&tcp_hashinfo,
1750							iph->daddr, th->dest,
1751							inet_iif(skb));
1752		if (sk2) {
1753			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1754			inet_twsk_put(inet_twsk(sk));
1755			sk = sk2;
1756			goto process;
1757		}
1758		/* Fall through to ACK */
1759	}
1760	case TCP_TW_ACK:
1761		tcp_v4_timewait_ack(sk, skb);
1762		break;
1763	case TCP_TW_RST:
1764		goto no_tcp_socket;
1765	case TCP_TW_SUCCESS:;
1766	}
1767	goto discard_it;
1768}
1769
1770struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1771{
1772	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1773	struct inet_sock *inet = inet_sk(sk);
1774	struct inet_peer *peer;
1775
1776	if (!rt ||
1777	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1778		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1779		*release_it = true;
1780	} else {
1781		if (!rt->peer)
1782			rt_bind_peer(rt, 1);
1783		peer = rt->peer;
1784		*release_it = false;
1785	}
1786
1787	return peer;
1788}
1789EXPORT_SYMBOL(tcp_v4_get_peer);
1790
1791void *tcp_v4_tw_get_peer(struct sock *sk)
1792{
1793	struct inet_timewait_sock *tw = inet_twsk(sk);
1794
1795	return inet_getpeer_v4(tw->tw_daddr, 1);
1796}
1797EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1798
1799static struct timewait_sock_ops tcp_timewait_sock_ops = {
1800	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1801	.twsk_unique	= tcp_twsk_unique,
1802	.twsk_destructor= tcp_twsk_destructor,
1803	.twsk_getpeer	= tcp_v4_tw_get_peer,
1804};
1805
1806const struct inet_connection_sock_af_ops ipv4_specific = {
1807	.queue_xmit	   = ip_queue_xmit,
1808	.send_check	   = tcp_v4_send_check,
1809	.rebuild_header	   = inet_sk_rebuild_header,
1810	.conn_request	   = tcp_v4_conn_request,
1811	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1812	.get_peer	   = tcp_v4_get_peer,
1813	.net_header_len	   = sizeof(struct iphdr),
1814	.setsockopt	   = ip_setsockopt,
1815	.getsockopt	   = ip_getsockopt,
1816	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1817	.sockaddr_len	   = sizeof(struct sockaddr_in),
1818	.bind_conflict	   = inet_csk_bind_conflict,
1819#ifdef CONFIG_COMPAT
1820	.compat_setsockopt = compat_ip_setsockopt,
1821	.compat_getsockopt = compat_ip_getsockopt,
1822#endif
1823};
1824EXPORT_SYMBOL(ipv4_specific);
1825
1826#ifdef CONFIG_TCP_MD5SIG
1827static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1828	.md5_lookup		= tcp_v4_md5_lookup,
1829	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1830	.md5_add		= tcp_v4_md5_add_func,
1831	.md5_parse		= tcp_v4_parse_md5_keys,
1832};
1833#endif
1834
1835/* NOTE: A lot of things are set to zero explicitly by the call to
1836 *       sk_alloc() and so need not be done here.
1837 */
1838static int tcp_v4_init_sock(struct sock *sk)
1839{
1840	struct inet_connection_sock *icsk = inet_csk(sk);
1841	struct tcp_sock *tp = tcp_sk(sk);
1842
1843	skb_queue_head_init(&tp->out_of_order_queue);
1844	tcp_init_xmit_timers(sk);
1845	tcp_prequeue_init(tp);
1846
1847	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1848	tp->mdev = TCP_TIMEOUT_INIT;
1849
1850	/* So many TCP implementations out there (incorrectly) count the
1851	 * initial SYN frame in their delayed-ACK and congestion control
1852	 * algorithms that we must have the following bandaid to talk
1853	 * efficiently to them.  -DaveM
1854	 */
1855	tp->snd_cwnd = 2;
1856
1857	/* See draft-stevens-tcpca-spec-01 for discussion of the
1858	 * initialization of these values.
1859	 */
1860	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1861	tp->snd_cwnd_clamp = ~0;
1862	tp->mss_cache = TCP_MSS_DEFAULT;
1863
1864	tp->reordering = sysctl_tcp_reordering;
1865	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1866
1867	sk->sk_state = TCP_CLOSE;
1868
1869	sk->sk_write_space = sk_stream_write_space;
1870	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1871
1872	icsk->icsk_af_ops = &ipv4_specific;
1873	icsk->icsk_sync_mss = tcp_sync_mss;
1874#ifdef CONFIG_TCP_MD5SIG
1875	tp->af_specific = &tcp_sock_ipv4_specific;
1876#endif
1877
1878	/* TCP Cookie Transactions */
1879	if (sysctl_tcp_cookie_size > 0) {
1880		/* Default, cookies without s_data_payload. */
1881		tp->cookie_values =
1882			kzalloc(sizeof(*tp->cookie_values),
1883				sk->sk_allocation);
1884		if (tp->cookie_values != NULL)
1885			kref_init(&tp->cookie_values->kref);
1886	}
1887	/* Presumed zeroed, in order of appearance:
1888	 *	cookie_in_always, cookie_out_never,
1889	 *	s_data_constant, s_data_in, s_data_out
1890	 */
1891	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1892	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1893
1894	local_bh_disable();
1895	percpu_counter_inc(&tcp_sockets_allocated);
1896	local_bh_enable();
1897
1898	return 0;
1899}
1900
1901void tcp_v4_destroy_sock(struct sock *sk)
1902{
1903	struct tcp_sock *tp = tcp_sk(sk);
1904
1905	tcp_clear_xmit_timers(sk);
1906
1907	tcp_cleanup_congestion_control(sk);
1908
1909	/* Clean up the write buffer. */
1910	tcp_write_queue_purge(sk);
1911
1912	/* Cleans up our, hopefully empty, out_of_order_queue. */
1913	__skb_queue_purge(&tp->out_of_order_queue);
1914
1915#ifdef CONFIG_TCP_MD5SIG
1916	/* Clean up the MD5 key list, if any */
1917	if (tp->md5sig_info) {
1918		tcp_v4_clear_md5_list(sk);
1919		kfree(tp->md5sig_info);
1920		tp->md5sig_info = NULL;
1921	}
1922#endif
1923
1924#ifdef CONFIG_NET_DMA
1925	/* Cleans up our sk_async_wait_queue */
1926	__skb_queue_purge(&sk->sk_async_wait_queue);
1927#endif
1928
1929	/* Clean up the prequeue; it really should be empty by now. */
1930	__skb_queue_purge(&tp->ucopy.prequeue);
1931
1932	/* Clean up a referenced TCP bind bucket. */
1933	if (inet_csk(sk)->icsk_bind_hash)
1934		inet_put_port(sk);
1935
1936	/*
1937	 * If a page is cached for sendmsg, free it.
1938	 */
1939	if (sk->sk_sndmsg_page) {
1940		__free_page(sk->sk_sndmsg_page);
1941		sk->sk_sndmsg_page = NULL;
1942	}
1943
1944	/* TCP Cookie Transactions */
1945	if (tp->cookie_values != NULL) {
1946		kref_put(&tp->cookie_values->kref,
1947			 tcp_cookie_values_release);
1948		tp->cookie_values = NULL;
1949	}
1950
1951	percpu_counter_dec(&tcp_sockets_allocated);
1952}
1953EXPORT_SYMBOL(tcp_v4_destroy_sock);
1954
1955#ifdef CONFIG_PROC_FS
1956/* Proc filesystem TCP sock list dumping. */
1957
1958static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1959{
1960	return hlist_nulls_empty(head) ? NULL :
1961		list_entry(head->first, struct inet_timewait_sock, tw_node);
1962}
1963
1964static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1965{
1966	return !is_a_nulls(tw->tw_node.next) ?
1967		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1968}
1969
1970/*
1971 * Get the next listener socket after cur.  If cur is NULL, get the first
1972 * socket, starting from the bucket given in st->bucket; when st->bucket is
1973 * zero, the very first socket in the hash table is returned.
1974 */
1975static void *listening_get_next(struct seq_file *seq, void *cur)
1976{
1977	struct inet_connection_sock *icsk;
1978	struct hlist_nulls_node *node;
1979	struct sock *sk = cur;
1980	struct inet_listen_hashbucket *ilb;
1981	struct tcp_iter_state *st = seq->private;
1982	struct net *net = seq_file_net(seq);
1983
1984	if (!sk) {
1985		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1986		spin_lock_bh(&ilb->lock);
1987		sk = sk_nulls_head(&ilb->head);
1988		st->offset = 0;
1989		goto get_sk;
1990	}
1991	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1992	++st->num;
1993	++st->offset;
1994
1995	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1996		struct request_sock *req = cur;
1997
1998		icsk = inet_csk(st->syn_wait_sk);
1999		req = req->dl_next;
2000		while (1) {
2001			while (req) {
2002				if (req->rsk_ops->family == st->family) {
2003					cur = req;
2004					goto out;
2005				}
2006				req = req->dl_next;
2007			}
2008			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2009				break;
2010get_req:
2011			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2012		}
2013		sk	  = sk_nulls_next(st->syn_wait_sk);
2014		st->state = TCP_SEQ_STATE_LISTENING;
2015		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2016	} else {
2017		icsk = inet_csk(sk);
2018		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2019		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2020			goto start_req;
2021		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022		sk = sk_nulls_next(sk);
2023	}
2024get_sk:
2025	sk_nulls_for_each_from(sk, node) {
2026		if (!net_eq(sock_net(sk), net))
2027			continue;
2028		if (sk->sk_family == st->family) {
2029			cur = sk;
2030			goto out;
2031		}
2032		icsk = inet_csk(sk);
2033		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2035start_req:
2036			st->uid		= sock_i_uid(sk);
2037			st->syn_wait_sk = sk;
2038			st->state	= TCP_SEQ_STATE_OPENREQ;
2039			st->sbucket	= 0;
2040			goto get_req;
2041		}
2042		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2043	}
2044	spin_unlock_bh(&ilb->lock);
2045	st->offset = 0;
2046	if (++st->bucket < INET_LHTABLE_SIZE) {
2047		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2048		spin_lock_bh(&ilb->lock);
2049		sk = sk_nulls_head(&ilb->head);
2050		goto get_sk;
2051	}
2052	cur = NULL;
2053out:
2054	return cur;
2055}
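
/* Locking contract, noted here for clarity: when listening_get_next()
 * returns a listening socket, the current bucket's ilb->lock is held; when
 * it returns a request_sock (st->state == TCP_SEQ_STATE_OPENREQ), the
 * parent listener's syn_wait_lock is held as well.  A NULL return means
 * every lock has been dropped; tcp_seq_stop() relies on this when a dump
 * stops early.
 */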
2056
2057static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2058{
2059	struct tcp_iter_state *st = seq->private;
2060	void *rc;
2061
2062	st->bucket = 0;
2063	st->offset = 0;
2064	rc = listening_get_next(seq, NULL);
2065
2066	while (rc && *pos) {
2067		rc = listening_get_next(seq, rc);
2068		--*pos;
2069	}
2070	return rc;
2071}
2072
2073static inline int empty_bucket(struct tcp_iter_state *st)
2074{
2075	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2076		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2077}
2078
2079/*
2080 * Get the first established socket, starting from the bucket given in st->bucket.
2081 * If st->bucket is zero, the very first socket in the hash table is returned.
2082 */
2083static void *established_get_first(struct seq_file *seq)
2084{
2085	struct tcp_iter_state *st = seq->private;
2086	struct net *net = seq_file_net(seq);
2087	void *rc = NULL;
2088
2089	st->offset = 0;
2090	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2091		struct sock *sk;
2092		struct hlist_nulls_node *node;
2093		struct inet_timewait_sock *tw;
2094		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2095
2096		/* Lockless fast path for the common case of empty buckets */
2097		if (empty_bucket(st))
2098			continue;
2099
2100		spin_lock_bh(lock);
2101		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2102			if (sk->sk_family != st->family ||
2103			    !net_eq(sock_net(sk), net)) {
2104				continue;
2105			}
2106			rc = sk;
2107			goto out;
2108		}
2109		st->state = TCP_SEQ_STATE_TIME_WAIT;
2110		inet_twsk_for_each(tw, node,
2111				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2112			if (tw->tw_family != st->family ||
2113			    !net_eq(twsk_net(tw), net)) {
2114				continue;
2115			}
2116			rc = tw;
2117			goto out;
2118		}
2119		spin_unlock_bh(lock);
2120		st->state = TCP_SEQ_STATE_ESTABLISHED;
2121	}
2122out:
2123	return rc;
2124}
2125
2126static void *established_get_next(struct seq_file *seq, void *cur)
2127{
2128	struct sock *sk = cur;
2129	struct inet_timewait_sock *tw;
2130	struct hlist_nulls_node *node;
2131	struct tcp_iter_state *st = seq->private;
2132	struct net *net = seq_file_net(seq);
2133
2134	++st->num;
2135	++st->offset;
2136
2137	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2138		tw = cur;
2139		tw = tw_next(tw);
2140get_tw:
2141		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2142			tw = tw_next(tw);
2143		}
2144		if (tw) {
2145			cur = tw;
2146			goto out;
2147		}
2148		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2149		st->state = TCP_SEQ_STATE_ESTABLISHED;
2150
2151		/* Look for the next non-empty bucket */
2152		st->offset = 0;
2153		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2154				empty_bucket(st))
2155			;
2156		if (st->bucket > tcp_hashinfo.ehash_mask)
2157			return NULL;
2158
2159		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2160		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2161	} else
2162		sk = sk_nulls_next(sk);
2163
2164	sk_nulls_for_each_from(sk, node) {
2165		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2166			goto found;
2167	}
2168
2169	st->state = TCP_SEQ_STATE_TIME_WAIT;
2170	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2171	goto get_tw;
2172found:
2173	cur = sk;
2174out:
2175	return cur;
2176}
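
/* Note: established_get_first()/established_get_next() walk each ehash
 * bucket's chain of established sockets first and then its twchain of
 * TIME_WAIT sockets, keeping that bucket's inet_ehash_lockp() spinlock
 * held while an entry is being returned; tcp_seq_stop() drops it.
 */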
2177
2178static void *established_get_idx(struct seq_file *seq, loff_t pos)
2179{
2180	struct tcp_iter_state *st = seq->private;
2181	void *rc;
2182
2183	st->bucket = 0;
2184	rc = established_get_first(seq);
2185
2186	while (rc && pos) {
2187		rc = established_get_next(seq, rc);
2188		--pos;
2189	}
2190	return rc;
2191}
2192
2193static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2194{
2195	void *rc;
2196	struct tcp_iter_state *st = seq->private;
2197
2198	st->state = TCP_SEQ_STATE_LISTENING;
2199	rc	  = listening_get_idx(seq, &pos);
2200
2201	if (!rc) {
2202		st->state = TCP_SEQ_STATE_ESTABLISHED;
2203		rc	  = established_get_idx(seq, pos);
2204	}
2205
2206	return rc;
2207}
2208
2209static void *tcp_seek_last_pos(struct seq_file *seq)
2210{
2211	struct tcp_iter_state *st = seq->private;
2212	int offset = st->offset;
2213	int orig_num = st->num;
2214	void *rc = NULL;
2215
2216	switch (st->state) {
2217	case TCP_SEQ_STATE_OPENREQ:
2218	case TCP_SEQ_STATE_LISTENING:
2219		if (st->bucket >= INET_LHTABLE_SIZE)
2220			break;
2221		st->state = TCP_SEQ_STATE_LISTENING;
2222		rc = listening_get_next(seq, NULL);
2223		while (offset-- && rc)
2224			rc = listening_get_next(seq, rc);
2225		if (rc)
2226			break;
2227		st->bucket = 0;
2228		/* Fallthrough */
2229	case TCP_SEQ_STATE_ESTABLISHED:
2230	case TCP_SEQ_STATE_TIME_WAIT:
2231		st->state = TCP_SEQ_STATE_ESTABLISHED;
2232		if (st->bucket > tcp_hashinfo.ehash_mask)
2233			break;
2234		rc = established_get_first(seq);
2235		while (offset-- && rc)
2236			rc = established_get_next(seq, rc);
2237	}
2238
2239	st->num = orig_num;
2240
2241	return rc;
2242}
2243
2244static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2245{
2246	struct tcp_iter_state *st = seq->private;
2247	void *rc;
2248
2249	if (*pos && *pos == st->last_pos) {
2250		rc = tcp_seek_last_pos(seq);
2251		if (rc)
2252			goto out;
2253	}
2254
2255	st->state = TCP_SEQ_STATE_LISTENING;
2256	st->num = 0;
2257	st->bucket = 0;
2258	st->offset = 0;
2259	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2260
2261out:
2262	st->last_pos = *pos;
2263	return rc;
2264}
2265
2266static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2267{
2268	struct tcp_iter_state *st = seq->private;
2269	void *rc = NULL;
2270
2271	if (v == SEQ_START_TOKEN) {
2272		rc = tcp_get_idx(seq, 0);
2273		goto out;
2274	}
2275
2276	switch (st->state) {
2277	case TCP_SEQ_STATE_OPENREQ:
2278	case TCP_SEQ_STATE_LISTENING:
2279		rc = listening_get_next(seq, v);
2280		if (!rc) {
2281			st->state = TCP_SEQ_STATE_ESTABLISHED;
2282			st->bucket = 0;
2283			st->offset = 0;
2284			rc	  = established_get_first(seq);
2285		}
2286		break;
2287	case TCP_SEQ_STATE_ESTABLISHED:
2288	case TCP_SEQ_STATE_TIME_WAIT:
2289		rc = established_get_next(seq, v);
2290		break;
2291	}
2292out:
2293	++*pos;
2294	st->last_pos = *pos;
2295	return rc;
2296}
2297
2298static void tcp_seq_stop(struct seq_file *seq, void *v)
2299{
2300	struct tcp_iter_state *st = seq->private;
2301
2302	switch (st->state) {
2303	case TCP_SEQ_STATE_OPENREQ:
2304		if (v) {
2305			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2306			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2307		}
2308	case TCP_SEQ_STATE_LISTENING:
2309		if (v != SEQ_START_TOKEN)
2310			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2311		break;
2312	case TCP_SEQ_STATE_TIME_WAIT:
2313	case TCP_SEQ_STATE_ESTABLISHED:
2314		if (v)
2315			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2316		break;
2317	}
2318}
2319
2320static int tcp_seq_open(struct inode *inode, struct file *file)
2321{
2322	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2323	struct tcp_iter_state *s;
2324	int err;
2325
2326	err = seq_open_net(inode, file, &afinfo->seq_ops,
2327			  sizeof(struct tcp_iter_state));
2328	if (err < 0)
2329		return err;
2330
2331	s = ((struct seq_file *)file->private_data)->private;
2332	s->family		= afinfo->family;
2333	s->last_pos 		= 0;
2334	return 0;
2335}
2336
2337int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2338{
2339	int rc = 0;
2340	struct proc_dir_entry *p;
2341
2342	afinfo->seq_fops.open		= tcp_seq_open;
2343	afinfo->seq_fops.read		= seq_read;
2344	afinfo->seq_fops.llseek		= seq_lseek;
2345	afinfo->seq_fops.release	= seq_release_net;
2346
2347	afinfo->seq_ops.start		= tcp_seq_start;
2348	afinfo->seq_ops.next		= tcp_seq_next;
2349	afinfo->seq_ops.stop		= tcp_seq_stop;
2350
2351	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2352			     &afinfo->seq_fops, afinfo);
2353	if (!p)
2354		rc = -ENOMEM;
2355	return rc;
2356}
2357EXPORT_SYMBOL(tcp_proc_register);
2358
2359void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2360{
2361	proc_net_remove(net, afinfo->name);
2362}
2363EXPORT_SYMBOL(tcp_proc_unregister);
2364
2365static void get_openreq4(struct sock *sk, struct request_sock *req,
2366			 struct seq_file *f, int i, int uid, int *len)
2367{
2368	const struct inet_request_sock *ireq = inet_rsk(req);
2369	int ttd = req->expires - jiffies;
2370
2371	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2372		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2373		i,
2374		ireq->loc_addr,
2375		ntohs(inet_sk(sk)->inet_sport),
2376		ireq->rmt_addr,
2377		ntohs(ireq->rmt_port),
2378		TCP_SYN_RECV,
2379		0, 0, /* could print option size, but that is af dependent. */
2380		1,    /* timers active (only the expire timer) */
2381		jiffies_to_clock_t(ttd),
2382		req->retrans,
2383		uid,
2384		0,  /* non-standard timer */
2385		0, /* open_requests have no inode */
2386		atomic_read(&sk->sk_refcnt),
2387		req,
2388		len);
2389}
2390
2391static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2392{
2393	int timer_active;
2394	unsigned long timer_expires;
2395	struct tcp_sock *tp = tcp_sk(sk);
2396	const struct inet_connection_sock *icsk = inet_csk(sk);
2397	struct inet_sock *inet = inet_sk(sk);
2398	__be32 dest = inet->inet_daddr;
2399	__be32 src = inet->inet_rcv_saddr;
2400	__u16 destp = ntohs(inet->inet_dport);
2401	__u16 srcp = ntohs(inet->inet_sport);
2402	int rx_queue;
2403
2404	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2405		timer_active	= 1;
2406		timer_expires	= icsk->icsk_timeout;
2407	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2408		timer_active	= 4;
2409		timer_expires	= icsk->icsk_timeout;
2410	} else if (timer_pending(&sk->sk_timer)) {
2411		timer_active	= 2;
2412		timer_expires	= sk->sk_timer.expires;
2413	} else {
2414		timer_active	= 0;
2415		timer_expires = jiffies;
2416	}
2417
2418	if (sk->sk_state == TCP_LISTEN)
2419		rx_queue = sk->sk_ack_backlog;
2420	else
2421		/*
2422		 * Because we don't lock the socket, we might find a transient negative value.
2423		 */
2424		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2425
2426	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2427			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2428		i, src, srcp, dest, destp, sk->sk_state,
2429		tp->write_seq - tp->snd_una,
2430		rx_queue,
2431		timer_active,
2432		jiffies_to_clock_t(timer_expires - jiffies),
2433		icsk->icsk_retransmits,
2434		sock_i_uid(sk),
2435		icsk->icsk_probes_out,
2436		sock_i_ino(sk),
2437		atomic_read(&sk->sk_refcnt), sk,
2438		jiffies_to_clock_t(icsk->icsk_rto),
2439		jiffies_to_clock_t(icsk->icsk_ack.ato),
2440		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2441		tp->snd_cwnd,
2442		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2443		len);
2444}
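
/* Decoding a line produced above (illustrative, not a literal capture):
 * the addresses are __be32 values printed with %08X, so on a little-endian
 * host 127.0.0.1 appears as 0100007F, while ports go through ntohs() and
 * print as the real port in hex (:0016 is port 22).  After the state come
 * tx_queue:rx_queue, timer-type:expiry, retransmits, uid, icsk_probes_out
 * (the header's "timeout" column), inode, and then refcount, socket
 * pointer, rto, ato, (quick << 1) | pingpong, snd_cwnd and ssthresh
 * (-1 while still in the initial slow start).
 */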
2445
2446static void get_timewait4_sock(struct inet_timewait_sock *tw,
2447			       struct seq_file *f, int i, int *len)
2448{
2449	__be32 dest, src;
2450	__u16 destp, srcp;
2451	int ttd = tw->tw_ttd - jiffies;
2452
2453	if (ttd < 0)
2454		ttd = 0;
2455
2456	dest  = tw->tw_daddr;
2457	src   = tw->tw_rcv_saddr;
2458	destp = ntohs(tw->tw_dport);
2459	srcp  = ntohs(tw->tw_sport);
2460
2461	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2462		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2463		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2464		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2465		atomic_read(&tw->tw_refcnt), tw, len);
2466}
2467
2468#define TMPSZ 150
2469
2470static int tcp4_seq_show(struct seq_file *seq, void *v)
2471{
2472	struct tcp_iter_state *st;
2473	int len;
2474
2475	if (v == SEQ_START_TOKEN) {
2476		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2477			   "  sl  local_address rem_address   st tx_queue "
2478			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2479			   "inode");
2480		goto out;
2481	}
2482	st = seq->private;
2483
2484	switch (st->state) {
2485	case TCP_SEQ_STATE_LISTENING:
2486	case TCP_SEQ_STATE_ESTABLISHED:
2487		get_tcp4_sock(v, seq, st->num, &len);
2488		break;
2489	case TCP_SEQ_STATE_OPENREQ:
2490		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2491		break;
2492	case TCP_SEQ_STATE_TIME_WAIT:
2493		get_timewait4_sock(v, seq, st->num, &len);
2494		break;
2495	}
2496	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2497out:
2498	return 0;
2499}
2500
2501static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2502	.name		= "tcp",
2503	.family		= AF_INET,
2504	.seq_fops	= {
2505		.owner		= THIS_MODULE,
2506	},
2507	.seq_ops	= {
2508		.show		= tcp4_seq_show,
2509	},
2510};
2511
2512static int __net_init tcp4_proc_init_net(struct net *net)
2513{
2514	return tcp_proc_register(net, &tcp4_seq_afinfo);
2515}
2516
2517static void __net_exit tcp4_proc_exit_net(struct net *net)
2518{
2519	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2520}
2521
2522static struct pernet_operations tcp4_net_ops = {
2523	.init = tcp4_proc_init_net,
2524	.exit = tcp4_proc_exit_net,
2525};
2526
2527int __init tcp4_proc_init(void)
2528{
2529	return register_pernet_subsys(&tcp4_net_ops);
2530}
2531
2532void tcp4_proc_exit(void)
2533{
2534	unregister_pernet_subsys(&tcp4_net_ops);
2535}
2536#endif /* CONFIG_PROC_FS */
2537
2538struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2539{
2540	const struct iphdr *iph = skb_gro_network_header(skb);
2541
2542	switch (skb->ip_summed) {
2543	case CHECKSUM_COMPLETE:
2544		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2545				  skb->csum)) {
2546			skb->ip_summed = CHECKSUM_UNNECESSARY;
2547			break;
2548		}
2549
2550		/* fall through */
2551	case CHECKSUM_NONE:
2552		NAPI_GRO_CB(skb)->flush = 1;
2553		return NULL;
2554	}
2555
2556	return tcp_gro_receive(head, skb);
2557}
2558
2559int tcp4_gro_complete(struct sk_buff *skb)
2560{
2561	const struct iphdr *iph = ip_hdr(skb);
2562	struct tcphdr *th = tcp_hdr(skb);
2563
2564	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
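	/* Seed th->check with the pseudo-header checksum (the usual
	 * CHECKSUM_PARTIAL convention) so that later resegmentation or
	 * checksum offload only has to sum the TCP header and payload.
	 */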
2565				  iph->saddr, iph->daddr, 0);
2566	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2567
2568	return tcp_gro_complete(skb);
2569}
2570
2571struct proto tcp_prot = {
2572	.name			= "TCP",
2573	.owner			= THIS_MODULE,
2574	.close			= tcp_close,
2575	.connect		= tcp_v4_connect,
2576	.disconnect		= tcp_disconnect,
2577	.accept			= inet_csk_accept,
2578	.ioctl			= tcp_ioctl,
2579	.init			= tcp_v4_init_sock,
2580	.destroy		= tcp_v4_destroy_sock,
2581	.shutdown		= tcp_shutdown,
2582	.setsockopt		= tcp_setsockopt,
2583	.getsockopt		= tcp_getsockopt,
2584	.recvmsg		= tcp_recvmsg,
2585	.sendmsg		= tcp_sendmsg,
2586	.sendpage		= tcp_sendpage,
2587	.backlog_rcv		= tcp_v4_do_rcv,
2588	.hash			= inet_hash,
2589	.unhash			= inet_unhash,
2590	.get_port		= inet_csk_get_port,
2591	.enter_memory_pressure	= tcp_enter_memory_pressure,
2592	.sockets_allocated	= &tcp_sockets_allocated,
2593	.orphan_count		= &tcp_orphan_count,
2594	.memory_allocated	= &tcp_memory_allocated,
2595	.memory_pressure	= &tcp_memory_pressure,
2596	.sysctl_mem		= sysctl_tcp_mem,
2597	.sysctl_wmem		= sysctl_tcp_wmem,
2598	.sysctl_rmem		= sysctl_tcp_rmem,
2599	.max_header		= MAX_TCP_HEADER,
2600	.obj_size		= sizeof(struct tcp_sock),
2601	.slab_flags		= SLAB_DESTROY_BY_RCU,
2602	.twsk_prot		= &tcp_timewait_sock_ops,
2603	.rsk_prot		= &tcp_request_sock_ops,
2604	.h.hashinfo		= &tcp_hashinfo,
2605	.no_autobind		= true,
2606#ifdef CONFIG_COMPAT
2607	.compat_setsockopt	= compat_tcp_setsockopt,
2608	.compat_getsockopt	= compat_tcp_getsockopt,
2609#endif
2610};
2611EXPORT_SYMBOL(tcp_prot);
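
/* For orientation only, a hedged user-space sketch (not part of this file)
 * of how the ops above are reached through the ordinary socket API, with
 * "sin" standing for a caller-filled struct sockaddr_in:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);           // .init
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));  // .connect
 *	send(fd, buf, len, 0);                              // .sendmsg
 *	recv(fd, buf, len, 0);                              // .recvmsg
 *	close(fd);                                          // .close, later .destroy
 */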
2612
2613
2614static int __net_init tcp_sk_init(struct net *net)
2615{
2616	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2617				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2618}
2619
2620static void __net_exit tcp_sk_exit(struct net *net)
2621{
2622	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2623}
2624
2625static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2626{
2627	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2628}
2629
2630static struct pernet_operations __net_initdata tcp_sk_ops = {
2631       .init	   = tcp_sk_init,
2632       .exit	   = tcp_sk_exit,
2633       .exit_batch = tcp_sk_exit_batch,
2634};
2635
2636void __init tcp_v4_init(void)
2637{
2638	inet_hashinfo_init(&tcp_hashinfo);
2639	if (register_pernet_subsys(&tcp_sk_ops))
2640		panic("Failed to create the TCP control socket.\n");
2641}
2642