tcp_ipv4.c revision fd0273c5033630b8673554cd39660435d1ab2ac4
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90#ifdef CONFIG_TCP_MD5SIG
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92						   __be32 addr);
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99	return NULL;
100}
101#endif
102
103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
105
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, it is safe from the viewpoint
120	   of data integrity. Even without PAWS it is safe provided the
121	   sequence spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
122
123	   Actually, the idea is close to VJ's, except that the timestamp
124	   cache is held not per host but per port pair, and the TW bucket
125	   is used as the state holder.
126
127	   If the TW bucket has already been destroyed we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
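
The reuse decision above is gated by sysctl_tcp_tw_reuse, which is exposed to administrators as net.ipv4.tcp_tw_reuse. As a hedged illustration only (userspace side, not part of this kernel file), the knob can be flipped through procfs; the snippet below is a minimal sketch with simplified error handling, and the helper name is ours.

#include <stdio.h>

/* Illustrative userspace helper: enable reuse of TIME-WAIT sockets for new
 * outgoing connections, equivalent to `sysctl -w net.ipv4.tcp_tw_reuse=1`.
 * Returns 0 on success, -1 on failure.
 */
static int enable_tcp_tw_reuse(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;
	if (fputs("1\n", f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f) == 0 ? 0 : -1;
}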
145
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149	struct inet_sock *inet = inet_sk(sk);
150	struct tcp_sock *tp = tcp_sk(sk);
151	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152	struct rtable *rt;
153	__be32 daddr, nexthop;
154	int tmp;
155	int err;
156
157	if (addr_len < sizeof(struct sockaddr_in))
158		return -EINVAL;
159
160	if (usin->sin_family != AF_INET)
161		return -EAFNOSUPPORT;
162
163	nexthop = daddr = usin->sin_addr.s_addr;
164	if (inet->opt && inet->opt->srr) {
165		if (!daddr)
166			return -EINVAL;
167		nexthop = inet->opt->faddr;
168	}
169
170	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
171			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172			       IPPROTO_TCP,
173			       inet->inet_sport, usin->sin_port, sk, 1);
174	if (tmp < 0) {
175		if (tmp == -ENETUNREACH)
176			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177		return tmp;
178	}
179
180	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181		ip_rt_put(rt);
182		return -ENETUNREACH;
183	}
184
185	if (!inet->opt || !inet->opt->srr)
186		daddr = rt->rt_dst;
187
188	if (!inet->inet_saddr)
189		inet->inet_saddr = rt->rt_src;
190	inet->inet_rcv_saddr = inet->inet_saddr;
191
192	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193		/* Reset inherited state */
194		tp->rx_opt.ts_recent	   = 0;
195		tp->rx_opt.ts_recent_stamp = 0;
196		tp->write_seq		   = 0;
197	}
198
199	if (tcp_death_row.sysctl_tw_recycle &&
200	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
201		struct inet_peer *peer = rt_get_peer(rt);
202		/*
203		 * VJ's idea. We save the last timestamp seen from
204		 * the destination in the peer table when entering state
205		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
206		 * when trying a new connection.
207		 */
208		if (peer) {
209			inet_peer_refcheck(peer);
210			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
211				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212				tp->rx_opt.ts_recent = peer->tcp_ts;
213			}
214		}
215	}
216
217	inet->inet_dport = usin->sin_port;
218	inet->inet_daddr = daddr;
219
220	inet_csk(sk)->icsk_ext_hdr_len = 0;
221	if (inet->opt)
222		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
223
224	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
225
226	/* Socket identity is still unknown (sport may be zero).
227	 * However, we set the state to SYN-SENT and, without releasing the
228	 * socket lock, select a source port, enter ourselves into the hash
229	 * tables and complete initialization after this.
230	 */
231	tcp_set_state(sk, TCP_SYN_SENT);
232	err = inet_hash_connect(&tcp_death_row, sk);
233	if (err)
234		goto failure;
235
236	err = ip_route_newports(&rt, IPPROTO_TCP,
237				inet->inet_sport, inet->inet_dport, sk);
238	if (err)
239		goto failure;
240
241	/* OK, now commit destination to socket.  */
242	sk->sk_gso_type = SKB_GSO_TCPV4;
243	sk_setup_caps(sk, &rt->dst);
244
245	if (!tp->write_seq)
246		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247							   inet->inet_daddr,
248							   inet->inet_sport,
249							   usin->sin_port);
250
251	inet->inet_id = tp->write_seq ^ jiffies;
252
253	err = tcp_connect(sk);
254	rt = NULL;
255	if (err)
256		goto failure;
257
258	return 0;
259
260failure:
261	/*
262	 * This unhashes the socket and releases the local port,
263	 * if necessary.
264	 */
265	tcp_set_state(sk, TCP_CLOSE);
266	ip_rt_put(rt);
267	sk->sk_route_caps = 0;
268	inet->inet_dport = 0;
269	return err;
270}
271EXPORT_SYMBOL(tcp_v4_connect);
272
273/*
274 * This routine does path mtu discovery as defined in RFC1191.
275 */
276static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
277{
278	struct dst_entry *dst;
279	struct inet_sock *inet = inet_sk(sk);
280
281	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282	 * sent out by Linux are always < 576 bytes so they should go through
283	 * unfragmented).
284	 */
285	if (sk->sk_state == TCP_LISTEN)
286		return;
287
288	/* We don't check in the dst entry if pmtu discovery is forbidden
289	 * on this route. We just assume that no packet-too-big packets
290	 * are sent back when pmtu discovery is not active.
291	 * There is a small race when the user changes this flag in the
292	 * route, but I think that's acceptable.
293	 */
294	if ((dst = __sk_dst_check(sk, 0)) == NULL)
295		return;
296
297	dst->ops->update_pmtu(dst, mtu);
298
299	/* Something is about to go wrong... Remember the soft error
300	 * in case this connection is not able to recover.
301	 */
302	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303		sk->sk_err_soft = EMSGSIZE;
304
305	mtu = dst_mtu(dst);
306
307	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309		tcp_sync_mss(sk, mtu);
310
311		/* Resend the TCP packet because it's
312		 * clear that the old packet has been
313		 * dropped. This is the new "fast" path mtu
314		 * discovery.
315		 */
316		tcp_simple_retransmit(sk);
317	} /* else let the usual retransmit timer handle it */
318}
319
320/*
321 * This routine is called by the ICMP module when it gets some
322 * sort of error condition.  If err < 0 then the socket should
323 * be closed and the error returned to the user.  If err > 0
324 * it's just the icmp type << 8 | icmp code.  After adjustment
325 * header points to the first 8 bytes of the tcp header.  We need
326 * to find the appropriate port.
327 *
328 * The locking strategy used here is very "optimistic". When
329 * someone else accesses the socket, the ICMP is just dropped,
330 * and for some paths there is no check at all.
331 * A more general error queue, queuing errors for later handling,
332 * would probably be better.
333 *
334 */
335
336void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
337{
338	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
339	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
340	struct inet_connection_sock *icsk;
341	struct tcp_sock *tp;
342	struct inet_sock *inet;
343	const int type = icmp_hdr(icmp_skb)->type;
344	const int code = icmp_hdr(icmp_skb)->code;
345	struct sock *sk;
346	struct sk_buff *skb;
347	__u32 seq;
348	__u32 remaining;
349	int err;
350	struct net *net = dev_net(icmp_skb->dev);
351
352	if (icmp_skb->len < (iph->ihl << 2) + 8) {
353		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
354		return;
355	}
356
357	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
358			iph->saddr, th->source, inet_iif(icmp_skb));
359	if (!sk) {
360		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
361		return;
362	}
363	if (sk->sk_state == TCP_TIME_WAIT) {
364		inet_twsk_put(inet_twsk(sk));
365		return;
366	}
367
368	bh_lock_sock(sk);
369	/* If too many ICMPs get dropped on busy
370	 * servers this needs to be solved differently.
371	 */
372	if (sock_owned_by_user(sk))
373		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
374
375	if (sk->sk_state == TCP_CLOSE)
376		goto out;
377
378	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
379		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
380		goto out;
381	}
382
383	icsk = inet_csk(sk);
384	tp = tcp_sk(sk);
385	seq = ntohl(th->seq);
386	if (sk->sk_state != TCP_LISTEN &&
387	    !between(seq, tp->snd_una, tp->snd_nxt)) {
388		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
389		goto out;
390	}
391
392	switch (type) {
393	case ICMP_SOURCE_QUENCH:
394		/* Just silently ignore these. */
395		goto out;
396	case ICMP_PARAMETERPROB:
397		err = EPROTO;
398		break;
399	case ICMP_DEST_UNREACH:
400		if (code > NR_ICMP_UNREACH)
401			goto out;
402
403		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
404			if (!sock_owned_by_user(sk))
405				do_pmtu_discovery(sk, iph, info);
406			goto out;
407		}
408
409		err = icmp_err_convert[code].errno;
410		/* check if icmp_skb allows revert of backoff
411		 * (see draft-zimmermann-tcp-lcd) */
412		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
413			break;
414		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
415		    !icsk->icsk_backoff)
416			break;
417
418		if (sock_owned_by_user(sk))
419			break;
420
421		icsk->icsk_backoff--;
422		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
423					 icsk->icsk_backoff;
424		tcp_bound_rto(sk);
425
426		skb = tcp_write_queue_head(sk);
427		BUG_ON(!skb);
428
429		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
430				tcp_time_stamp - TCP_SKB_CB(skb)->when);
431
432		if (remaining) {
433			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
434						  remaining, TCP_RTO_MAX);
435		} else {
436			/* RTO revert clocked out retransmission.
437			 * Will retransmit now */
438			tcp_retransmit_timer(sk);
439		}
440
441		break;
442	case ICMP_TIME_EXCEEDED:
443		err = EHOSTUNREACH;
444		break;
445	default:
446		goto out;
447	}
448
449	switch (sk->sk_state) {
450		struct request_sock *req, **prev;
451	case TCP_LISTEN:
452		if (sock_owned_by_user(sk))
453			goto out;
454
455		req = inet_csk_search_req(sk, &prev, th->dest,
456					  iph->daddr, iph->saddr);
457		if (!req)
458			goto out;
459
460		/* ICMPs are not backlogged, hence we cannot get
461		   an established socket here.
462		 */
463		WARN_ON(req->sk);
464
465		if (seq != tcp_rsk(req)->snt_isn) {
466			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
467			goto out;
468		}
469
470		/*
471		 * Still in SYN_RECV, just remove it silently.
472		 * There is no good way to pass the error to the newly
473		 * created socket, and POSIX does not want network
474		 * errors returned from accept().
475		 */
476		inet_csk_reqsk_queue_drop(sk, req, prev);
477		goto out;
478
479	case TCP_SYN_SENT:
480	case TCP_SYN_RECV:  /* Cannot happen normally.
481			       It can, e.g., if SYNs crossed.
482			     */
483		if (!sock_owned_by_user(sk)) {
484			sk->sk_err = err;
485
486			sk->sk_error_report(sk);
487
488			tcp_done(sk);
489		} else {
490			sk->sk_err_soft = err;
491		}
492		goto out;
493	}
494
495	/* If we've already connected we will keep trying
496	 * until we time out, or the user gives up.
497	 *
498	 * rfc1122 4.2.3.9 allows us to consider as hard errors
499	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
500	 * but it is obsoleted by pmtu discovery).
501	 *
502	 * Note that in the modern internet, where routing is unreliable and
503	 * broken firewalls sit in every dark corner sending random errors
504	 * at their masters' orders, even these two messages have finally
505	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
506	 *
507	 * Now we are in compliance with RFCs.
508	 *							--ANK (980905)
509	 */
510
511	inet = inet_sk(sk);
512	if (!sock_owned_by_user(sk) && inet->recverr) {
513		sk->sk_err = err;
514		sk->sk_error_report(sk);
515	} else	{ /* Only an error on timeout */
516		sk->sk_err_soft = err;
517	}
518
519out:
520	bh_unlock_sock(sk);
521	sock_put(sk);
522}
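
When tcp_v4_err() records a hard error in sk->sk_err (or a soft error in sk_err_soft), userspace typically sees it as a failure of the next socket call or by querying SO_ERROR. The following is a minimal userspace sketch of the SO_ERROR query, an editorial example rather than part of this file; it assumes an already-created TCP socket descriptor fd.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

/* Illustrative userspace check: fetch (and thereby clear) the pending socket
 * error, e.g. ECONNREFUSED or EHOSTUNREACH derived from an ICMP message.
 */
static int report_pending_socket_error(int fd)
{
	int soerr = 0;
	socklen_t len = sizeof(soerr);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &len) < 0)
		return -errno;
	if (soerr)
		fprintf(stderr, "socket error: %s\n", strerror(soerr));
	return soerr;
}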
523
524static void __tcp_v4_send_check(struct sk_buff *skb,
525				__be32 saddr, __be32 daddr)
526{
527	struct tcphdr *th = tcp_hdr(skb);
528
529	if (skb->ip_summed == CHECKSUM_PARTIAL) {
530		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
531		skb->csum_start = skb_transport_header(skb) - skb->head;
532		skb->csum_offset = offsetof(struct tcphdr, check);
533	} else {
534		th->check = tcp_v4_check(skb->len, saddr, daddr,
535					 csum_partial(th,
536						      th->doff << 2,
537						      skb->csum));
538	}
539}
540
541/* This routine computes an IPv4 TCP checksum. */
542void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
543{
544	struct inet_sock *inet = inet_sk(sk);
545
546	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
547}
548EXPORT_SYMBOL(tcp_v4_send_check);
549
550int tcp_v4_gso_send_check(struct sk_buff *skb)
551{
552	const struct iphdr *iph;
553	struct tcphdr *th;
554
555	if (!pskb_may_pull(skb, sizeof(*th)))
556		return -EINVAL;
557
558	iph = ip_hdr(skb);
559	th = tcp_hdr(skb);
560
561	th->check = 0;
562	skb->ip_summed = CHECKSUM_PARTIAL;
563	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
564	return 0;
565}
566
567/*
568 *	This routine will send an RST to the other tcp.
569 *
570 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
571 *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
572 *		      for the reset?
573 *	Answer: if a packet caused an RST, it is not for a socket
574 *		existing in our system; if it does match a socket,
575 *		it is just a duplicate segment or a bug in the other
576 *		side's TCP, so we build the reply based only on the
577 *		parameters that arrived with the segment.
578 */
579
580static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
581{
582	struct tcphdr *th = tcp_hdr(skb);
583	struct {
584		struct tcphdr th;
585#ifdef CONFIG_TCP_MD5SIG
586		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
587#endif
588	} rep;
589	struct ip_reply_arg arg;
590#ifdef CONFIG_TCP_MD5SIG
591	struct tcp_md5sig_key *key;
592#endif
593	struct net *net;
594
595	/* Never send a reset in response to a reset. */
596	if (th->rst)
597		return;
598
599	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
600		return;
601
602	/* Swap the send and the receive. */
603	memset(&rep, 0, sizeof(rep));
604	rep.th.dest   = th->source;
605	rep.th.source = th->dest;
606	rep.th.doff   = sizeof(struct tcphdr) / 4;
607	rep.th.rst    = 1;
608
609	if (th->ack) {
610		rep.th.seq = th->ack_seq;
611	} else {
612		rep.th.ack = 1;
613		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
614				       skb->len - (th->doff << 2));
615	}
616
617	memset(&arg, 0, sizeof(arg));
618	arg.iov[0].iov_base = (unsigned char *)&rep;
619	arg.iov[0].iov_len  = sizeof(rep.th);
620
621#ifdef CONFIG_TCP_MD5SIG
622	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
623	if (key) {
624		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
625				   (TCPOPT_NOP << 16) |
626				   (TCPOPT_MD5SIG << 8) |
627				   TCPOLEN_MD5SIG);
628		/* Update the real length as well as the length the header claims */
629		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
630		rep.th.doff = arg.iov[0].iov_len / 4;
631
632		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
633				     key, ip_hdr(skb)->saddr,
634				     ip_hdr(skb)->daddr, &rep.th);
635	}
636#endif
637	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
638				      ip_hdr(skb)->saddr, /* XXX */
639				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
640	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
641	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
642
643	net = dev_net(skb_dst(skb)->dev);
644	ip_send_reply(net->ipv4.tcp_sock, skb,
645		      &arg, arg.iov[0].iov_len);
646
647	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
648	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
649}
650
651/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
652   outside of socket context, is certainly ugly. What can I do?
653 */
654
655static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
656			    u32 win, u32 ts, int oif,
657			    struct tcp_md5sig_key *key,
658			    int reply_flags)
659{
660	struct tcphdr *th = tcp_hdr(skb);
661	struct {
662		struct tcphdr th;
663		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
664#ifdef CONFIG_TCP_MD5SIG
665			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
666#endif
667			];
668	} rep;
669	struct ip_reply_arg arg;
670	struct net *net = dev_net(skb_dst(skb)->dev);
671
672	memset(&rep.th, 0, sizeof(struct tcphdr));
673	memset(&arg, 0, sizeof(arg));
674
675	arg.iov[0].iov_base = (unsigned char *)&rep;
676	arg.iov[0].iov_len  = sizeof(rep.th);
677	if (ts) {
678		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
679				   (TCPOPT_TIMESTAMP << 8) |
680				   TCPOLEN_TIMESTAMP);
681		rep.opt[1] = htonl(tcp_time_stamp);
682		rep.opt[2] = htonl(ts);
683		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
684	}
685
686	/* Swap the send and the receive. */
687	rep.th.dest    = th->source;
688	rep.th.source  = th->dest;
689	rep.th.doff    = arg.iov[0].iov_len / 4;
690	rep.th.seq     = htonl(seq);
691	rep.th.ack_seq = htonl(ack);
692	rep.th.ack     = 1;
693	rep.th.window  = htons(win);
694
695#ifdef CONFIG_TCP_MD5SIG
696	if (key) {
697		int offset = (ts) ? 3 : 0;
698
699		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
700					  (TCPOPT_NOP << 16) |
701					  (TCPOPT_MD5SIG << 8) |
702					  TCPOLEN_MD5SIG);
703		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
704		rep.th.doff = arg.iov[0].iov_len/4;
705
706		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
707				    key, ip_hdr(skb)->saddr,
708				    ip_hdr(skb)->daddr, &rep.th);
709	}
710#endif
711	arg.flags = reply_flags;
712	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
713				      ip_hdr(skb)->saddr, /* XXX */
714				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
715	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
716	if (oif)
717		arg.bound_dev_if = oif;
718
719	ip_send_reply(net->ipv4.tcp_sock, skb,
720		      &arg, arg.iov[0].iov_len);
721
722	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
723}
724
725static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
726{
727	struct inet_timewait_sock *tw = inet_twsk(sk);
728	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
729
730	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
731			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
732			tcptw->tw_ts_recent,
733			tw->tw_bound_dev_if,
734			tcp_twsk_md5_key(tcptw),
735			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
736			);
737
738	inet_twsk_put(tw);
739}
740
741static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
742				  struct request_sock *req)
743{
744	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
745			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
746			req->ts_recent,
747			0,
748			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
749			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
750}
751
752/*
753 *	Send a SYN-ACK after having received a SYN.
754 *	This still operates on a request_sock only, not on a big
755 *	socket.
756 */
757static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
758			      struct request_sock *req,
759			      struct request_values *rvp)
760{
761	const struct inet_request_sock *ireq = inet_rsk(req);
762	int err = -1;
763	struct sk_buff * skb;
764
765	/* First, grab a route. */
766	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
767		return -1;
768
769	skb = tcp_make_synack(sk, dst, req, rvp);
770
771	if (skb) {
772		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
773
774		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
775					    ireq->rmt_addr,
776					    ireq->opt);
777		err = net_xmit_eval(err);
778	}
779
780	dst_release(dst);
781	return err;
782}
783
784static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
785			      struct request_values *rvp)
786{
787	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
788	return tcp_v4_send_synack(sk, NULL, req, rvp);
789}
790
791/*
792 *	IPv4 request_sock destructor.
793 */
794static void tcp_v4_reqsk_destructor(struct request_sock *req)
795{
796	kfree(inet_rsk(req)->opt);
797}
798
799static void syn_flood_warning(const struct sk_buff *skb)
800{
801	const char *msg;
802
803#ifdef CONFIG_SYN_COOKIES
804	if (sysctl_tcp_syncookies)
805		msg = "Sending cookies";
806	else
807#endif
808		msg = "Dropping request";
809
810	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
811				ntohs(tcp_hdr(skb)->dest), msg);
812}
813
814/*
815 * Save and compile IPv4 options into the request_sock if needed.
816 */
817static struct ip_options *tcp_v4_save_options(struct sock *sk,
818					      struct sk_buff *skb)
819{
820	struct ip_options *opt = &(IPCB(skb)->opt);
821	struct ip_options *dopt = NULL;
822
823	if (opt && opt->optlen) {
824		int opt_size = optlength(opt);
825		dopt = kmalloc(opt_size, GFP_ATOMIC);
826		if (dopt) {
827			if (ip_options_echo(dopt, skb)) {
828				kfree(dopt);
829				dopt = NULL;
830			}
831		}
832	}
833	return dopt;
834}
835
836#ifdef CONFIG_TCP_MD5SIG
837/*
838 * RFC2385 MD5 checksumming requires a mapping of
839 * IP address->MD5 Key.
840 * We need to maintain these in the sk structure.
841 */
842
843/* Find the Key structure for an address.  */
844static struct tcp_md5sig_key *
845			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
846{
847	struct tcp_sock *tp = tcp_sk(sk);
848	int i;
849
850	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
851		return NULL;
852	for (i = 0; i < tp->md5sig_info->entries4; i++) {
853		if (tp->md5sig_info->keys4[i].addr == addr)
854			return &tp->md5sig_info->keys4[i].base;
855	}
856	return NULL;
857}
858
859struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
860					 struct sock *addr_sk)
861{
862	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
863}
864EXPORT_SYMBOL(tcp_v4_md5_lookup);
865
866static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
867						      struct request_sock *req)
868{
869	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
870}
871
872/* This can be called on a newly created socket, from other files */
873int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
874		      u8 *newkey, u8 newkeylen)
875{
876	/* Add Key to the list */
877	struct tcp_md5sig_key *key;
878	struct tcp_sock *tp = tcp_sk(sk);
879	struct tcp4_md5sig_key *keys;
880
881	key = tcp_v4_md5_do_lookup(sk, addr);
882	if (key) {
883		/* Pre-existing entry - just update that one. */
884		kfree(key->key);
885		key->key = newkey;
886		key->keylen = newkeylen;
887	} else {
888		struct tcp_md5sig_info *md5sig;
889
890		if (!tp->md5sig_info) {
891			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
892						  GFP_ATOMIC);
893			if (!tp->md5sig_info) {
894				kfree(newkey);
895				return -ENOMEM;
896			}
897			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
898		}
899		if (tcp_alloc_md5sig_pool(sk) == NULL) {
900			kfree(newkey);
901			return -ENOMEM;
902		}
903		md5sig = tp->md5sig_info;
904
905		if (md5sig->alloced4 == md5sig->entries4) {
906			keys = kmalloc((sizeof(*keys) *
907					(md5sig->entries4 + 1)), GFP_ATOMIC);
908			if (!keys) {
909				kfree(newkey);
910				tcp_free_md5sig_pool();
911				return -ENOMEM;
912			}
913
914			if (md5sig->entries4)
915				memcpy(keys, md5sig->keys4,
916				       sizeof(*keys) * md5sig->entries4);
917
918			/* Free old key list, and reference new one */
919			kfree(md5sig->keys4);
920			md5sig->keys4 = keys;
921			md5sig->alloced4++;
922		}
923		md5sig->entries4++;
924		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
925		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
926		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
927	}
928	return 0;
929}
930EXPORT_SYMBOL(tcp_v4_md5_do_add);
931
932static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
933			       u8 *newkey, u8 newkeylen)
934{
935	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
936				 newkey, newkeylen);
937}
938
939int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
940{
941	struct tcp_sock *tp = tcp_sk(sk);
942	int i;
943
944	for (i = 0; i < tp->md5sig_info->entries4; i++) {
945		if (tp->md5sig_info->keys4[i].addr == addr) {
946			/* Free the key */
947			kfree(tp->md5sig_info->keys4[i].base.key);
948			tp->md5sig_info->entries4--;
949
950			if (tp->md5sig_info->entries4 == 0) {
951				kfree(tp->md5sig_info->keys4);
952				tp->md5sig_info->keys4 = NULL;
953				tp->md5sig_info->alloced4 = 0;
954			} else if (tp->md5sig_info->entries4 != i) {
955				/* Shift the remaining entries down over the hole */
956				memmove(&tp->md5sig_info->keys4[i],
957					&tp->md5sig_info->keys4[i+1],
958					(tp->md5sig_info->entries4 - i) *
959					 sizeof(struct tcp4_md5sig_key));
960			}
961			tcp_free_md5sig_pool();
962			return 0;
963		}
964	}
965	return -ENOENT;
966}
967EXPORT_SYMBOL(tcp_v4_md5_do_del);
968
969static void tcp_v4_clear_md5_list(struct sock *sk)
970{
971	struct tcp_sock *tp = tcp_sk(sk);
972
973	/* Free each key, then the key array itself,
974	 * and then drop our reference on the
975	 * shared MD5 crypto pool.
976	 */
977	if (tp->md5sig_info->entries4) {
978		int i;
979		for (i = 0; i < tp->md5sig_info->entries4; i++)
980			kfree(tp->md5sig_info->keys4[i].base.key);
981		tp->md5sig_info->entries4 = 0;
982		tcp_free_md5sig_pool();
983	}
984	if (tp->md5sig_info->keys4) {
985		kfree(tp->md5sig_info->keys4);
986		tp->md5sig_info->keys4 = NULL;
987		tp->md5sig_info->alloced4  = 0;
988	}
989}
990
991static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
992				 int optlen)
993{
994	struct tcp_md5sig cmd;
995	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
996	u8 *newkey;
997
998	if (optlen < sizeof(cmd))
999		return -EINVAL;
1000
1001	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1002		return -EFAULT;
1003
1004	if (sin->sin_family != AF_INET)
1005		return -EINVAL;
1006
1007	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1008		if (!tcp_sk(sk)->md5sig_info)
1009			return -ENOENT;
1010		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1011	}
1012
1013	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014		return -EINVAL;
1015
1016	if (!tcp_sk(sk)->md5sig_info) {
1017		struct tcp_sock *tp = tcp_sk(sk);
1018		struct tcp_md5sig_info *p;
1019
1020		p = kzalloc(sizeof(*p), sk->sk_allocation);
1021		if (!p)
1022			return -EINVAL;
1023
1024		tp->md5sig_info = p;
1025		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1026	}
1027
1028	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1029	if (!newkey)
1030		return -ENOMEM;
1031	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1032				 newkey, cmd.tcpm_keylen);
1033}
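
For reference, the setsockopt() path parsed above is driven from userspace with the TCP_MD5SIG option and struct tcp_md5sig from <linux/tcp.h>. The helper below is a hedged sketch of that call for an IPv4 peer; header choices and error handling are simplified, and the function name is ours rather than a library API. The same key must be installed on both endpoints before the connection is established.

#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG, TCP_MD5SIG_MAXKEYLEN */
#include <netinet/in.h>		/* struct sockaddr_in, IPPROTO_TCP */
#include <string.h>
#include <sys/socket.h>

/* Illustrative userspace helper: install an RFC 2385 MD5 key for the given
 * IPv4 peer on socket fd, before connect() or accept() of that peer.
 */
static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, size_t keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}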
1034
1035static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1036					__be32 daddr, __be32 saddr, int nbytes)
1037{
1038	struct tcp4_pseudohdr *bp;
1039	struct scatterlist sg;
1040
1041	bp = &hp->md5_blk.ip4;
1042
1043	/*
1044	 * 1. the TCP pseudo-header (in the order: source IP address,
1045	 * destination IP address, zero-padded protocol number, and
1046	 * segment length)
1047	 */
1048	bp->saddr = saddr;
1049	bp->daddr = daddr;
1050	bp->pad = 0;
1051	bp->protocol = IPPROTO_TCP;
1052	bp->len = cpu_to_be16(nbytes);
1053
1054	sg_init_one(&sg, bp, sizeof(*bp));
1055	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1056}
1057
1058static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1059			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1060{
1061	struct tcp_md5sig_pool *hp;
1062	struct hash_desc *desc;
1063
1064	hp = tcp_get_md5sig_pool();
1065	if (!hp)
1066		goto clear_hash_noput;
1067	desc = &hp->md5_desc;
1068
1069	if (crypto_hash_init(desc))
1070		goto clear_hash;
1071	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1072		goto clear_hash;
1073	if (tcp_md5_hash_header(hp, th))
1074		goto clear_hash;
1075	if (tcp_md5_hash_key(hp, key))
1076		goto clear_hash;
1077	if (crypto_hash_final(desc, md5_hash))
1078		goto clear_hash;
1079
1080	tcp_put_md5sig_pool();
1081	return 0;
1082
1083clear_hash:
1084	tcp_put_md5sig_pool();
1085clear_hash_noput:
1086	memset(md5_hash, 0, 16);
1087	return 1;
1088}
1089
1090int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1091			struct sock *sk, struct request_sock *req,
1092			struct sk_buff *skb)
1093{
1094	struct tcp_md5sig_pool *hp;
1095	struct hash_desc *desc;
1096	struct tcphdr *th = tcp_hdr(skb);
1097	__be32 saddr, daddr;
1098
1099	if (sk) {
1100		saddr = inet_sk(sk)->inet_saddr;
1101		daddr = inet_sk(sk)->inet_daddr;
1102	} else if (req) {
1103		saddr = inet_rsk(req)->loc_addr;
1104		daddr = inet_rsk(req)->rmt_addr;
1105	} else {
1106		const struct iphdr *iph = ip_hdr(skb);
1107		saddr = iph->saddr;
1108		daddr = iph->daddr;
1109	}
1110
1111	hp = tcp_get_md5sig_pool();
1112	if (!hp)
1113		goto clear_hash_noput;
1114	desc = &hp->md5_desc;
1115
1116	if (crypto_hash_init(desc))
1117		goto clear_hash;
1118
1119	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1120		goto clear_hash;
1121	if (tcp_md5_hash_header(hp, th))
1122		goto clear_hash;
1123	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124		goto clear_hash;
1125	if (tcp_md5_hash_key(hp, key))
1126		goto clear_hash;
1127	if (crypto_hash_final(desc, md5_hash))
1128		goto clear_hash;
1129
1130	tcp_put_md5sig_pool();
1131	return 0;
1132
1133clear_hash:
1134	tcp_put_md5sig_pool();
1135clear_hash_noput:
1136	memset(md5_hash, 0, 16);
1137	return 1;
1138}
1139EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1140
1141static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1142{
1143	/*
1144	 * This gets called for each TCP segment that arrives
1145	 * so we want to be efficient.
1146	 * We have 3 drop cases:
1147	 * o No MD5 hash and one expected.
1148	 * o MD5 hash and we're not expecting one.
1149	 * o MD5 hash and it's wrong.
1150	 */
1151	__u8 *hash_location = NULL;
1152	struct tcp_md5sig_key *hash_expected;
1153	const struct iphdr *iph = ip_hdr(skb);
1154	struct tcphdr *th = tcp_hdr(skb);
1155	int genhash;
1156	unsigned char newhash[16];
1157
1158	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1159	hash_location = tcp_parse_md5sig_option(th);
1160
1161	/* We've parsed the options - do we have a hash? */
1162	if (!hash_expected && !hash_location)
1163		return 0;
1164
1165	if (hash_expected && !hash_location) {
1166		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1167		return 1;
1168	}
1169
1170	if (!hash_expected && hash_location) {
1171		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1172		return 1;
1173	}
1174
1175	/* Okay, so this is hash_expected and hash_location -
1176	 * so we need to calculate the hash and compare.
1177	 */
1178	genhash = tcp_v4_md5_hash_skb(newhash,
1179				      hash_expected,
1180				      NULL, NULL, skb);
1181
1182	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1183		if (net_ratelimit()) {
1184			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1185			       &iph->saddr, ntohs(th->source),
1186			       &iph->daddr, ntohs(th->dest),
1187			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1188		}
1189		return 1;
1190	}
1191	return 0;
1192}
1193
1194#endif
1195
1196struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1197	.family		=	PF_INET,
1198	.obj_size	=	sizeof(struct tcp_request_sock),
1199	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1200	.send_ack	=	tcp_v4_reqsk_send_ack,
1201	.destructor	=	tcp_v4_reqsk_destructor,
1202	.send_reset	=	tcp_v4_send_reset,
1203	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1204};
1205
1206#ifdef CONFIG_TCP_MD5SIG
1207static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1208	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1209	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1210};
1211#endif
1212
1213int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1214{
1215	struct tcp_extend_values tmp_ext;
1216	struct tcp_options_received tmp_opt;
1217	u8 *hash_location;
1218	struct request_sock *req;
1219	struct inet_request_sock *ireq;
1220	struct tcp_sock *tp = tcp_sk(sk);
1221	struct dst_entry *dst = NULL;
1222	__be32 saddr = ip_hdr(skb)->saddr;
1223	__be32 daddr = ip_hdr(skb)->daddr;
1224	__u32 isn = TCP_SKB_CB(skb)->when;
1225#ifdef CONFIG_SYN_COOKIES
1226	int want_cookie = 0;
1227#else
1228#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1229#endif
1230
1231	/* Never answer SYNs sent to broadcast or multicast */
1232	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1233		goto drop;
1234
1235	/* TW buckets are converted to open requests without
1236	 * limitation: they conserve resources, and the peer is
1237	 * evidently a real one.
1238	 */
1239	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1240		if (net_ratelimit())
1241			syn_flood_warning(skb);
1242#ifdef CONFIG_SYN_COOKIES
1243		if (sysctl_tcp_syncookies) {
1244			want_cookie = 1;
1245		} else
1246#endif
1247		goto drop;
1248	}
1249
1250	/* Accept backlog is full. If we have already queued enough
1251	 * warm entries in the syn queue, drop the request. That is better
1252	 * than clogging the syn queue with openreqs whose timeouts
1253	 * increase exponentially.
1254	 */
1255	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1256		goto drop;
1257
1258	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1259	if (!req)
1260		goto drop;
1261
1262#ifdef CONFIG_TCP_MD5SIG
1263	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1264#endif
1265
1266	tcp_clear_options(&tmp_opt);
1267	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1268	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1269	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1270
1271	if (tmp_opt.cookie_plus > 0 &&
1272	    tmp_opt.saw_tstamp &&
1273	    !tp->rx_opt.cookie_out_never &&
1274	    (sysctl_tcp_cookie_size > 0 ||
1275	     (tp->cookie_values != NULL &&
1276	      tp->cookie_values->cookie_desired > 0))) {
1277		u8 *c;
1278		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1279		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1280
1281		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1282			goto drop_and_release;
1283
1284		/* Secret recipe starts with IP addresses */
1285		*mess++ ^= (__force u32)daddr;
1286		*mess++ ^= (__force u32)saddr;
1287
1288		/* plus variable length Initiator Cookie */
1289		c = (u8 *)mess;
1290		while (l-- > 0)
1291			*c++ ^= *hash_location++;
1292
1293#ifdef CONFIG_SYN_COOKIES
1294		want_cookie = 0;	/* not our kind of cookie */
1295#endif
1296		tmp_ext.cookie_out_never = 0; /* false */
1297		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1298	} else if (!tp->rx_opt.cookie_in_always) {
1299		/* redundant indications, but ensure initialization. */
1300		tmp_ext.cookie_out_never = 1; /* true */
1301		tmp_ext.cookie_plus = 0;
1302	} else {
1303		goto drop_and_release;
1304	}
1305	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1306
1307	if (want_cookie && !tmp_opt.saw_tstamp)
1308		tcp_clear_options(&tmp_opt);
1309
1310	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1311	tcp_openreq_init(req, &tmp_opt, skb);
1312
1313	ireq = inet_rsk(req);
1314	ireq->loc_addr = daddr;
1315	ireq->rmt_addr = saddr;
1316	ireq->no_srccheck = inet_sk(sk)->transparent;
1317	ireq->opt = tcp_v4_save_options(sk, skb);
1318
1319	if (security_inet_conn_request(sk, skb, req))
1320		goto drop_and_free;
1321
1322	if (!want_cookie || tmp_opt.tstamp_ok)
1323		TCP_ECN_create_request(req, tcp_hdr(skb));
1324
1325	if (want_cookie) {
1326		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1327		req->cookie_ts = tmp_opt.tstamp_ok;
1328	} else if (!isn) {
1329		struct inet_peer *peer = NULL;
1330
1331		/* VJ's idea. We save the last timestamp seen
1332		 * from the destination in the peer table when entering
1333		 * TIME-WAIT state, and check against it before
1334		 * accepting a new connection request.
1335		 *
1336		 * If "isn" is not zero, this request hit a live
1337		 * timewait bucket, so all the necessary checks
1338		 * are made in the function processing the timewait state.
1339		 */
1340		if (tmp_opt.saw_tstamp &&
1341		    tcp_death_row.sysctl_tw_recycle &&
1342		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1343		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1344		    peer->daddr.a4 == saddr) {
1345			inet_peer_refcheck(peer);
1346			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1347			    (s32)(peer->tcp_ts - req->ts_recent) >
1348							TCP_PAWS_WINDOW) {
1349				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1350				goto drop_and_release;
1351			}
1352		}
1353		/* Kill the following clause, if you dislike this way. */
1354		else if (!sysctl_tcp_syncookies &&
1355			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1356			  (sysctl_max_syn_backlog >> 2)) &&
1357			 (!peer || !peer->tcp_ts_stamp) &&
1358			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1359			/* Without syncookies, the last quarter of the
1360			 * backlog is reserved for destinations proven
1361			 * to be alive.
1362			 * It means that we keep communicating with
1363			 * destinations that were already remembered
1364			 * by the time the synflood started.
1365			 */
1366			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1367				       &saddr, ntohs(tcp_hdr(skb)->source));
1368			goto drop_and_release;
1369		}
1370
1371		isn = tcp_v4_init_sequence(skb);
1372	}
1373	tcp_rsk(req)->snt_isn = isn;
1374
1375	if (tcp_v4_send_synack(sk, dst, req,
1376			       (struct request_values *)&tmp_ext) ||
1377	    want_cookie)
1378		goto drop_and_free;
1379
1380	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1381	return 0;
1382
1383drop_and_release:
1384	dst_release(dst);
1385drop_and_free:
1386	reqsk_free(req);
1387drop:
1388	return 0;
1389}
1390EXPORT_SYMBOL(tcp_v4_conn_request);
1391
1392
1393/*
1394 * The three way handshake has completed - we got a valid final ACK -
1395 * now create the new socket.
1396 */
1397struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1398				  struct request_sock *req,
1399				  struct dst_entry *dst)
1400{
1401	struct inet_request_sock *ireq;
1402	struct inet_sock *newinet;
1403	struct tcp_sock *newtp;
1404	struct sock *newsk;
1405#ifdef CONFIG_TCP_MD5SIG
1406	struct tcp_md5sig_key *key;
1407#endif
1408
1409	if (sk_acceptq_is_full(sk))
1410		goto exit_overflow;
1411
1412	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1413		goto exit;
1414
1415	newsk = tcp_create_openreq_child(sk, req, skb);
1416	if (!newsk)
1417		goto exit_nonewsk;
1418
1419	newsk->sk_gso_type = SKB_GSO_TCPV4;
1420	sk_setup_caps(newsk, dst);
1421
1422	newtp		      = tcp_sk(newsk);
1423	newinet		      = inet_sk(newsk);
1424	ireq		      = inet_rsk(req);
1425	newinet->inet_daddr   = ireq->rmt_addr;
1426	newinet->inet_rcv_saddr = ireq->loc_addr;
1427	newinet->inet_saddr	      = ireq->loc_addr;
1428	newinet->opt	      = ireq->opt;
1429	ireq->opt	      = NULL;
1430	newinet->mc_index     = inet_iif(skb);
1431	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1432	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1433	if (newinet->opt)
1434		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1435	newinet->inet_id = newtp->write_seq ^ jiffies;
1436
1437	tcp_mtup_init(newsk);
1438	tcp_sync_mss(newsk, dst_mtu(dst));
1439	newtp->advmss = dst_metric_advmss(dst);
1440	if (tcp_sk(sk)->rx_opt.user_mss &&
1441	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1442		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1443
1444	tcp_initialize_rcv_mss(newsk);
1445
1446#ifdef CONFIG_TCP_MD5SIG
1447	/* Copy over the MD5 key from the original socket */
1448	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1449	if (key != NULL) {
1450		/*
1451		 * We're using one, so create a matching key
1452		 * on the newsk structure. If we fail to get
1453		 * memory, then we end up not copying the key
1454		 * across. Shucks.
1455		 */
1456		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1457		if (newkey != NULL)
1458			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1459					  newkey, key->keylen);
1460		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1461	}
1462#endif
1463
1464	if (__inet_inherit_port(sk, newsk) < 0) {
1465		sock_put(newsk);
1466		goto exit;
1467	}
1468	__inet_hash_nolisten(newsk, NULL);
1469
1470	return newsk;
1471
1472exit_overflow:
1473	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1474exit_nonewsk:
1475	dst_release(dst);
1476exit:
1477	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1478	return NULL;
1479}
1480EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1481
1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483{
1484	struct tcphdr *th = tcp_hdr(skb);
1485	const struct iphdr *iph = ip_hdr(skb);
1486	struct sock *nsk;
1487	struct request_sock **prev;
1488	/* Find possible connection requests. */
1489	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1490						       iph->saddr, iph->daddr);
1491	if (req)
1492		return tcp_check_req(sk, skb, req, prev);
1493
1494	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1495			th->source, iph->daddr, th->dest, inet_iif(skb));
1496
1497	if (nsk) {
1498		if (nsk->sk_state != TCP_TIME_WAIT) {
1499			bh_lock_sock(nsk);
1500			return nsk;
1501		}
1502		inet_twsk_put(inet_twsk(nsk));
1503		return NULL;
1504	}
1505
1506#ifdef CONFIG_SYN_COOKIES
1507	if (!th->syn)
1508		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1509#endif
1510	return sk;
1511}
1512
1513static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1514{
1515	const struct iphdr *iph = ip_hdr(skb);
1516
1517	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1518		if (!tcp_v4_check(skb->len, iph->saddr,
1519				  iph->daddr, skb->csum)) {
1520			skb->ip_summed = CHECKSUM_UNNECESSARY;
1521			return 0;
1522		}
1523	}
1524
1525	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1526				       skb->len, IPPROTO_TCP, 0);
1527
1528	if (skb->len <= 76) {
1529		return __skb_checksum_complete(skb);
1530	}
1531	return 0;
1532}
1533
1534
1535/* The socket must have its spinlock held when we get
1536 * here.
1537 *
1538 * We have a potential double-lock case here, so even when
1539 * doing backlog processing we use the BH locking scheme.
1540 * This is because we cannot sleep with the original spinlock
1541 * held.
1542 */
1543int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1544{
1545	struct sock *rsk;
1546#ifdef CONFIG_TCP_MD5SIG
1547	/*
1548	 * We really want to reject the packet as early as possible
1549	 * if:
1550	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1551	 *  o There is an MD5 option and we're not expecting one
1552	 */
1553	if (tcp_v4_inbound_md5_hash(sk, skb))
1554		goto discard;
1555#endif
1556
1557	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558		sock_rps_save_rxhash(sk, skb->rxhash);
1559		TCP_CHECK_TIMER(sk);
1560		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1561			rsk = sk;
1562			goto reset;
1563		}
1564		TCP_CHECK_TIMER(sk);
1565		return 0;
1566	}
1567
1568	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1569		goto csum_err;
1570
1571	if (sk->sk_state == TCP_LISTEN) {
1572		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1573		if (!nsk)
1574			goto discard;
1575
1576		if (nsk != sk) {
1577			if (tcp_child_process(sk, nsk, skb)) {
1578				rsk = nsk;
1579				goto reset;
1580			}
1581			return 0;
1582		}
1583	} else
1584		sock_rps_save_rxhash(sk, skb->rxhash);
1585
1586
1587	TCP_CHECK_TIMER(sk);
1588	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589		rsk = sk;
1590		goto reset;
1591	}
1592	TCP_CHECK_TIMER(sk);
1593	return 0;
1594
1595reset:
1596	tcp_v4_send_reset(rsk, skb);
1597discard:
1598	kfree_skb(skb);
1599	/* Be careful here. If this function gets more complicated and
1600	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1601	 * might be destroyed here. This current version compiles correctly,
1602	 * but you have been warned.
1603	 */
1604	return 0;
1605
1606csum_err:
1607	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1608	goto discard;
1609}
1610EXPORT_SYMBOL(tcp_v4_do_rcv);
1611
1612/*
1613 *	From tcp_input.c
1614 */
1615
1616int tcp_v4_rcv(struct sk_buff *skb)
1617{
1618	const struct iphdr *iph;
1619	struct tcphdr *th;
1620	struct sock *sk;
1621	int ret;
1622	struct net *net = dev_net(skb->dev);
1623
1624	if (skb->pkt_type != PACKET_HOST)
1625		goto discard_it;
1626
1627	/* Count it even if it's bad */
1628	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1629
1630	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1631		goto discard_it;
1632
1633	th = tcp_hdr(skb);
1634
1635	if (th->doff < sizeof(struct tcphdr) / 4)
1636		goto bad_packet;
1637	if (!pskb_may_pull(skb, th->doff * 4))
1638		goto discard_it;
1639
1640	/* An explanation is required here, I think.
1641	 * Packet length and doff are validated by header prediction,
1642	 * provided the case of th->doff == 0 is eliminated.
1643	 * So, we defer the checks. */
1644	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1645		goto bad_packet;
1646
1647	th = tcp_hdr(skb);
1648	iph = ip_hdr(skb);
1649	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1650	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1651				    skb->len - th->doff * 4);
1652	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1653	TCP_SKB_CB(skb)->when	 = 0;
1654	TCP_SKB_CB(skb)->flags	 = iph->tos;
1655	TCP_SKB_CB(skb)->sacked	 = 0;
1656
1657	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1658	if (!sk)
1659		goto no_tcp_socket;
1660
1661process:
1662	if (sk->sk_state == TCP_TIME_WAIT)
1663		goto do_time_wait;
1664
1665	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1666		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1667		goto discard_and_relse;
1668	}
1669
1670	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1671		goto discard_and_relse;
1672	nf_reset(skb);
1673
1674	if (sk_filter(sk, skb))
1675		goto discard_and_relse;
1676
1677	skb->dev = NULL;
1678
1679	bh_lock_sock_nested(sk);
1680	ret = 0;
1681	if (!sock_owned_by_user(sk)) {
1682#ifdef CONFIG_NET_DMA
1683		struct tcp_sock *tp = tcp_sk(sk);
1684		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1685			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1686		if (tp->ucopy.dma_chan)
1687			ret = tcp_v4_do_rcv(sk, skb);
1688		else
1689#endif
1690		{
1691			if (!tcp_prequeue(sk, skb))
1692				ret = tcp_v4_do_rcv(sk, skb);
1693		}
1694	} else if (unlikely(sk_add_backlog(sk, skb))) {
1695		bh_unlock_sock(sk);
1696		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1697		goto discard_and_relse;
1698	}
1699	bh_unlock_sock(sk);
1700
1701	sock_put(sk);
1702
1703	return ret;
1704
1705no_tcp_socket:
1706	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1707		goto discard_it;
1708
1709	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1710bad_packet:
1711		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1712	} else {
1713		tcp_v4_send_reset(NULL, skb);
1714	}
1715
1716discard_it:
1717	/* Discard frame. */
1718	kfree_skb(skb);
1719	return 0;
1720
1721discard_and_relse:
1722	sock_put(sk);
1723	goto discard_it;
1724
1725do_time_wait:
1726	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1727		inet_twsk_put(inet_twsk(sk));
1728		goto discard_it;
1729	}
1730
1731	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1732		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1733		inet_twsk_put(inet_twsk(sk));
1734		goto discard_it;
1735	}
1736	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1737	case TCP_TW_SYN: {
1738		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1739							&tcp_hashinfo,
1740							iph->daddr, th->dest,
1741							inet_iif(skb));
1742		if (sk2) {
1743			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1744			inet_twsk_put(inet_twsk(sk));
1745			sk = sk2;
1746			goto process;
1747		}
1748		/* Fall through to ACK */
1749	}
1750	case TCP_TW_ACK:
1751		tcp_v4_timewait_ack(sk, skb);
1752		break;
1753	case TCP_TW_RST:
1754		goto no_tcp_socket;
1755	case TCP_TW_SUCCESS:;
1756	}
1757	goto discard_it;
1758}
1759
1760struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1761{
1762	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1763	struct inet_sock *inet = inet_sk(sk);
1764	struct inet_peer *peer;
1765
1766	if (!rt || rt->rt_dst != inet->inet_daddr) {
1767		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1768		*release_it = true;
1769	} else {
1770		if (!rt->peer)
1771			rt_bind_peer(rt, 1);
1772		peer = rt->peer;
1773		*release_it = false;
1774	}
1775
1776	return peer;
1777}
1778EXPORT_SYMBOL(tcp_v4_get_peer);
1779
1780void *tcp_v4_tw_get_peer(struct sock *sk)
1781{
1782	struct inet_timewait_sock *tw = inet_twsk(sk);
1783
1784	return inet_getpeer_v4(tw->tw_daddr, 1);
1785}
1786EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1787
1788static struct timewait_sock_ops tcp_timewait_sock_ops = {
1789	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1790	.twsk_unique	= tcp_twsk_unique,
1791	.twsk_destructor= tcp_twsk_destructor,
1792	.twsk_getpeer	= tcp_v4_tw_get_peer,
1793};
1794
1795const struct inet_connection_sock_af_ops ipv4_specific = {
1796	.queue_xmit	   = ip_queue_xmit,
1797	.send_check	   = tcp_v4_send_check,
1798	.rebuild_header	   = inet_sk_rebuild_header,
1799	.conn_request	   = tcp_v4_conn_request,
1800	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1801	.get_peer	   = tcp_v4_get_peer,
1802	.net_header_len	   = sizeof(struct iphdr),
1803	.setsockopt	   = ip_setsockopt,
1804	.getsockopt	   = ip_getsockopt,
1805	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1806	.sockaddr_len	   = sizeof(struct sockaddr_in),
1807	.bind_conflict	   = inet_csk_bind_conflict,
1808#ifdef CONFIG_COMPAT
1809	.compat_setsockopt = compat_ip_setsockopt,
1810	.compat_getsockopt = compat_ip_getsockopt,
1811#endif
1812};
1813EXPORT_SYMBOL(ipv4_specific);
1814
1815#ifdef CONFIG_TCP_MD5SIG
1816static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1817	.md5_lookup		= tcp_v4_md5_lookup,
1818	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1819	.md5_add		= tcp_v4_md5_add_func,
1820	.md5_parse		= tcp_v4_parse_md5_keys,
1821};
1822#endif
1823
1824/* NOTE: A lot of things are set to zero explicitly by the call to
1825 *       sk_alloc(), so they need not be done here.
1826 */
1827static int tcp_v4_init_sock(struct sock *sk)
1828{
1829	struct inet_connection_sock *icsk = inet_csk(sk);
1830	struct tcp_sock *tp = tcp_sk(sk);
1831
1832	skb_queue_head_init(&tp->out_of_order_queue);
1833	tcp_init_xmit_timers(sk);
1834	tcp_prequeue_init(tp);
1835
1836	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1837	tp->mdev = TCP_TIMEOUT_INIT;
1838
1839	/* So many TCP implementations out there (incorrectly) count the
1840	 * initial SYN frame in their delayed-ACK and congestion control
1841	 * algorithms that we must have the following bandaid to talk
1842	 * efficiently to them.  -DaveM
1843	 */
1844	tp->snd_cwnd = 2;
1845
1846	/* See draft-stevens-tcpca-spec-01 for discussion of the
1847	 * initialization of these values.
1848	 */
1849	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1850	tp->snd_cwnd_clamp = ~0;
1851	tp->mss_cache = TCP_MSS_DEFAULT;
1852
1853	tp->reordering = sysctl_tcp_reordering;
1854	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1855
1856	sk->sk_state = TCP_CLOSE;
1857
1858	sk->sk_write_space = sk_stream_write_space;
1859	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1860
1861	icsk->icsk_af_ops = &ipv4_specific;
1862	icsk->icsk_sync_mss = tcp_sync_mss;
1863#ifdef CONFIG_TCP_MD5SIG
1864	tp->af_specific = &tcp_sock_ipv4_specific;
1865#endif
1866
1867	/* TCP Cookie Transactions */
1868	if (sysctl_tcp_cookie_size > 0) {
1869		/* Default, cookies without s_data_payload. */
1870		tp->cookie_values =
1871			kzalloc(sizeof(*tp->cookie_values),
1872				sk->sk_allocation);
1873		if (tp->cookie_values != NULL)
1874			kref_init(&tp->cookie_values->kref);
1875	}
1876	/* Presumed zeroed, in order of appearance:
1877	 *	cookie_in_always, cookie_out_never,
1878	 *	s_data_constant, s_data_in, s_data_out
1879	 */
1880	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1881	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1882
1883	local_bh_disable();
1884	percpu_counter_inc(&tcp_sockets_allocated);
1885	local_bh_enable();
1886
1887	return 0;
1888}
1889
1890void tcp_v4_destroy_sock(struct sock *sk)
1891{
1892	struct tcp_sock *tp = tcp_sk(sk);
1893
1894	tcp_clear_xmit_timers(sk);
1895
1896	tcp_cleanup_congestion_control(sk);
1897
1898	/* Clean up the write buffer. */
1899	tcp_write_queue_purge(sk);
1900
1901	/* Cleans up our, hopefully empty, out_of_order_queue. */
1902	__skb_queue_purge(&tp->out_of_order_queue);
1903
1904#ifdef CONFIG_TCP_MD5SIG
1905	/* Clean up the MD5 key list, if any */
1906	if (tp->md5sig_info) {
1907		tcp_v4_clear_md5_list(sk);
1908		kfree(tp->md5sig_info);
1909		tp->md5sig_info = NULL;
1910	}
1911#endif
1912
1913#ifdef CONFIG_NET_DMA
1914	/* Cleans up our sk_async_wait_queue */
1915	__skb_queue_purge(&sk->sk_async_wait_queue);
1916#endif
1917
1918	/* Clean the prequeue; it really should be empty already */
1919	__skb_queue_purge(&tp->ucopy.prequeue);
1920
1921	/* Clean up a referenced TCP bind bucket. */
1922	if (inet_csk(sk)->icsk_bind_hash)
1923		inet_put_port(sk);
1924
1925	/*
1926	 * If sendmsg cached page exists, toss it.
1927	 */
1928	if (sk->sk_sndmsg_page) {
1929		__free_page(sk->sk_sndmsg_page);
1930		sk->sk_sndmsg_page = NULL;
1931	}
1932
1933	/* TCP Cookie Transactions */
1934	if (tp->cookie_values != NULL) {
1935		kref_put(&tp->cookie_values->kref,
1936			 tcp_cookie_values_release);
1937		tp->cookie_values = NULL;
1938	}
1939
1940	percpu_counter_dec(&tcp_sockets_allocated);
1941}
1942EXPORT_SYMBOL(tcp_v4_destroy_sock);
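
/*
 * The cookie_values lifetime handled in tcp_v4_init_sock() and above is the
 * standard kref pattern: kref_init() when the object is allocated and
 * kref_put() with a release callback when a reference is dropped.  A minimal
 * sketch of that pattern with hypothetical names (example_obj,
 * example_release); only the kzalloc/kref calls mirror the code above.
 */
#if 0
struct example_obj {
	struct kref kref;
	/* ... payload ... */
};

static void example_release(struct kref *kref)
{
	struct example_obj *obj = container_of(kref, struct example_obj, kref);

	kfree(obj);
}

static struct example_obj *example_alloc(gfp_t gfp)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), gfp);

	if (obj != NULL)
		kref_init(&obj->kref);		/* reference count starts at 1 */
	return obj;
}

static void example_put(struct example_obj *obj)
{
	kref_put(&obj->kref, example_release);	/* frees on the last put */
}
#endif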
1943
1944#ifdef CONFIG_PROC_FS
1945/* Proc filesystem TCP sock list dumping. */
1946
1947static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1948{
1949	return hlist_nulls_empty(head) ? NULL :
1950		list_entry(head->first, struct inet_timewait_sock, tw_node);
1951}
1952
1953static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1954{
1955	return !is_a_nulls(tw->tw_node.next) ?
1956		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1957}
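
/*
 * tw_head()/tw_next() above walk an hlist_nulls chain.  Such a chain ends
 * not in NULL but in a "nulls" marker that encodes a value (normally the
 * hash bucket), so a lockless reader that raced with an entry being moved
 * to a different chain can notice it finished in the wrong bucket and
 * restart.  A minimal sketch of that generic pattern with hypothetical
 * names (my_obj, my_lookup); the iteration helpers themselves
 * (hlist_nulls_for_each_entry_rcu, get_nulls_value) are the real ones.
 */
#if 0
struct my_obj {
	struct hlist_nulls_node node;
	unsigned int		key;
};

static struct my_obj *my_lookup(struct hlist_nulls_head *head,
				unsigned int bucket, unsigned int key)
{
	struct hlist_nulls_node *pos;
	struct my_obj *obj;

begin:
	hlist_nulls_for_each_entry_rcu(obj, pos, head, node) {
		if (obj->key == key)
			return obj;
	}
	/*
	 * The nulls value at the end of the chain names the bucket it
	 * belongs to; if it is not ours, we were moved mid-walk: restart.
	 */
	if (get_nulls_value(pos) != bucket)
		goto begin;
	return NULL;
}
#endif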
1958
1959/*
1960 * Get the next listener socket following cur.  If cur is NULL, get the
1961 * first socket, starting from the bucket given in st->bucket; when
1962 * st->bucket is zero, the very first socket in the hash table is returned.
1963 */
1964static void *listening_get_next(struct seq_file *seq, void *cur)
1965{
1966	struct inet_connection_sock *icsk;
1967	struct hlist_nulls_node *node;
1968	struct sock *sk = cur;
1969	struct inet_listen_hashbucket *ilb;
1970	struct tcp_iter_state *st = seq->private;
1971	struct net *net = seq_file_net(seq);
1972
1973	if (!sk) {
1974		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1975		spin_lock_bh(&ilb->lock);
1976		sk = sk_nulls_head(&ilb->head);
1977		st->offset = 0;
1978		goto get_sk;
1979	}
1980	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1981	++st->num;
1982	++st->offset;
1983
1984	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1985		struct request_sock *req = cur;
1986
1987		icsk = inet_csk(st->syn_wait_sk);
1988		req = req->dl_next;
1989		while (1) {
1990			while (req) {
1991				if (req->rsk_ops->family == st->family) {
1992					cur = req;
1993					goto out;
1994				}
1995				req = req->dl_next;
1996			}
1997			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1998				break;
1999get_req:
2000			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2001		}
2002		sk	  = sk_nulls_next(st->syn_wait_sk);
2003		st->state = TCP_SEQ_STATE_LISTENING;
2004		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005	} else {
2006		icsk = inet_csk(sk);
2007		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2009			goto start_req;
2010		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011		sk = sk_nulls_next(sk);
2012	}
2013get_sk:
2014	sk_nulls_for_each_from(sk, node) {
2015		if (!net_eq(sock_net(sk), net))
2016			continue;
2017		if (sk->sk_family == st->family) {
2018			cur = sk;
2019			goto out;
2020		}
2021		icsk = inet_csk(sk);
2022		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2024start_req:
2025			st->uid		= sock_i_uid(sk);
2026			st->syn_wait_sk = sk;
2027			st->state	= TCP_SEQ_STATE_OPENREQ;
2028			st->sbucket	= 0;
2029			goto get_req;
2030		}
2031		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2032	}
2033	spin_unlock_bh(&ilb->lock);
2034	st->offset = 0;
2035	if (++st->bucket < INET_LHTABLE_SIZE) {
2036		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2037		spin_lock_bh(&ilb->lock);
2038		sk = sk_nulls_head(&ilb->head);
2039		goto get_sk;
2040	}
2041	cur = NULL;
2042out:
2043	return cur;
2044}
2045
2046static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2047{
2048	struct tcp_iter_state *st = seq->private;
2049	void *rc;
2050
2051	st->bucket = 0;
2052	st->offset = 0;
2053	rc = listening_get_next(seq, NULL);
2054
2055	while (rc && *pos) {
2056		rc = listening_get_next(seq, rc);
2057		--*pos;
2058	}
2059	return rc;
2060}
2061
2062static inline int empty_bucket(struct tcp_iter_state *st)
2063{
2064	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2065		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2066}
2067
2068/*
2069 * Get the first established socket, starting from the bucket given in st->bucket.
2070 * If st->bucket is zero, the very first socket in the hash is returned.
2071 */
2072static void *established_get_first(struct seq_file *seq)
2073{
2074	struct tcp_iter_state *st = seq->private;
2075	struct net *net = seq_file_net(seq);
2076	void *rc = NULL;
2077
2078	st->offset = 0;
2079	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2080		struct sock *sk;
2081		struct hlist_nulls_node *node;
2082		struct inet_timewait_sock *tw;
2083		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2084
2085		/* Lockless fast path for the common case of empty buckets */
2086		if (empty_bucket(st))
2087			continue;
2088
2089		spin_lock_bh(lock);
2090		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2091			if (sk->sk_family != st->family ||
2092			    !net_eq(sock_net(sk), net)) {
2093				continue;
2094			}
2095			rc = sk;
2096			goto out;
2097		}
2098		st->state = TCP_SEQ_STATE_TIME_WAIT;
2099		inet_twsk_for_each(tw, node,
2100				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2101			if (tw->tw_family != st->family ||
2102			    !net_eq(twsk_net(tw), net)) {
2103				continue;
2104			}
2105			rc = tw;
2106			goto out;
2107		}
2108		spin_unlock_bh(lock);
2109		st->state = TCP_SEQ_STATE_ESTABLISHED;
2110	}
2111out:
2112	return rc;
2113}
2114
2115static void *established_get_next(struct seq_file *seq, void *cur)
2116{
2117	struct sock *sk = cur;
2118	struct inet_timewait_sock *tw;
2119	struct hlist_nulls_node *node;
2120	struct tcp_iter_state *st = seq->private;
2121	struct net *net = seq_file_net(seq);
2122
2123	++st->num;
2124	++st->offset;
2125
2126	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2127		tw = cur;
2128		tw = tw_next(tw);
2129get_tw:
2130		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2131			tw = tw_next(tw);
2132		}
2133		if (tw) {
2134			cur = tw;
2135			goto out;
2136		}
2137		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2138		st->state = TCP_SEQ_STATE_ESTABLISHED;
2139
2140		/* Look for the next non-empty bucket */
2141		st->offset = 0;
2142		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2143				empty_bucket(st))
2144			;
2145		if (st->bucket > tcp_hashinfo.ehash_mask)
2146			return NULL;
2147
2148		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2149		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2150	} else
2151		sk = sk_nulls_next(sk);
2152
2153	sk_nulls_for_each_from(sk, node) {
2154		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2155			goto found;
2156	}
2157
2158	st->state = TCP_SEQ_STATE_TIME_WAIT;
2159	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2160	goto get_tw;
2161found:
2162	cur = sk;
2163out:
2164	return cur;
2165}
2166
2167static void *established_get_idx(struct seq_file *seq, loff_t pos)
2168{
2169	struct tcp_iter_state *st = seq->private;
2170	void *rc;
2171
2172	st->bucket = 0;
2173	rc = established_get_first(seq);
2174
2175	while (rc && pos) {
2176		rc = established_get_next(seq, rc);
2177		--pos;
2178	}
2179	return rc;
2180}
2181
2182static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2183{
2184	void *rc;
2185	struct tcp_iter_state *st = seq->private;
2186
2187	st->state = TCP_SEQ_STATE_LISTENING;
2188	rc	  = listening_get_idx(seq, &pos);
2189
2190	if (!rc) {
2191		st->state = TCP_SEQ_STATE_ESTABLISHED;
2192		rc	  = established_get_idx(seq, pos);
2193	}
2194
2195	return rc;
2196}
2197
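/*
 * Resume the walk at the position cached by the previous read of the seq
 * file: st->state, st->bucket and st->offset survive between reads, so a
 * large /proc/net/tcp dump does not rescan every bucket from zero for each
 * chunk.  Returns NULL if the cached position can no longer be reached, in
 * which case tcp_seq_start() falls back to a full tcp_get_idx() walk.
 */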
2198static void *tcp_seek_last_pos(struct seq_file *seq)
2199{
2200	struct tcp_iter_state *st = seq->private;
2201	int offset = st->offset;
2202	int orig_num = st->num;
2203	void *rc = NULL;
2204
2205	switch (st->state) {
2206	case TCP_SEQ_STATE_OPENREQ:
2207	case TCP_SEQ_STATE_LISTENING:
2208		if (st->bucket >= INET_LHTABLE_SIZE)
2209			break;
2210		st->state = TCP_SEQ_STATE_LISTENING;
2211		rc = listening_get_next(seq, NULL);
2212		while (offset-- && rc)
2213			rc = listening_get_next(seq, rc);
2214		if (rc)
2215			break;
2216		st->bucket = 0;
2217		/* Fallthrough */
2218	case TCP_SEQ_STATE_ESTABLISHED:
2219	case TCP_SEQ_STATE_TIME_WAIT:
2220		st->state = TCP_SEQ_STATE_ESTABLISHED;
2221		if (st->bucket > tcp_hashinfo.ehash_mask)
2222			break;
2223		rc = established_get_first(seq);
2224		while (offset-- && rc)
2225			rc = established_get_next(seq, rc);
2226	}
2227
2228	st->num = orig_num;
2229
2230	return rc;
2231}
2232
2233static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2234{
2235	struct tcp_iter_state *st = seq->private;
2236	void *rc;
2237
2238	if (*pos && *pos == st->last_pos) {
2239		rc = tcp_seek_last_pos(seq);
2240		if (rc)
2241			goto out;
2242	}
2243
2244	st->state = TCP_SEQ_STATE_LISTENING;
2245	st->num = 0;
2246	st->bucket = 0;
2247	st->offset = 0;
2248	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2249
2250out:
2251	st->last_pos = *pos;
2252	return rc;
2253}
2254
2255static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256{
2257	struct tcp_iter_state *st = seq->private;
2258	void *rc = NULL;
2259
2260	if (v == SEQ_START_TOKEN) {
2261		rc = tcp_get_idx(seq, 0);
2262		goto out;
2263	}
2264
2265	switch (st->state) {
2266	case TCP_SEQ_STATE_OPENREQ:
2267	case TCP_SEQ_STATE_LISTENING:
2268		rc = listening_get_next(seq, v);
2269		if (!rc) {
2270			st->state = TCP_SEQ_STATE_ESTABLISHED;
2271			st->bucket = 0;
2272			st->offset = 0;
2273			rc	  = established_get_first(seq);
2274		}
2275		break;
2276	case TCP_SEQ_STATE_ESTABLISHED:
2277	case TCP_SEQ_STATE_TIME_WAIT:
2278		rc = established_get_next(seq, v);
2279		break;
2280	}
2281out:
2282	++*pos;
2283	st->last_pos = *pos;
2284	return rc;
2285}
2286
2287static void tcp_seq_stop(struct seq_file *seq, void *v)
2288{
2289	struct tcp_iter_state *st = seq->private;
2290
2291	switch (st->state) {
2292	case TCP_SEQ_STATE_OPENREQ:
2293		if (v) {
2294			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2295			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2296		}
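		/* Fallthrough: also drop the listening bucket lock below */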
2297	case TCP_SEQ_STATE_LISTENING:
2298		if (v != SEQ_START_TOKEN)
2299			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2300		break;
2301	case TCP_SEQ_STATE_TIME_WAIT:
2302	case TCP_SEQ_STATE_ESTABLISHED:
2303		if (v)
2304			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2305		break;
2306	}
2307}
2308
2309static int tcp_seq_open(struct inode *inode, struct file *file)
2310{
2311	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2312	struct tcp_iter_state *s;
2313	int err;
2314
2315	err = seq_open_net(inode, file, &afinfo->seq_ops,
2316			  sizeof(struct tcp_iter_state));
2317	if (err < 0)
2318		return err;
2319
2320	s = ((struct seq_file *)file->private_data)->private;
2321	s->family		= afinfo->family;
2322	s->last_pos 		= 0;
2323	return 0;
2324}
2325
2326int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2327{
2328	int rc = 0;
2329	struct proc_dir_entry *p;
2330
2331	afinfo->seq_fops.open		= tcp_seq_open;
2332	afinfo->seq_fops.read		= seq_read;
2333	afinfo->seq_fops.llseek		= seq_lseek;
2334	afinfo->seq_fops.release	= seq_release_net;
2335
2336	afinfo->seq_ops.start		= tcp_seq_start;
2337	afinfo->seq_ops.next		= tcp_seq_next;
2338	afinfo->seq_ops.stop		= tcp_seq_stop;
2339
2340	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2341			     &afinfo->seq_fops, afinfo);
2342	if (!p)
2343		rc = -ENOMEM;
2344	return rc;
2345}
2346EXPORT_SYMBOL(tcp_proc_register);
2347
2348void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2349{
2350	proc_net_remove(net, afinfo->name);
2351}
2352EXPORT_SYMBOL(tcp_proc_unregister);
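
/*
 * A caller hooks into the two helpers above by filling in a tcp_seq_afinfo
 * and registering it from its pernet init/exit; tcp4_seq_afinfo below is the
 * in-tree IPv4 instance of exactly this pattern.  A minimal sketch with
 * hypothetical names (example_seq_afinfo, example_seq_show); note that
 * tcp_proc_register() fills in the start/next/stop seq_ops and the fops.
 */
#if 0
static int example_seq_show(struct seq_file *seq, void *v);

static struct tcp_seq_afinfo example_seq_afinfo = {
	.name		= "tcp_example",	/* appears as /proc/net/tcp_example */
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= example_seq_show,
	},
};

static int __net_init example_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &example_seq_afinfo);
}

static void __net_exit example_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &example_seq_afinfo);
}
#endif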
2353
2354static void get_openreq4(struct sock *sk, struct request_sock *req,
2355			 struct seq_file *f, int i, int uid, int *len)
2356{
2357	const struct inet_request_sock *ireq = inet_rsk(req);
2358	int ttd = req->expires - jiffies;
2359
2360	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2361		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2362		i,
2363		ireq->loc_addr,
2364		ntohs(inet_sk(sk)->inet_sport),
2365		ireq->rmt_addr,
2366		ntohs(ireq->rmt_port),
2367		TCP_SYN_RECV,
2368		0, 0, /* could print option size, but that is af dependent. */
2369		1,    /* timers active (only the expire timer) */
2370		jiffies_to_clock_t(ttd),
2371		req->retrans,
2372		uid,
2373		0,  /* non-standard timer */
2374		0, /* open_requests have no inode */
2375		atomic_read(&sk->sk_refcnt),
2376		req,
2377		len);
2378}
2379
2380static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2381{
2382	int timer_active;
2383	unsigned long timer_expires;
2384	struct tcp_sock *tp = tcp_sk(sk);
2385	const struct inet_connection_sock *icsk = inet_csk(sk);
2386	struct inet_sock *inet = inet_sk(sk);
2387	__be32 dest = inet->inet_daddr;
2388	__be32 src = inet->inet_rcv_saddr;
2389	__u16 destp = ntohs(inet->inet_dport);
2390	__u16 srcp = ntohs(inet->inet_sport);
2391	int rx_queue;
2392
2393	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2394		timer_active	= 1;
2395		timer_expires	= icsk->icsk_timeout;
2396	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2397		timer_active	= 4;
2398		timer_expires	= icsk->icsk_timeout;
2399	} else if (timer_pending(&sk->sk_timer)) {
2400		timer_active	= 2;
2401		timer_expires	= sk->sk_timer.expires;
2402	} else {
2403		timer_active	= 0;
2404		timer_expires = jiffies;
2405	}
2406
2407	if (sk->sk_state == TCP_LISTEN)
2408		rx_queue = sk->sk_ack_backlog;
2409	else
2410		/*
2411		 * Because we don't lock the socket, we might find a transient negative value.
2412		 */
2413		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2414
2415	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2416			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2417		i, src, srcp, dest, destp, sk->sk_state,
2418		tp->write_seq - tp->snd_una,
2419		rx_queue,
2420		timer_active,
2421		jiffies_to_clock_t(timer_expires - jiffies),
2422		icsk->icsk_retransmits,
2423		sock_i_uid(sk),
2424		icsk->icsk_probes_out,
2425		sock_i_ino(sk),
2426		atomic_read(&sk->sk_refcnt), sk,
2427		jiffies_to_clock_t(icsk->icsk_rto),
2428		jiffies_to_clock_t(icsk->icsk_ack.ato),
2429		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2430		tp->snd_cwnd,
2431		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2432		len);
2433}
2434
2435static void get_timewait4_sock(struct inet_timewait_sock *tw,
2436			       struct seq_file *f, int i, int *len)
2437{
2438	__be32 dest, src;
2439	__u16 destp, srcp;
2440	int ttd = tw->tw_ttd - jiffies;
2441
2442	if (ttd < 0)
2443		ttd = 0;
2444
2445	dest  = tw->tw_daddr;
2446	src   = tw->tw_rcv_saddr;
2447	destp = ntohs(tw->tw_dport);
2448	srcp  = ntohs(tw->tw_sport);
2449
2450	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2451		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2452		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2453		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2454		atomic_read(&tw->tw_refcnt), tw, len);
2455}
2456
2457#define TMPSZ 150
2458
2459static int tcp4_seq_show(struct seq_file *seq, void *v)
2460{
2461	struct tcp_iter_state *st;
2462	int len;
2463
2464	if (v == SEQ_START_TOKEN) {
2465		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2466			   "  sl  local_address rem_address   st tx_queue "
2467			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2468			   "inode");
2469		goto out;
2470	}
2471	st = seq->private;
2472
2473	switch (st->state) {
2474	case TCP_SEQ_STATE_LISTENING:
2475	case TCP_SEQ_STATE_ESTABLISHED:
2476		get_tcp4_sock(v, seq, st->num, &len);
2477		break;
2478	case TCP_SEQ_STATE_OPENREQ:
2479		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2480		break;
2481	case TCP_SEQ_STATE_TIME_WAIT:
2482		get_timewait4_sock(v, seq, st->num, &len);
2483		break;
2484	}
2485	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2486out:
2487	return 0;
2488}
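
/*
 * The rows emitted above are what user-space tools such as netstat parse
 * out of /proc/net/tcp.  A minimal user-space sketch, not part of this
 * file: it skips the header line and pulls the addresses, ports and state
 * out of one row.  The addresses are printed by get_tcp4_sock() as the raw
 * __be32 value in hex, so assigning the parsed integer straight into
 * in_addr reproduces the network-order address when the reader runs on the
 * same machine (same endianness) as the kernel that wrote it.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>

static void parse_tcp_row(const char *line)
{
	unsigned int sl, laddr, lport, raddr, rport, state;
	struct in_addr in;

	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &sl, &laddr, &lport, &raddr, &rport, &state) != 6)
		return;

	in.s_addr = laddr;
	printf("%s:%u -> ", inet_ntoa(in), lport);
	in.s_addr = raddr;
	printf("%s:%u state %02X\n", inet_ntoa(in), rport, state);
}
#endif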
2489
2490static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2491	.name		= "tcp",
2492	.family		= AF_INET,
2493	.seq_fops	= {
2494		.owner		= THIS_MODULE,
2495	},
2496	.seq_ops	= {
2497		.show		= tcp4_seq_show,
2498	},
2499};
2500
2501static int __net_init tcp4_proc_init_net(struct net *net)
2502{
2503	return tcp_proc_register(net, &tcp4_seq_afinfo);
2504}
2505
2506static void __net_exit tcp4_proc_exit_net(struct net *net)
2507{
2508	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2509}
2510
2511static struct pernet_operations tcp4_net_ops = {
2512	.init = tcp4_proc_init_net,
2513	.exit = tcp4_proc_exit_net,
2514};
2515
2516int __init tcp4_proc_init(void)
2517{
2518	return register_pernet_subsys(&tcp4_net_ops);
2519}
2520
2521void tcp4_proc_exit(void)
2522{
2523	unregister_pernet_subsys(&tcp4_net_ops);
2524}
2525#endif /* CONFIG_PROC_FS */
2526
2527struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2528{
2529	struct iphdr *iph = skb_gro_network_header(skb);
2530
2531	switch (skb->ip_summed) {
2532	case CHECKSUM_COMPLETE:
2533		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2534				  skb->csum)) {
2535			skb->ip_summed = CHECKSUM_UNNECESSARY;
2536			break;
2537		}
2538
2539		/* fall through */
2540	case CHECKSUM_NONE:
2541		NAPI_GRO_CB(skb)->flush = 1;
2542		return NULL;
2543	}
2544
2545	return tcp_gro_receive(head, skb);
2546}
2547
2548int tcp4_gro_complete(struct sk_buff *skb)
2549{
2550	struct iphdr *iph = ip_hdr(skb);
2551	struct tcphdr *th = tcp_hdr(skb);
2552
2553	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2554				  iph->saddr, iph->daddr, 0);
2555	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2556
2557	return tcp_gro_complete(skb);
2558}
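
/*
 * tcp4_gro_complete() above seeds th->check with ~tcp_v4_check(len, saddr,
 * daddr, 0), i.e. the complement of the folded pseudo-header sum, which
 * matches the seeding the CHECKSUM_PARTIAL transmit path uses: whoever
 * finishes the checksum only has to add the TCP header and payload bytes
 * and fold.  A stand-alone sketch of that arithmetic (plain RFC 1071
 * one's-complement sum, user space, hypothetical helper names).
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* One's-complement sum of a buffer, accumulated into 'sum'. */
static uint32_t csum_add_buf(uint32_t sum, const void *buf, size_t len)
{
	const uint16_t *p = buf;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len)
		sum += *(const uint8_t *)p;
	return sum;
}

/* Fold the 32-bit accumulator to 16 bits and complement it. */
static uint16_t csum_fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/*
 * Pseudo-header contribution for a TCP/IPv4 segment of 'len' bytes:
 * saddr, daddr, zero+protocol, length.  Folding and complementing this is
 * what tcp_v4_check(len, saddr, daddr, 0) returns; adding the segment
 * bytes before folding yields the final checksum.
 */
static uint32_t tcp4_pseudo_sum(uint32_t saddr_be, uint32_t daddr_be,
				uint16_t len)
{
	uint32_t sum = 0;

	sum = csum_add_buf(sum, &saddr_be, 4);
	sum = csum_add_buf(sum, &daddr_be, 4);
	sum += htons(IPPROTO_TCP);
	sum += htons(len);
	return sum;
}
#endif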
2559
2560struct proto tcp_prot = {
2561	.name			= "TCP",
2562	.owner			= THIS_MODULE,
2563	.close			= tcp_close,
2564	.connect		= tcp_v4_connect,
2565	.disconnect		= tcp_disconnect,
2566	.accept			= inet_csk_accept,
2567	.ioctl			= tcp_ioctl,
2568	.init			= tcp_v4_init_sock,
2569	.destroy		= tcp_v4_destroy_sock,
2570	.shutdown		= tcp_shutdown,
2571	.setsockopt		= tcp_setsockopt,
2572	.getsockopt		= tcp_getsockopt,
2573	.recvmsg		= tcp_recvmsg,
2574	.sendmsg		= tcp_sendmsg,
2575	.sendpage		= tcp_sendpage,
2576	.backlog_rcv		= tcp_v4_do_rcv,
2577	.hash			= inet_hash,
2578	.unhash			= inet_unhash,
2579	.get_port		= inet_csk_get_port,
2580	.enter_memory_pressure	= tcp_enter_memory_pressure,
2581	.sockets_allocated	= &tcp_sockets_allocated,
2582	.orphan_count		= &tcp_orphan_count,
2583	.memory_allocated	= &tcp_memory_allocated,
2584	.memory_pressure	= &tcp_memory_pressure,
2585	.sysctl_mem		= sysctl_tcp_mem,
2586	.sysctl_wmem		= sysctl_tcp_wmem,
2587	.sysctl_rmem		= sysctl_tcp_rmem,
2588	.max_header		= MAX_TCP_HEADER,
2589	.obj_size		= sizeof(struct tcp_sock),
2590	.slab_flags		= SLAB_DESTROY_BY_RCU,
2591	.twsk_prot		= &tcp_timewait_sock_ops,
2592	.rsk_prot		= &tcp_request_sock_ops,
2593	.h.hashinfo		= &tcp_hashinfo,
2594	.no_autobind		= true,
2595#ifdef CONFIG_COMPAT
2596	.compat_setsockopt	= compat_tcp_setsockopt,
2597	.compat_getsockopt	= compat_tcp_getsockopt,
2598#endif
2599};
2600EXPORT_SYMBOL(tcp_prot);
2601
2602
2603static int __net_init tcp_sk_init(struct net *net)
2604{
2605	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2606				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2607}
2608
2609static void __net_exit tcp_sk_exit(struct net *net)
2610{
2611	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2612}
2613
2614static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2615{
2616	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2617}
2618
2619static struct pernet_operations __net_initdata tcp_sk_ops = {
2620       .init	   = tcp_sk_init,
2621       .exit	   = tcp_sk_exit,
2622       .exit_batch = tcp_sk_exit_batch,
2623};
2624
2625void __init tcp_v4_init(void)
2626{
2627	inet_hashinfo_init(&tcp_hashinfo);
2628	if (register_pernet_subsys(&tcp_sk_ops))
2629		panic("Failed to create the TCP control socket.\n");
2630}
2631