tcp_ipv4.c revision 70e7341673a47fb1525cfc7d6651cc98b5348928
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53#define pr_fmt(fmt) "TCP: " fmt
54
55#include <linux/bottom_half.h>
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64#include <linux/slab.h>
65
66#include <net/net_namespace.h>
67#include <net/icmp.h>
68#include <net/inet_hashtables.h>
69#include <net/tcp.h>
70#include <net/transp_v6.h>
71#include <net/ipv6.h>
72#include <net/inet_common.h>
73#include <net/timewait_sock.h>
74#include <net/xfrm.h>
75#include <net/netdma.h>
76#include <net/secure_seq.h>
77#include <net/tcp_memcontrol.h>
78
79#include <linux/inet.h>
80#include <linux/ipv6.h>
81#include <linux/stddef.h>
82#include <linux/proc_fs.h>
83#include <linux/seq_file.h>
84
85#include <linux/crypto.h>
86#include <linux/scatterlist.h>
87
88int sysctl_tcp_tw_reuse __read_mostly;
89int sysctl_tcp_low_latency __read_mostly;
90EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93#ifdef CONFIG_TCP_MD5SIG
94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96#endif
97
98struct inet_hashinfo tcp_hashinfo;
99EXPORT_SYMBOL(tcp_hashinfo);
100
101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102{
103	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104					  ip_hdr(skb)->saddr,
105					  tcp_hdr(skb)->dest,
106					  tcp_hdr(skb)->source);
107}
108
109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110{
111	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112	struct tcp_sock *tp = tcp_sk(sk);
113
114	/* With PAWS, it is safe from the viewpoint
115	   of data integrity. Even without PAWS it is safe provided sequence
116	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118	   Actually, the idea is close to VJ's: the timestamp cache is
119	   held not per host but per port pair, and the TW bucket is used as
120	   the state holder.
121
122	   If the TW bucket has already been destroyed we fall back to VJ's
123	   scheme and use the initial timestamp retrieved from the peer table.
124	 */
125	if (tcptw->tw_ts_recent_stamp &&
126	    (twp == NULL || (sysctl_tcp_tw_reuse &&
127			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129		if (tp->write_seq == 0)
130			tp->write_seq = 1;
131		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
132		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133		sock_hold(sktw);
134		return 1;
135	}
136
137	return 0;
138}
139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
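/* A minimal worked example of the reuse check above, assuming the
 * net.ipv4.tcp_tw_reuse sysctl is enabled; the numbers are illustrative
 * only:
 *
 *	tw_ts_recent_stamp = 1000    (last timestamp second seen from the peer)
 *	get_seconds()      = 1002    -> 1002 - 1000 > 1, so the TIME-WAIT
 *	                                socket may be reused for the new
 *	                                outgoing connection
 *	write_seq          = tw_snd_nxt + 65535 + 2, which keeps the new
 *	                                sequence space ahead of the old one
 */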
140
141static int tcp_repair_connect(struct sock *sk)
142{
143	tcp_connect_init(sk);
144	tcp_finish_connect(sk, NULL);
145
146	return 0;
147}
148
149/* This will initiate an outgoing connection. */
150int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151{
152	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153	struct inet_sock *inet = inet_sk(sk);
154	struct tcp_sock *tp = tcp_sk(sk);
155	__be16 orig_sport, orig_dport;
156	__be32 daddr, nexthop;
157	struct flowi4 *fl4;
158	struct rtable *rt;
159	int err;
160	struct ip_options_rcu *inet_opt;
161
162	if (addr_len < sizeof(struct sockaddr_in))
163		return -EINVAL;
164
165	if (usin->sin_family != AF_INET)
166		return -EAFNOSUPPORT;
167
168	nexthop = daddr = usin->sin_addr.s_addr;
169	inet_opt = rcu_dereference_protected(inet->inet_opt,
170					     sock_owned_by_user(sk));
171	if (inet_opt && inet_opt->opt.srr) {
172		if (!daddr)
173			return -EINVAL;
174		nexthop = inet_opt->opt.faddr;
175	}
176
177	orig_sport = inet->inet_sport;
178	orig_dport = usin->sin_port;
179	fl4 = &inet->cork.fl.u.ip4;
180	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182			      IPPROTO_TCP,
183			      orig_sport, orig_dport, sk, true);
184	if (IS_ERR(rt)) {
185		err = PTR_ERR(rt);
186		if (err == -ENETUNREACH)
187			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
188		return err;
189	}
190
191	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
192		ip_rt_put(rt);
193		return -ENETUNREACH;
194	}
195
196	if (!inet_opt || !inet_opt->opt.srr)
197		daddr = fl4->daddr;
198
199	if (!inet->inet_saddr)
200		inet->inet_saddr = fl4->saddr;
201	inet->inet_rcv_saddr = inet->inet_saddr;
202
203	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204		/* Reset inherited state */
205		tp->rx_opt.ts_recent	   = 0;
206		tp->rx_opt.ts_recent_stamp = 0;
207		if (likely(!tp->repair))
208			tp->write_seq	   = 0;
209	}
210
211	if (tcp_death_row.sysctl_tw_recycle &&
212	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
213		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
214		/*
215		 * VJ's idea. We save last timestamp seen from
216		 * the destination in peer table, when entering state
217		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
218		 * when trying new connection.
219		 */
220		if (peer) {
221			inet_peer_refcheck(peer);
222			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224				tp->rx_opt.ts_recent = peer->tcp_ts;
225			}
226		}
227	}
228
229	inet->inet_dport = usin->sin_port;
230	inet->inet_daddr = daddr;
231
232	inet_csk(sk)->icsk_ext_hdr_len = 0;
233	if (inet_opt)
234		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
235
236	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
237
238	/* Socket identity is still unknown (sport may be zero).
239	 * However we set the state to SYN-SENT and, without releasing the
240	 * socket lock, select a source port, enter ourselves into the hash
241	 * tables and complete initialization after this.
242	 */
243	tcp_set_state(sk, TCP_SYN_SENT);
244	err = inet_hash_connect(&tcp_death_row, sk);
245	if (err)
246		goto failure;
247
248	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
249			       inet->inet_sport, inet->inet_dport, sk);
250	if (IS_ERR(rt)) {
251		err = PTR_ERR(rt);
252		rt = NULL;
253		goto failure;
254	}
255	/* OK, now commit destination to socket.  */
256	sk->sk_gso_type = SKB_GSO_TCPV4;
257	sk_setup_caps(sk, &rt->dst);
258
259	if (!tp->write_seq && likely(!tp->repair))
260		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
261							   inet->inet_daddr,
262							   inet->inet_sport,
263							   usin->sin_port);
264
265	inet->inet_id = tp->write_seq ^ jiffies;
266
267	if (likely(!tp->repair))
268		err = tcp_connect(sk);
269	else
270		err = tcp_repair_connect(sk);
271
272	rt = NULL;
273	if (err)
274		goto failure;
275
276	return 0;
277
278failure:
279	/*
280	 * This unhashes the socket and releases the local port,
281	 * if necessary.
282	 */
283	tcp_set_state(sk, TCP_CLOSE);
284	ip_rt_put(rt);
285	sk->sk_route_caps = 0;
286	inet->inet_dport = 0;
287	return err;
288}
289EXPORT_SYMBOL(tcp_v4_connect);
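/* A minimal userspace sketch of the call path that ends up in
 * tcp_v4_connect() above (connect() -> inet_stream_connect() ->
 * sk->sk_prot->connect); shown as a comment since it is not kernel code,
 * and the address and port are illustrative only:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */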
290
291/*
292 * This routine does path mtu discovery as defined in RFC1191.
293 */
294static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
295{
296	struct dst_entry *dst;
297	struct inet_sock *inet = inet_sk(sk);
298
299	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300	 * sent out by Linux are always < 576 bytes, so they should go through
301	 * unfragmented).
302	 */
303	if (sk->sk_state == TCP_LISTEN)
304		return;
305
306	/* We don't check in the dst entry if pmtu discovery is forbidden
307	 * on this route. We just assume that no packet-too-big packets
308	 * are sent back when pmtu discovery is not active.
309	 * There is a small race when the user changes this flag in the
310	 * route, but I think that's acceptable.
311	 */
312	if ((dst = __sk_dst_check(sk, 0)) == NULL)
313		return;
314
315	dst->ops->update_pmtu(dst, mtu);
316
317	/* Something is about to go wrong... Remember the soft error
318	 * in case this connection is not able to recover.
319	 */
320	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321		sk->sk_err_soft = EMSGSIZE;
322
323	mtu = dst_mtu(dst);
324
325	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
326	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
327		tcp_sync_mss(sk, mtu);
328
329		/* Resend the TCP packet because it's
330		 * clear that the old packet has been
331		 * dropped. This is the new "fast" path mtu
332		 * discovery.
333		 */
334		tcp_simple_retransmit(sk);
335	} /* else let the usual retransmit timer handle it */
336}
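/* A worked example of the fast path mtu discovery above, with illustrative
 * numbers: if icsk_pmtu_cookie is 1500 and a fragmentation-needed ICMP
 * reports mtu = 1400, update_pmtu() lowers the route MTU, tcp_sync_mss()
 * clamps the MSS to roughly 1400 - 40 = 1360 bytes (IPv4 + TCP headers,
 * no options), and tcp_simple_retransmit() resends the queued data at the
 * smaller size instead of waiting for the retransmit timer.
 */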
337
338/*
339 * This routine is called by the ICMP module when it gets some
340 * sort of error condition.  If err < 0 then the socket should
341 * be closed and the error returned to the user.  If err > 0
342 * it's just the icmp type << 8 | icmp code.  After adjustment
343 * header points to the first 8 bytes of the tcp header.  We need
344 * to find the appropriate port.
345 *
346 * The locking strategy used here is very "optimistic". When
347 * someone else accesses the socket the ICMP is just dropped
348 * and for some paths there is no check at all.
349 * A more general error queue to queue errors for later handling
350 * is probably better.
351 *
352 */
353
354void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355{
356	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358	struct inet_connection_sock *icsk;
359	struct tcp_sock *tp;
360	struct inet_sock *inet;
361	const int type = icmp_hdr(icmp_skb)->type;
362	const int code = icmp_hdr(icmp_skb)->code;
363	struct sock *sk;
364	struct sk_buff *skb;
365	__u32 seq;
366	__u32 remaining;
367	int err;
368	struct net *net = dev_net(icmp_skb->dev);
369
370	if (icmp_skb->len < (iph->ihl << 2) + 8) {
371		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
372		return;
373	}
374
375	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
376			iph->saddr, th->source, inet_iif(icmp_skb));
377	if (!sk) {
378		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379		return;
380	}
381	if (sk->sk_state == TCP_TIME_WAIT) {
382		inet_twsk_put(inet_twsk(sk));
383		return;
384	}
385
386	bh_lock_sock(sk);
387	/* If too many ICMPs get dropped on busy
388	 * servers this needs to be solved differently.
389	 */
390	if (sock_owned_by_user(sk))
391		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
392
393	if (sk->sk_state == TCP_CLOSE)
394		goto out;
395
396	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
397		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
398		goto out;
399	}
400
401	icsk = inet_csk(sk);
402	tp = tcp_sk(sk);
403	seq = ntohl(th->seq);
404	if (sk->sk_state != TCP_LISTEN &&
405	    !between(seq, tp->snd_una, tp->snd_nxt)) {
406		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
407		goto out;
408	}
409
410	switch (type) {
411	case ICMP_SOURCE_QUENCH:
412		/* Just silently ignore these. */
413		goto out;
414	case ICMP_PARAMETERPROB:
415		err = EPROTO;
416		break;
417	case ICMP_DEST_UNREACH:
418		if (code > NR_ICMP_UNREACH)
419			goto out;
420
421		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
422			if (!sock_owned_by_user(sk))
423				do_pmtu_discovery(sk, iph, info);
424			goto out;
425		}
426
427		err = icmp_err_convert[code].errno;
428		/* check if icmp_skb allows revert of backoff
429		 * (see draft-zimmermann-tcp-lcd) */
430		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
431			break;
432		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
433		    !icsk->icsk_backoff)
434			break;
435
436		if (sock_owned_by_user(sk))
437			break;
438
439		icsk->icsk_backoff--;
440		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
441			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
442		tcp_bound_rto(sk);
443
444		skb = tcp_write_queue_head(sk);
445		BUG_ON(!skb);
446
447		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
448				tcp_time_stamp - TCP_SKB_CB(skb)->when);
449
450		if (remaining) {
451			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
452						  remaining, TCP_RTO_MAX);
453		} else {
454			/* RTO revert clocked out retransmission.
455			 * Will retransmit now */
456			tcp_retransmit_timer(sk);
457		}
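		/* A worked example of the RTO revert above, with
		 * illustrative numbers: if the un-backed-off RTO is 200 ms
		 * and icsk_backoff was 3, the pending timer was armed for
		 * 200 << 3 = 1600 ms.  After icsk_backoff-- the RTO becomes
		 * 200 << 2 = 800 ms; if 500 ms have already elapsed since
		 * the head skb was sent, the timer is re-armed for the
		 * remaining 300 ms, otherwise we retransmit immediately.
		 */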
458
459		break;
460	case ICMP_TIME_EXCEEDED:
461		err = EHOSTUNREACH;
462		break;
463	default:
464		goto out;
465	}
466
467	switch (sk->sk_state) {
468		struct request_sock *req, **prev;
469	case TCP_LISTEN:
470		if (sock_owned_by_user(sk))
471			goto out;
472
473		req = inet_csk_search_req(sk, &prev, th->dest,
474					  iph->daddr, iph->saddr);
475		if (!req)
476			goto out;
477
478		/* ICMPs are not backlogged, hence we cannot get
479		   an established socket here.
480		 */
481		WARN_ON(req->sk);
482
483		if (seq != tcp_rsk(req)->snt_isn) {
484			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
485			goto out;
486		}
487
488		/*
489		 * Still in SYN_RECV, just remove it silently.
490		 * There is no good way to pass the error to the newly
491		 * created socket, and POSIX does not want network
492		 * errors returned from accept().
493		 */
494		inet_csk_reqsk_queue_drop(sk, req, prev);
495		goto out;
496
497	case TCP_SYN_SENT:
498	case TCP_SYN_RECV:  /* Cannot happen normally;
499			       it can occur e.g. if SYNs crossed.
500			     */
501		if (!sock_owned_by_user(sk)) {
502			sk->sk_err = err;
503
504			sk->sk_error_report(sk);
505
506			tcp_done(sk);
507		} else {
508			sk->sk_err_soft = err;
509		}
510		goto out;
511	}
512
513	/* If we've already connected we will keep trying
514	 * until we time out, or the user gives up.
515	 *
516	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
517	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
518	 * but it is obsoleted by pmtu discovery).
519	 *
520	 * Note that in the modern internet, where routing is unreliable
521	 * and broken firewalls sit in every dark corner sending random
522	 * errors ordered by their masters, even these two messages finally
523	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
524	 *
525	 * Now we are in compliance with RFCs.
526	 *							--ANK (980905)
527	 */
528
529	inet = inet_sk(sk);
530	if (!sock_owned_by_user(sk) && inet->recverr) {
531		sk->sk_err = err;
532		sk->sk_error_report(sk);
533	} else	{ /* Only an error on timeout */
534		sk->sk_err_soft = err;
535	}
536
537out:
538	bh_unlock_sock(sk);
539	sock_put(sk);
540}
541
542static void __tcp_v4_send_check(struct sk_buff *skb,
543				__be32 saddr, __be32 daddr)
544{
545	struct tcphdr *th = tcp_hdr(skb);
546
547	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549		skb->csum_start = skb_transport_header(skb) - skb->head;
550		skb->csum_offset = offsetof(struct tcphdr, check);
551	} else {
552		th->check = tcp_v4_check(skb->len, saddr, daddr,
553					 csum_partial(th,
554						      th->doff << 2,
555						      skb->csum));
556	}
557}
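/* A brief illustration of the CHECKSUM_PARTIAL branch above, with an
 * illustrative size: for a 1480-byte TCP segment (header plus payload),
 * th->check is pre-loaded with the pseudo-header checksum over
 * (saddr, daddr, IPPROTO_TCP, 1480), csum_start points at the TCP header
 * and csum_offset is offsetof(struct tcphdr, check) = 16, so the device
 * (or skb_checksum_help) finishes the sum over the TCP header and payload
 * and stores the result at that offset.
 */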
558
559/* This routine computes an IPv4 TCP checksum. */
560void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561{
562	const struct inet_sock *inet = inet_sk(sk);
563
564	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565}
566EXPORT_SYMBOL(tcp_v4_send_check);
567
568int tcp_v4_gso_send_check(struct sk_buff *skb)
569{
570	const struct iphdr *iph;
571	struct tcphdr *th;
572
573	if (!pskb_may_pull(skb, sizeof(*th)))
574		return -EINVAL;
575
576	iph = ip_hdr(skb);
577	th = tcp_hdr(skb);
578
579	th->check = 0;
580	skb->ip_summed = CHECKSUM_PARTIAL;
581	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
582	return 0;
583}
584
585/*
586 *	This routine will send an RST to the other tcp.
587 *
588 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
589 *		      for the reset?
590 *	Answer: if a packet caused the RST, it is not for a socket
591 *		existing in our system; if it is matched to a socket,
592 *		it is just a duplicate segment or a bug in the other side's
593 *		TCP. So we build the reply based only on the parameters
594 *		that arrived with the segment.
595 *	Exception: precedence violation. We do not implement it in any case.
596 */
597
598static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
599{
600	const struct tcphdr *th = tcp_hdr(skb);
601	struct {
602		struct tcphdr th;
603#ifdef CONFIG_TCP_MD5SIG
604		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605#endif
606	} rep;
607	struct ip_reply_arg arg;
608#ifdef CONFIG_TCP_MD5SIG
609	struct tcp_md5sig_key *key;
610	const __u8 *hash_location = NULL;
611	unsigned char newhash[16];
612	int genhash;
613	struct sock *sk1 = NULL;
614#endif
615	struct net *net;
616
617	/* Never send a reset in response to a reset. */
618	if (th->rst)
619		return;
620
621	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
622		return;
623
624	/* Swap the send and the receive. */
625	memset(&rep, 0, sizeof(rep));
626	rep.th.dest   = th->source;
627	rep.th.source = th->dest;
628	rep.th.doff   = sizeof(struct tcphdr) / 4;
629	rep.th.rst    = 1;
630
631	if (th->ack) {
632		rep.th.seq = th->ack_seq;
633	} else {
634		rep.th.ack = 1;
635		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
636				       skb->len - (th->doff << 2));
637	}
638
639	memset(&arg, 0, sizeof(arg));
640	arg.iov[0].iov_base = (unsigned char *)&rep;
641	arg.iov[0].iov_len  = sizeof(rep.th);
642
643#ifdef CONFIG_TCP_MD5SIG
644	hash_location = tcp_parse_md5sig_option(th);
645	if (!sk && hash_location) {
646		/*
647		 * The active side is lost. Try to find the listening socket
648		 * through the source port, and then find the md5 key through
649		 * the listening socket. We do not lose security here:
650		 * the incoming packet is checked against the md5 hash of the
651		 * found key, and no RST is generated if the hash doesn't match.
652		 */
653		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
654					     &tcp_hashinfo, ip_hdr(skb)->daddr,
655					     ntohs(th->source), inet_iif(skb));
656		/* don't send a RST if we can't find a key */
657		if (!sk1)
658			return;
659		rcu_read_lock();
660		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
661					&ip_hdr(skb)->saddr, AF_INET);
662		if (!key)
663			goto release_sk1;
664
665		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
666		if (genhash || memcmp(hash_location, newhash, 16) != 0)
667			goto release_sk1;
668	} else {
669		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
670					     &ip_hdr(skb)->saddr,
671					     AF_INET) : NULL;
672	}
673
674	if (key) {
675		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
676				   (TCPOPT_NOP << 16) |
677				   (TCPOPT_MD5SIG << 8) |
678				   TCPOLEN_MD5SIG);
679		/* Update length and the length the header thinks exists */
680		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
681		rep.th.doff = arg.iov[0].iov_len / 4;
682
683		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
684				     key, ip_hdr(skb)->saddr,
685				     ip_hdr(skb)->daddr, &rep.th);
686	}
687#endif
688	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
689				      ip_hdr(skb)->saddr, /* XXX */
690				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
691	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
692	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
693	/* When the socket is gone, all binding information is lost and
694	 * routing might fail. Use iif for oif to make sure we can
695	 * deliver the reply.
696	 */
697	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
698
699	net = dev_net(skb_dst(skb)->dev);
700	arg.tos = ip_hdr(skb)->tos;
701	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
702			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
703
704	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
705	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
706
707#ifdef CONFIG_TCP_MD5SIG
708release_sk1:
709	if (sk1) {
710		rcu_read_unlock();
711		sock_put(sk1);
712	}
713#endif
714}
715
716/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
717   outside socket context, is certainly ugly. What can I do?
718 */
719
720static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
721			    u32 win, u32 ts, int oif,
722			    struct tcp_md5sig_key *key,
723			    int reply_flags, u8 tos)
724{
725	const struct tcphdr *th = tcp_hdr(skb);
726	struct {
727		struct tcphdr th;
728		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
729#ifdef CONFIG_TCP_MD5SIG
730			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
731#endif
732			];
733	} rep;
734	struct ip_reply_arg arg;
735	struct net *net = dev_net(skb_dst(skb)->dev);
736
737	memset(&rep.th, 0, sizeof(struct tcphdr));
738	memset(&arg, 0, sizeof(arg));
739
740	arg.iov[0].iov_base = (unsigned char *)&rep;
741	arg.iov[0].iov_len  = sizeof(rep.th);
742	if (ts) {
743		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
744				   (TCPOPT_TIMESTAMP << 8) |
745				   TCPOLEN_TIMESTAMP);
746		rep.opt[1] = htonl(tcp_time_stamp);
747		rep.opt[2] = htonl(ts);
748		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
749	}
750
751	/* Swap the send and the receive. */
752	rep.th.dest    = th->source;
753	rep.th.source  = th->dest;
754	rep.th.doff    = arg.iov[0].iov_len / 4;
755	rep.th.seq     = htonl(seq);
756	rep.th.ack_seq = htonl(ack);
757	rep.th.ack     = 1;
758	rep.th.window  = htons(win);
759
760#ifdef CONFIG_TCP_MD5SIG
761	if (key) {
762		int offset = (ts) ? 3 : 0;
763
764		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
765					  (TCPOPT_NOP << 16) |
766					  (TCPOPT_MD5SIG << 8) |
767					  TCPOLEN_MD5SIG);
768		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
769		rep.th.doff = arg.iov[0].iov_len/4;
770
771		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
772				    key, ip_hdr(skb)->saddr,
773				    ip_hdr(skb)->daddr, &rep.th);
774	}
775#endif
776	arg.flags = reply_flags;
777	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
778				      ip_hdr(skb)->saddr, /* XXX */
779				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
780	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
781	if (oif)
782		arg.bound_dev_if = oif;
783	arg.tos = tos;
784	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
785			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
786
787	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788}
789
790static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
791{
792	struct inet_timewait_sock *tw = inet_twsk(sk);
793	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
794
795	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797			tcptw->tw_ts_recent,
798			tw->tw_bound_dev_if,
799			tcp_twsk_md5_key(tcptw),
800			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
801			tw->tw_tos
802			);
803
804	inet_twsk_put(tw);
805}
806
807static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
808				  struct request_sock *req)
809{
810	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
811			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
812			req->ts_recent,
813			0,
814			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
815					  AF_INET),
816			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
817			ip_hdr(skb)->tos);
818}
819
820/*
821 *	Send a SYN-ACK after having received a SYN.
822 *	This still operates on a request_sock only, not on a big
823 *	socket.
824 */
825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
826			      struct request_sock *req,
827			      struct request_values *rvp,
828			      u16 queue_mapping,
829			      bool nocache)
830{
831	const struct inet_request_sock *ireq = inet_rsk(req);
832	struct flowi4 fl4;
833	int err = -1;
834	struct sk_buff * skb;
835
836	/* First, grab a route. */
837	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
838		return -1;
839
840	skb = tcp_make_synack(sk, dst, req, rvp);
841
842	if (skb) {
843		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
844
845		skb_set_queue_mapping(skb, queue_mapping);
846		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
847					    ireq->rmt_addr,
848					    ireq->opt);
849		err = net_xmit_eval(err);
850	}
851
852	return err;
853}
854
855static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
856			      struct request_values *rvp)
857{
858	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859	return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
860}
861
862/*
863 *	IPv4 request_sock destructor.
864 */
865static void tcp_v4_reqsk_destructor(struct request_sock *req)
866{
867	kfree(inet_rsk(req)->opt);
868}
869
870/*
871 * Return true if a syncookie should be sent
872 */
873bool tcp_syn_flood_action(struct sock *sk,
874			 const struct sk_buff *skb,
875			 const char *proto)
876{
877	const char *msg = "Dropping request";
878	bool want_cookie = false;
879	struct listen_sock *lopt;
880
881
882
883#ifdef CONFIG_SYN_COOKIES
884	if (sysctl_tcp_syncookies) {
885		msg = "Sending cookies";
886		want_cookie = true;
887		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
888	} else
889#endif
890		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
891
892	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
893	if (!lopt->synflood_warned) {
894		lopt->synflood_warned = 1;
895		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
896			proto, ntohs(tcp_hdr(skb)->dest), msg);
897	}
898	return want_cookie;
899}
900EXPORT_SYMBOL(tcp_syn_flood_action);
901
902/*
903 * Save and compile IPv4 options into the request_sock if needed.
904 */
905static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
906						  struct sk_buff *skb)
907{
908	const struct ip_options *opt = &(IPCB(skb)->opt);
909	struct ip_options_rcu *dopt = NULL;
910
911	if (opt && opt->optlen) {
912		int opt_size = sizeof(*dopt) + opt->optlen;
913
914		dopt = kmalloc(opt_size, GFP_ATOMIC);
915		if (dopt) {
916			if (ip_options_echo(&dopt->opt, skb)) {
917				kfree(dopt);
918				dopt = NULL;
919			}
920		}
921	}
922	return dopt;
923}
924
925#ifdef CONFIG_TCP_MD5SIG
926/*
927 * RFC2385 MD5 checksumming requires a mapping of
928 * IP address->MD5 Key.
929 * We need to maintain these in the sk structure.
930 */
931
932/* Find the Key structure for an address.  */
933struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
934					 const union tcp_md5_addr *addr,
935					 int family)
936{
937	struct tcp_sock *tp = tcp_sk(sk);
938	struct tcp_md5sig_key *key;
939	struct hlist_node *pos;
940	unsigned int size = sizeof(struct in_addr);
941	struct tcp_md5sig_info *md5sig;
942
943	/* caller either holds rcu_read_lock() or socket lock */
944	md5sig = rcu_dereference_check(tp->md5sig_info,
945				       sock_owned_by_user(sk) ||
946				       lockdep_is_held(&sk->sk_lock.slock));
947	if (!md5sig)
948		return NULL;
949#if IS_ENABLED(CONFIG_IPV6)
950	if (family == AF_INET6)
951		size = sizeof(struct in6_addr);
952#endif
953	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
954		if (key->family != family)
955			continue;
956		if (!memcmp(&key->addr, addr, size))
957			return key;
958	}
959	return NULL;
960}
961EXPORT_SYMBOL(tcp_md5_do_lookup);
962
963struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
964					 struct sock *addr_sk)
965{
966	union tcp_md5_addr *addr;
967
968	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
969	return tcp_md5_do_lookup(sk, addr, AF_INET);
970}
971EXPORT_SYMBOL(tcp_v4_md5_lookup);
972
973static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
974						      struct request_sock *req)
975{
976	union tcp_md5_addr *addr;
977
978	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
979	return tcp_md5_do_lookup(sk, addr, AF_INET);
980}
981
982/* This can be called on a newly created socket, from other files */
983int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
984		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
985{
986	/* Add Key to the list */
987	struct tcp_md5sig_key *key;
988	struct tcp_sock *tp = tcp_sk(sk);
989	struct tcp_md5sig_info *md5sig;
990
991	key = tcp_md5_do_lookup(sk, addr, family);
992	if (key) {
993		/* Pre-existing entry - just update that one. */
994		memcpy(key->key, newkey, newkeylen);
995		key->keylen = newkeylen;
996		return 0;
997	}
998
999	md5sig = rcu_dereference_protected(tp->md5sig_info,
1000					   sock_owned_by_user(sk));
1001	if (!md5sig) {
1002		md5sig = kmalloc(sizeof(*md5sig), gfp);
1003		if (!md5sig)
1004			return -ENOMEM;
1005
1006		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1007		INIT_HLIST_HEAD(&md5sig->head);
1008		rcu_assign_pointer(tp->md5sig_info, md5sig);
1009	}
1010
1011	key = sock_kmalloc(sk, sizeof(*key), gfp);
1012	if (!key)
1013		return -ENOMEM;
1014	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1015		sock_kfree_s(sk, key, sizeof(*key));
1016		return -ENOMEM;
1017	}
1018
1019	memcpy(key->key, newkey, newkeylen);
1020	key->keylen = newkeylen;
1021	key->family = family;
1022	memcpy(&key->addr, addr,
1023	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1024				      sizeof(struct in_addr));
1025	hlist_add_head_rcu(&key->node, &md5sig->head);
1026	return 0;
1027}
1028EXPORT_SYMBOL(tcp_md5_do_add);
1029
1030int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1031{
1032	struct tcp_sock *tp = tcp_sk(sk);
1033	struct tcp_md5sig_key *key;
1034	struct tcp_md5sig_info *md5sig;
1035
1036	key = tcp_md5_do_lookup(sk, addr, family);
1037	if (!key)
1038		return -ENOENT;
1039	hlist_del_rcu(&key->node);
1040	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1041	kfree_rcu(key, rcu);
1042	md5sig = rcu_dereference_protected(tp->md5sig_info,
1043					   sock_owned_by_user(sk));
1044	if (hlist_empty(&md5sig->head))
1045		tcp_free_md5sig_pool();
1046	return 0;
1047}
1048EXPORT_SYMBOL(tcp_md5_do_del);
1049
1050void tcp_clear_md5_list(struct sock *sk)
1051{
1052	struct tcp_sock *tp = tcp_sk(sk);
1053	struct tcp_md5sig_key *key;
1054	struct hlist_node *pos, *n;
1055	struct tcp_md5sig_info *md5sig;
1056
1057	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1058
1059	if (!hlist_empty(&md5sig->head))
1060		tcp_free_md5sig_pool();
1061	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1062		hlist_del_rcu(&key->node);
1063		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1064		kfree_rcu(key, rcu);
1065	}
1066}
1067
1068static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1069				 int optlen)
1070{
1071	struct tcp_md5sig cmd;
1072	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1073
1074	if (optlen < sizeof(cmd))
1075		return -EINVAL;
1076
1077	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1078		return -EFAULT;
1079
1080	if (sin->sin_family != AF_INET)
1081		return -EINVAL;
1082
1083	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1084		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1085				      AF_INET);
1086
1087	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1088		return -EINVAL;
1089
1090	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1091			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1092			      GFP_KERNEL);
1093}
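/* A minimal userspace sketch of how the TCP_MD5SIG handler above is
 * reached, assuming a kernel built with CONFIG_TCP_MD5SIG; shown as a
 * comment since it is not kernel code, and the peer address and key are
 * illustrative only:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */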
1094
1095static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1096					__be32 daddr, __be32 saddr, int nbytes)
1097{
1098	struct tcp4_pseudohdr *bp;
1099	struct scatterlist sg;
1100
1101	bp = &hp->md5_blk.ip4;
1102
1103	/*
1104	 * 1. the TCP pseudo-header (in the order: source IP address,
1105	 * destination IP address, zero-padded protocol number, and
1106	 * segment length)
1107	 */
1108	bp->saddr = saddr;
1109	bp->daddr = daddr;
1110	bp->pad = 0;
1111	bp->protocol = IPPROTO_TCP;
1112	bp->len = cpu_to_be16(nbytes);
1113
1114	sg_init_one(&sg, bp, sizeof(*bp));
1115	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1116}
1117
1118static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1119			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1120{
1121	struct tcp_md5sig_pool *hp;
1122	struct hash_desc *desc;
1123
1124	hp = tcp_get_md5sig_pool();
1125	if (!hp)
1126		goto clear_hash_noput;
1127	desc = &hp->md5_desc;
1128
1129	if (crypto_hash_init(desc))
1130		goto clear_hash;
1131	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1132		goto clear_hash;
1133	if (tcp_md5_hash_header(hp, th))
1134		goto clear_hash;
1135	if (tcp_md5_hash_key(hp, key))
1136		goto clear_hash;
1137	if (crypto_hash_final(desc, md5_hash))
1138		goto clear_hash;
1139
1140	tcp_put_md5sig_pool();
1141	return 0;
1142
1143clear_hash:
1144	tcp_put_md5sig_pool();
1145clear_hash_noput:
1146	memset(md5_hash, 0, 16);
1147	return 1;
1148}
1149
1150int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1151			const struct sock *sk, const struct request_sock *req,
1152			const struct sk_buff *skb)
1153{
1154	struct tcp_md5sig_pool *hp;
1155	struct hash_desc *desc;
1156	const struct tcphdr *th = tcp_hdr(skb);
1157	__be32 saddr, daddr;
1158
1159	if (sk) {
1160		saddr = inet_sk(sk)->inet_saddr;
1161		daddr = inet_sk(sk)->inet_daddr;
1162	} else if (req) {
1163		saddr = inet_rsk(req)->loc_addr;
1164		daddr = inet_rsk(req)->rmt_addr;
1165	} else {
1166		const struct iphdr *iph = ip_hdr(skb);
1167		saddr = iph->saddr;
1168		daddr = iph->daddr;
1169	}
1170
1171	hp = tcp_get_md5sig_pool();
1172	if (!hp)
1173		goto clear_hash_noput;
1174	desc = &hp->md5_desc;
1175
1176	if (crypto_hash_init(desc))
1177		goto clear_hash;
1178
1179	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1180		goto clear_hash;
1181	if (tcp_md5_hash_header(hp, th))
1182		goto clear_hash;
1183	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1184		goto clear_hash;
1185	if (tcp_md5_hash_key(hp, key))
1186		goto clear_hash;
1187	if (crypto_hash_final(desc, md5_hash))
1188		goto clear_hash;
1189
1190	tcp_put_md5sig_pool();
1191	return 0;
1192
1193clear_hash:
1194	tcp_put_md5sig_pool();
1195clear_hash_noput:
1196	memset(md5_hash, 0, 16);
1197	return 1;
1198}
1199EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1200
1201static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1202{
1203	/*
1204	 * This gets called for each TCP segment that arrives
1205	 * so we want to be efficient.
1206	 * We have 3 drop cases:
1207	 * o No MD5 hash and one expected.
1208	 * o MD5 hash and we're not expecting one.
1209	 * o MD5 hash and it's wrong.
1210	 */
1211	const __u8 *hash_location = NULL;
1212	struct tcp_md5sig_key *hash_expected;
1213	const struct iphdr *iph = ip_hdr(skb);
1214	const struct tcphdr *th = tcp_hdr(skb);
1215	int genhash;
1216	unsigned char newhash[16];
1217
1218	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1219					  AF_INET);
1220	hash_location = tcp_parse_md5sig_option(th);
1221
1222	/* We've parsed the options - do we have a hash? */
1223	if (!hash_expected && !hash_location)
1224		return false;
1225
1226	if (hash_expected && !hash_location) {
1227		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1228		return true;
1229	}
1230
1231	if (!hash_expected && hash_location) {
1232		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1233		return true;
1234	}
1235
1236	/* Okay, so this is hash_expected and hash_location -
1237	 * so we need to calculate the checksum.
1238	 */
1239	genhash = tcp_v4_md5_hash_skb(newhash,
1240				      hash_expected,
1241				      NULL, NULL, skb);
1242
1243	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1244		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1245				     &iph->saddr, ntohs(th->source),
1246				     &iph->daddr, ntohs(th->dest),
1247				     genhash ? " tcp_v4_calc_md5_hash failed"
1248				     : "");
1249		return true;
1250	}
1251	return false;
1252}
1253
1254#endif
1255
1256struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1257	.family		=	PF_INET,
1258	.obj_size	=	sizeof(struct tcp_request_sock),
1259	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1260	.send_ack	=	tcp_v4_reqsk_send_ack,
1261	.destructor	=	tcp_v4_reqsk_destructor,
1262	.send_reset	=	tcp_v4_send_reset,
1263	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1264};
1265
1266#ifdef CONFIG_TCP_MD5SIG
1267static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1268	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1269	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1270};
1271#endif
1272
1273int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1274{
1275	struct tcp_extend_values tmp_ext;
1276	struct tcp_options_received tmp_opt;
1277	const u8 *hash_location;
1278	struct request_sock *req;
1279	struct inet_request_sock *ireq;
1280	struct tcp_sock *tp = tcp_sk(sk);
1281	struct dst_entry *dst = NULL;
1282	__be32 saddr = ip_hdr(skb)->saddr;
1283	__be32 daddr = ip_hdr(skb)->daddr;
1284	__u32 isn = TCP_SKB_CB(skb)->when;
1285	bool want_cookie = false;
1286
1287	/* Never answer SYNs sent to broadcast or multicast */
1288	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1289		goto drop;
1290
1291	/* TW buckets are converted to open requests without
1292	 * limitation; they conserve resources and the peer is
1293	 * evidently a real one.
1294	 */
1295	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1296		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1297		if (!want_cookie)
1298			goto drop;
1299	}
1300
1301	/* The accept backlog is full. If we have already queued enough
1302	 * warm entries in the syn queue, drop the request. That is better
1303	 * than clogging the syn queue with openreqs with exponentially
1304	 * increasing timeouts.
1305	 */
1306	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1307		goto drop;
1308
1309	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1310	if (!req)
1311		goto drop;
1312
1313#ifdef CONFIG_TCP_MD5SIG
1314	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1315#endif
1316
1317	tcp_clear_options(&tmp_opt);
1318	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1319	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1320	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1321
1322	if (tmp_opt.cookie_plus > 0 &&
1323	    tmp_opt.saw_tstamp &&
1324	    !tp->rx_opt.cookie_out_never &&
1325	    (sysctl_tcp_cookie_size > 0 ||
1326	     (tp->cookie_values != NULL &&
1327	      tp->cookie_values->cookie_desired > 0))) {
1328		u8 *c;
1329		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1330		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1331
1332		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1333			goto drop_and_release;
1334
1335		/* Secret recipe starts with IP addresses */
1336		*mess++ ^= (__force u32)daddr;
1337		*mess++ ^= (__force u32)saddr;
1338
1339		/* plus variable length Initiator Cookie */
1340		c = (u8 *)mess;
1341		while (l-- > 0)
1342			*c++ ^= *hash_location++;
1343
1344		want_cookie = false;	/* not our kind of cookie */
1345		tmp_ext.cookie_out_never = 0; /* false */
1346		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1347	} else if (!tp->rx_opt.cookie_in_always) {
1348		/* redundant indications, but ensure initialization. */
1349		tmp_ext.cookie_out_never = 1; /* true */
1350		tmp_ext.cookie_plus = 0;
1351	} else {
1352		goto drop_and_release;
1353	}
1354	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1355
1356	if (want_cookie && !tmp_opt.saw_tstamp)
1357		tcp_clear_options(&tmp_opt);
1358
1359	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1360	tcp_openreq_init(req, &tmp_opt, skb);
1361
1362	ireq = inet_rsk(req);
1363	ireq->loc_addr = daddr;
1364	ireq->rmt_addr = saddr;
1365	ireq->no_srccheck = inet_sk(sk)->transparent;
1366	ireq->opt = tcp_v4_save_options(sk, skb);
1367
1368	if (security_inet_conn_request(sk, skb, req))
1369		goto drop_and_free;
1370
1371	if (!want_cookie || tmp_opt.tstamp_ok)
1372		TCP_ECN_create_request(req, skb);
1373
1374	if (want_cookie) {
1375		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376		req->cookie_ts = tmp_opt.tstamp_ok;
1377	} else if (!isn) {
1378		struct inet_peer *peer = NULL;
1379		struct flowi4 fl4;
1380
1381		/* VJ's idea. We save the last timestamp seen
1382		 * from the destination in the peer table when entering
1383		 * TIME-WAIT state, and check against it before
1384		 * accepting a new connection request.
1385		 *
1386		 * If "isn" is not zero, this request hit an alive
1387		 * timewait bucket, so all the necessary checks
1388		 * are made in the function processing the timewait state.
1389		 */
1390		if (tmp_opt.saw_tstamp &&
1391		    tcp_death_row.sysctl_tw_recycle &&
1392		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
1393		    fl4.daddr == saddr &&
1394		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1395			inet_peer_refcheck(peer);
1396			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1397			    (s32)(peer->tcp_ts - req->ts_recent) >
1398							TCP_PAWS_WINDOW) {
1399				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1400				goto drop_and_release;
1401			}
1402		}
1403		/* Kill the following clause, if you dislike this way. */
1404		else if (!sysctl_tcp_syncookies &&
1405			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1406			  (sysctl_max_syn_backlog >> 2)) &&
1407			 (!peer || !peer->tcp_ts_stamp) &&
1408			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1409			/* Without syncookies the last quarter of the
1410			 * backlog is filled with destinations
1411			 * proven to be alive.
1412			 * It means that we continue to communicate with
1413			 * destinations that were already remembered
1414			 * at the moment of the synflood.
1415			 */
1416			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1417				       &saddr, ntohs(tcp_hdr(skb)->source));
1418			goto drop_and_release;
1419		}
1420
1421		isn = tcp_v4_init_sequence(skb);
1422	}
1423	tcp_rsk(req)->snt_isn = isn;
1424	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1425
1426	if (tcp_v4_send_synack(sk, dst, req,
1427			       (struct request_values *)&tmp_ext,
1428			       skb_get_queue_mapping(skb),
1429			       want_cookie) ||
1430	    want_cookie)
1431		goto drop_and_free;
1432
1433	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1434	return 0;
1435
1436drop_and_release:
1437	dst_release(dst);
1438drop_and_free:
1439	reqsk_free(req);
1440drop:
1441	return 0;
1442}
1443EXPORT_SYMBOL(tcp_v4_conn_request);
1444
1445
1446/*
1447 * The three way handshake has completed - we got a valid synack -
1448 * now create the new socket.
1449 */
1450struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1451				  struct request_sock *req,
1452				  struct dst_entry *dst)
1453{
1454	struct inet_request_sock *ireq;
1455	struct inet_sock *newinet;
1456	struct tcp_sock *newtp;
1457	struct sock *newsk;
1458#ifdef CONFIG_TCP_MD5SIG
1459	struct tcp_md5sig_key *key;
1460#endif
1461	struct ip_options_rcu *inet_opt;
1462
1463	if (sk_acceptq_is_full(sk))
1464		goto exit_overflow;
1465
1466	newsk = tcp_create_openreq_child(sk, req, skb);
1467	if (!newsk)
1468		goto exit_nonewsk;
1469
1470	newsk->sk_gso_type = SKB_GSO_TCPV4;
1471
1472	newtp		      = tcp_sk(newsk);
1473	newinet		      = inet_sk(newsk);
1474	ireq		      = inet_rsk(req);
1475	newinet->inet_daddr   = ireq->rmt_addr;
1476	newinet->inet_rcv_saddr = ireq->loc_addr;
1477	newinet->inet_saddr	      = ireq->loc_addr;
1478	inet_opt	      = ireq->opt;
1479	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1480	ireq->opt	      = NULL;
1481	newinet->mc_index     = inet_iif(skb);
1482	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1483	newinet->rcv_tos      = ip_hdr(skb)->tos;
1484	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1485	if (inet_opt)
1486		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1487	newinet->inet_id = newtp->write_seq ^ jiffies;
1488
1489	if (!dst) {
1490		dst = inet_csk_route_child_sock(sk, newsk, req);
1491		if (!dst)
1492			goto put_and_exit;
1493	} else {
1494		/* syncookie case : see end of cookie_v4_check() */
1495	}
1496	sk_setup_caps(newsk, dst);
1497
1498	tcp_mtup_init(newsk);
1499	tcp_sync_mss(newsk, dst_mtu(dst));
1500	newtp->advmss = dst_metric_advmss(dst);
1501	if (tcp_sk(sk)->rx_opt.user_mss &&
1502	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1503		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1504
1505	tcp_initialize_rcv_mss(newsk);
1506	if (tcp_rsk(req)->snt_synack)
1507		tcp_valid_rtt_meas(newsk,
1508		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1509	newtp->total_retrans = req->retrans;
1510
1511#ifdef CONFIG_TCP_MD5SIG
1512	/* Copy over the MD5 key from the original socket */
1513	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1514				AF_INET);
1515	if (key != NULL) {
1516		/*
1517		 * We're using one, so create a matching key
1518		 * on the newsk structure. If we fail to get
1519		 * memory, then we end up not copying the key
1520		 * across. Shucks.
1521		 */
1522		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1523			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1524		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1525	}
1526#endif
1527
1528	if (__inet_inherit_port(sk, newsk) < 0)
1529		goto put_and_exit;
1530	__inet_hash_nolisten(newsk, NULL);
1531
1532	return newsk;
1533
1534exit_overflow:
1535	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1536exit_nonewsk:
1537	dst_release(dst);
1538exit:
1539	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1540	return NULL;
1541put_and_exit:
1542	tcp_clear_xmit_timers(newsk);
1543	tcp_cleanup_congestion_control(newsk);
1544	bh_unlock_sock(newsk);
1545	sock_put(newsk);
1546	goto exit;
1547}
1548EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1549
1550static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1551{
1552	struct tcphdr *th = tcp_hdr(skb);
1553	const struct iphdr *iph = ip_hdr(skb);
1554	struct sock *nsk;
1555	struct request_sock **prev;
1556	/* Find possible connection requests. */
1557	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1558						       iph->saddr, iph->daddr);
1559	if (req)
1560		return tcp_check_req(sk, skb, req, prev);
1561
1562	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1563			th->source, iph->daddr, th->dest, inet_iif(skb));
1564
1565	if (nsk) {
1566		if (nsk->sk_state != TCP_TIME_WAIT) {
1567			bh_lock_sock(nsk);
1568			return nsk;
1569		}
1570		inet_twsk_put(inet_twsk(nsk));
1571		return NULL;
1572	}
1573
1574#ifdef CONFIG_SYN_COOKIES
1575	if (!th->syn)
1576		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1577#endif
1578	return sk;
1579}
1580
1581static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1582{
1583	const struct iphdr *iph = ip_hdr(skb);
1584
1585	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1586		if (!tcp_v4_check(skb->len, iph->saddr,
1587				  iph->daddr, skb->csum)) {
1588			skb->ip_summed = CHECKSUM_UNNECESSARY;
1589			return 0;
1590		}
1591	}
1592
1593	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1594				       skb->len, IPPROTO_TCP, 0);
1595
1596	if (skb->len <= 76) {
1597		return __skb_checksum_complete(skb);
1598	}
1599	return 0;
1600}
1601
1602
1603/* The socket must have its spinlock held when we get
1604 * here.
1605 *
1606 * We have a potential double-lock case here, so even when
1607 * doing backlog processing we use the BH locking scheme.
1608 * This is because we cannot sleep with the original spinlock
1609 * held.
1610 */
1611int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1612{
1613	struct sock *rsk;
1614#ifdef CONFIG_TCP_MD5SIG
1615	/*
1616	 * We really want to reject the packet as early as possible
1617	 * if:
1618	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1619	 *  o There is an MD5 option and we're not expecting one
1620	 */
1621	if (tcp_v4_inbound_md5_hash(sk, skb))
1622		goto discard;
1623#endif
1624
1625	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1626		sock_rps_save_rxhash(sk, skb);
1627		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1628			rsk = sk;
1629			goto reset;
1630		}
1631		return 0;
1632	}
1633
1634	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1635		goto csum_err;
1636
1637	if (sk->sk_state == TCP_LISTEN) {
1638		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1639		if (!nsk)
1640			goto discard;
1641
1642		if (nsk != sk) {
1643			sock_rps_save_rxhash(nsk, skb);
1644			if (tcp_child_process(sk, nsk, skb)) {
1645				rsk = nsk;
1646				goto reset;
1647			}
1648			return 0;
1649		}
1650	} else
1651		sock_rps_save_rxhash(sk, skb);
1652
1653	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1654		rsk = sk;
1655		goto reset;
1656	}
1657	return 0;
1658
1659reset:
1660	tcp_v4_send_reset(rsk, skb);
1661discard:
1662	kfree_skb(skb);
1663	/* Be careful here. If this function gets more complicated and
1664	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1665	 * might be destroyed here. This current version compiles correctly,
1666	 * but you have been warned.
1667	 */
1668	return 0;
1669
1670csum_err:
1671	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1672	goto discard;
1673}
1674EXPORT_SYMBOL(tcp_v4_do_rcv);
1675
1676void tcp_v4_early_demux(struct sk_buff *skb)
1677{
1678	struct net *net = dev_net(skb->dev);
1679	const struct iphdr *iph;
1680	const struct tcphdr *th;
1681	struct net_device *dev;
1682	struct sock *sk;
1683
1684	if (skb->pkt_type != PACKET_HOST)
1685		return;
1686
1687	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1688		return;
1689
1690	iph = ip_hdr(skb);
1691	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1692
1693	if (th->doff < sizeof(struct tcphdr) / 4)
1694		return;
1695
1696	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1697		return;
1698
1699	dev = skb->dev;
1700	sk = __inet_lookup_established(net, &tcp_hashinfo,
1701				       iph->saddr, th->source,
1702				       iph->daddr, ntohs(th->dest),
1703				       dev->ifindex);
1704	if (sk) {
1705		skb->sk = sk;
1706		skb->destructor = sock_edemux;
1707		if (sk->sk_state != TCP_TIME_WAIT) {
1708			struct dst_entry *dst = sk->sk_rx_dst;
1709			if (dst)
1710				dst = dst_check(dst, 0);
1711			if (dst) {
1712				struct rtable *rt = (struct rtable *) dst;
1713
1714				if (rt->rt_iif == dev->ifindex)
1715					skb_dst_set_noref(skb, dst);
1716			}
1717		}
1718	}
1719}
1720
1721/*
1722 *	From tcp_input.c
1723 */
1724
1725int tcp_v4_rcv(struct sk_buff *skb)
1726{
1727	const struct iphdr *iph;
1728	const struct tcphdr *th;
1729	struct sock *sk;
1730	int ret;
1731	struct net *net = dev_net(skb->dev);
1732
1733	if (skb->pkt_type != PACKET_HOST)
1734		goto discard_it;
1735
1736	/* Count it even if it's bad */
1737	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1738
1739	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1740		goto discard_it;
1741
1742	th = tcp_hdr(skb);
1743
1744	if (th->doff < sizeof(struct tcphdr) / 4)
1745		goto bad_packet;
1746	if (!pskb_may_pull(skb, th->doff * 4))
1747		goto discard_it;
1748
1749	/* An explanation is required here, I think.
1750	 * Packet length and doff are validated by header prediction,
1751	 * provided the case of th->doff==0 is eliminated.
1752	 * So, we defer the checks. */
1753	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1754		goto bad_packet;
1755
1756	th = tcp_hdr(skb);
1757	iph = ip_hdr(skb);
1758	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1759	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1760				    skb->len - th->doff * 4);
1761	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1762	TCP_SKB_CB(skb)->when	 = 0;
1763	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1764	TCP_SKB_CB(skb)->sacked	 = 0;
1765
1766	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1767	if (!sk)
1768		goto no_tcp_socket;
1769
1770process:
1771	if (sk->sk_state == TCP_TIME_WAIT)
1772		goto do_time_wait;
1773
1774	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1775		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1776		goto discard_and_relse;
1777	}
1778
1779	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1780		goto discard_and_relse;
1781	nf_reset(skb);
1782
1783	if (sk_filter(sk, skb))
1784		goto discard_and_relse;
1785
1786	skb->dev = NULL;
1787
1788	bh_lock_sock_nested(sk);
1789	ret = 0;
1790	if (!sock_owned_by_user(sk)) {
1791#ifdef CONFIG_NET_DMA
1792		struct tcp_sock *tp = tcp_sk(sk);
1793		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1794			tp->ucopy.dma_chan = net_dma_find_channel();
1795		if (tp->ucopy.dma_chan)
1796			ret = tcp_v4_do_rcv(sk, skb);
1797		else
1798#endif
1799		{
1800			if (!tcp_prequeue(sk, skb))
1801				ret = tcp_v4_do_rcv(sk, skb);
1802		}
1803	} else if (unlikely(sk_add_backlog(sk, skb,
1804					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1805		bh_unlock_sock(sk);
1806		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1807		goto discard_and_relse;
1808	}
1809	bh_unlock_sock(sk);
1810
1811	sock_put(sk);
1812
1813	return ret;
1814
1815no_tcp_socket:
1816	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1817		goto discard_it;
1818
1819	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1820bad_packet:
1821		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1822	} else {
1823		tcp_v4_send_reset(NULL, skb);
1824	}
1825
1826discard_it:
1827	/* Discard frame. */
1828	kfree_skb(skb);
1829	return 0;
1830
1831discard_and_relse:
1832	sock_put(sk);
1833	goto discard_it;
1834
1835do_time_wait:
1836	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1837		inet_twsk_put(inet_twsk(sk));
1838		goto discard_it;
1839	}
1840
1841	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1842		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1843		inet_twsk_put(inet_twsk(sk));
1844		goto discard_it;
1845	}
1846	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1847	case TCP_TW_SYN: {
1848		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1849							&tcp_hashinfo,
1850							iph->daddr, th->dest,
1851							inet_iif(skb));
1852		if (sk2) {
1853			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1854			inet_twsk_put(inet_twsk(sk));
1855			sk = sk2;
1856			goto process;
1857		}
1858		/* Fall through to ACK */
1859	}
1860	case TCP_TW_ACK:
1861		tcp_v4_timewait_ack(sk, skb);
1862		break;
1863	case TCP_TW_RST:
1864		goto no_tcp_socket;
1865	case TCP_TW_SUCCESS:;
1866	}
1867	goto discard_it;
1868}
1869
1870struct inet_peer *tcp_v4_get_peer(struct sock *sk)
1871{
1872	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1873	struct inet_sock *inet = inet_sk(sk);
1874
1875	/* If we don't have a valid cached route, or we're doing IP
1876	 * options which make the IPv4 header destination address
1877	 * different from our peer's, do not bother with this.
1878	 */
1879	if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
1880		return NULL;
1881	return rt_get_peer_create(rt, inet->inet_daddr);
1882}
1883EXPORT_SYMBOL(tcp_v4_get_peer);
1884
1885static struct timewait_sock_ops tcp_timewait_sock_ops = {
1886	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1887	.twsk_unique	= tcp_twsk_unique,
1888	.twsk_destructor= tcp_twsk_destructor,
1889};
1890
1891const struct inet_connection_sock_af_ops ipv4_specific = {
1892	.queue_xmit	   = ip_queue_xmit,
1893	.send_check	   = tcp_v4_send_check,
1894	.rebuild_header	   = inet_sk_rebuild_header,
1895	.conn_request	   = tcp_v4_conn_request,
1896	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1897	.get_peer	   = tcp_v4_get_peer,
1898	.net_header_len	   = sizeof(struct iphdr),
1899	.setsockopt	   = ip_setsockopt,
1900	.getsockopt	   = ip_getsockopt,
1901	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1902	.sockaddr_len	   = sizeof(struct sockaddr_in),
1903	.bind_conflict	   = inet_csk_bind_conflict,
1904#ifdef CONFIG_COMPAT
1905	.compat_setsockopt = compat_ip_setsockopt,
1906	.compat_getsockopt = compat_ip_getsockopt,
1907#endif
1908};
1909EXPORT_SYMBOL(ipv4_specific);
1910
1911#ifdef CONFIG_TCP_MD5SIG
1912static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1913	.md5_lookup		= tcp_v4_md5_lookup,
1914	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1915	.md5_parse		= tcp_v4_parse_md5_keys,
1916};
1917#endif
1918
1919/* NOTE: A lot of things are set to zero explicitly by the call to
1920 *       sk_alloc(), so they need not be done here.
1921 */
1922static int tcp_v4_init_sock(struct sock *sk)
1923{
1924	struct inet_connection_sock *icsk = inet_csk(sk);
1925
1926	tcp_init_sock(sk);
1927
1928	icsk->icsk_af_ops = &ipv4_specific;
1929
1930#ifdef CONFIG_TCP_MD5SIG
1931	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1932#endif
1933
1934	return 0;
1935}
1936
1937void tcp_v4_destroy_sock(struct sock *sk)
1938{
1939	struct tcp_sock *tp = tcp_sk(sk);
1940
1941	tcp_clear_xmit_timers(sk);
1942
1943	tcp_cleanup_congestion_control(sk);
1944
1945	/* Cleanup up the write buffer. */
1946	/* Clean up the write buffer. */
1947
1948	/* Cleans up our, hopefully empty, out_of_order_queue. */
1949	__skb_queue_purge(&tp->out_of_order_queue);
1950
1951#ifdef CONFIG_TCP_MD5SIG
1952	/* Clean up the MD5 key list, if any */
1953	if (tp->md5sig_info) {
1954		tcp_clear_md5_list(sk);
1955		kfree_rcu(tp->md5sig_info, rcu);
1956		tp->md5sig_info = NULL;
1957	}
1958#endif
1959
1960#ifdef CONFIG_NET_DMA
1961	/* Cleans up our sk_async_wait_queue */
1962	__skb_queue_purge(&sk->sk_async_wait_queue);
1963#endif
1964
1965	/* Clean up the prequeue; it really should already be empty. */
1966	__skb_queue_purge(&tp->ucopy.prequeue);
1967
1968	/* Clean up a referenced TCP bind bucket. */
1969	if (inet_csk(sk)->icsk_bind_hash)
1970		inet_put_port(sk);
1971
1972	/*
1973	 * If a cached sendmsg page exists, free it.
1974	 */
1975	if (sk->sk_sndmsg_page) {
1976		__free_page(sk->sk_sndmsg_page);
1977		sk->sk_sndmsg_page = NULL;
1978	}
1979
1980	/* TCP Cookie Transactions */
1981	if (tp->cookie_values != NULL) {
1982		kref_put(&tp->cookie_values->kref,
1983			 tcp_cookie_values_release);
1984		tp->cookie_values = NULL;
1985	}
1986
1987	sk_sockets_allocated_dec(sk);
1988	sock_release_memcg(sk);
1989}
1990EXPORT_SYMBOL(tcp_v4_destroy_sock);
1991
1992#ifdef CONFIG_PROC_FS
1993/* Proc filesystem TCP sock list dumping. */
1994
1995static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1996{
1997	return hlist_nulls_empty(head) ? NULL :
1998		list_entry(head->first, struct inet_timewait_sock, tw_node);
1999}
2000
2001static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2002{
2003	return !is_a_nulls(tw->tw_node.next) ?
2004		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2005}
2006
2007/*
2008 * Get the next listener socket following cur.  If cur is NULL, get the
2009 * first socket starting from the bucket given in st->bucket; when
2010 * st->bucket is zero, the very first socket in the hash table is returned.
2011 */
2012static void *listening_get_next(struct seq_file *seq, void *cur)
2013{
2014	struct inet_connection_sock *icsk;
2015	struct hlist_nulls_node *node;
2016	struct sock *sk = cur;
2017	struct inet_listen_hashbucket *ilb;
2018	struct tcp_iter_state *st = seq->private;
2019	struct net *net = seq_file_net(seq);
2020
2021	if (!sk) {
2022		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2023		spin_lock_bh(&ilb->lock);
2024		sk = sk_nulls_head(&ilb->head);
2025		st->offset = 0;
2026		goto get_sk;
2027	}
2028	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2029	++st->num;
2030	++st->offset;
2031
2032	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2033		struct request_sock *req = cur;
2034
2035		icsk = inet_csk(st->syn_wait_sk);
2036		req = req->dl_next;
2037		while (1) {
2038			while (req) {
2039				if (req->rsk_ops->family == st->family) {
2040					cur = req;
2041					goto out;
2042				}
2043				req = req->dl_next;
2044			}
2045			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2046				break;
2047get_req:
2048			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2049		}
2050		sk	  = sk_nulls_next(st->syn_wait_sk);
2051		st->state = TCP_SEQ_STATE_LISTENING;
2052		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2053	} else {
2054		icsk = inet_csk(sk);
2055		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2056		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2057			goto start_req;
2058		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2059		sk = sk_nulls_next(sk);
2060	}
2061get_sk:
2062	sk_nulls_for_each_from(sk, node) {
2063		if (!net_eq(sock_net(sk), net))
2064			continue;
2065		if (sk->sk_family == st->family) {
2066			cur = sk;
2067			goto out;
2068		}
2069		icsk = inet_csk(sk);
2070		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2071		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2072start_req:
2073			st->uid		= sock_i_uid(sk);
2074			st->syn_wait_sk = sk;
2075			st->state	= TCP_SEQ_STATE_OPENREQ;
2076			st->sbucket	= 0;
2077			goto get_req;
2078		}
2079		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2080	}
2081	spin_unlock_bh(&ilb->lock);
2082	st->offset = 0;
2083	if (++st->bucket < INET_LHTABLE_SIZE) {
2084		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2085		spin_lock_bh(&ilb->lock);
2086		sk = sk_nulls_head(&ilb->head);
2087		goto get_sk;
2088	}
2089	cur = NULL;
2090out:
2091	return cur;
2092}
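
/*
 * The walk above is two-level: it scans the listening hash buckets under
 * ilb->lock and, for each listener with pending open requests, switches
 * to TCP_SEQ_STATE_OPENREQ and walks that listener's syn_table under
 * syn_wait_lock before moving on to the next listening socket.
 * st->offset records how far into the current bucket we are, which
 * tcp_seek_last_pos() uses to resume an interrupted /proc read.
 */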
2093
2094static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2095{
2096	struct tcp_iter_state *st = seq->private;
2097	void *rc;
2098
2099	st->bucket = 0;
2100	st->offset = 0;
2101	rc = listening_get_next(seq, NULL);
2102
2103	while (rc && *pos) {
2104		rc = listening_get_next(seq, rc);
2105		--*pos;
2106	}
2107	return rc;
2108}
2109
2110static inline bool empty_bucket(struct tcp_iter_state *st)
2111{
2112	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2113		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2114}
2115
2116/*
2117 * Get first established socket starting from bucket given in st->bucket.
2118 * If st->bucket is zero, the very first socket in the hash is returned.
2119 */
2120static void *established_get_first(struct seq_file *seq)
2121{
2122	struct tcp_iter_state *st = seq->private;
2123	struct net *net = seq_file_net(seq);
2124	void *rc = NULL;
2125
2126	st->offset = 0;
2127	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2128		struct sock *sk;
2129		struct hlist_nulls_node *node;
2130		struct inet_timewait_sock *tw;
2131		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2132
2133		/* Lockless fast path for the common case of empty buckets */
2134		if (empty_bucket(st))
2135			continue;
2136
2137		spin_lock_bh(lock);
2138		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2139			if (sk->sk_family != st->family ||
2140			    !net_eq(sock_net(sk), net)) {
2141				continue;
2142			}
2143			rc = sk;
2144			goto out;
2145		}
2146		st->state = TCP_SEQ_STATE_TIME_WAIT;
2147		inet_twsk_for_each(tw, node,
2148				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2149			if (tw->tw_family != st->family ||
2150			    !net_eq(twsk_net(tw), net)) {
2151				continue;
2152			}
2153			rc = tw;
2154			goto out;
2155		}
2156		spin_unlock_bh(lock);
2157		st->state = TCP_SEQ_STATE_ESTABLISHED;
2158	}
2159out:
2160	return rc;
2161}
2162
2163static void *established_get_next(struct seq_file *seq, void *cur)
2164{
2165	struct sock *sk = cur;
2166	struct inet_timewait_sock *tw;
2167	struct hlist_nulls_node *node;
2168	struct tcp_iter_state *st = seq->private;
2169	struct net *net = seq_file_net(seq);
2170
2171	++st->num;
2172	++st->offset;
2173
2174	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2175		tw = cur;
2176		tw = tw_next(tw);
2177get_tw:
2178		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2179			tw = tw_next(tw);
2180		}
2181		if (tw) {
2182			cur = tw;
2183			goto out;
2184		}
2185		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2186		st->state = TCP_SEQ_STATE_ESTABLISHED;
2187
2188		/* Look for the next non-empty bucket */
2189		st->offset = 0;
2190		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2191				empty_bucket(st))
2192			;
2193		if (st->bucket > tcp_hashinfo.ehash_mask)
2194			return NULL;
2195
2196		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2197		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2198	} else
2199		sk = sk_nulls_next(sk);
2200
2201	sk_nulls_for_each_from(sk, node) {
2202		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2203			goto found;
2204	}
2205
2206	st->state = TCP_SEQ_STATE_TIME_WAIT;
2207	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2208	goto get_tw;
2209found:
2210	cur = sk;
2211out:
2212	return cur;
2213}
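
/*
 * established_get_first()/established_get_next() walk each ehash bucket
 * under its per-bucket lock: first the regular chain, then the twchain of
 * TIME_WAIT sockets, flipping st->state between TCP_SEQ_STATE_ESTABLISHED
 * and TCP_SEQ_STATE_TIME_WAIT so the next call knows whether cur is a
 * full socket or a timewait socket.  Empty buckets are skipped without
 * taking the lock at all.
 */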
2214
2215static void *established_get_idx(struct seq_file *seq, loff_t pos)
2216{
2217	struct tcp_iter_state *st = seq->private;
2218	void *rc;
2219
2220	st->bucket = 0;
2221	rc = established_get_first(seq);
2222
2223	while (rc && pos) {
2224		rc = established_get_next(seq, rc);
2225		--pos;
2226	}
2227	return rc;
2228}
2229
2230static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2231{
2232	void *rc;
2233	struct tcp_iter_state *st = seq->private;
2234
2235	st->state = TCP_SEQ_STATE_LISTENING;
2236	rc	  = listening_get_idx(seq, &pos);
2237
2238	if (!rc) {
2239		st->state = TCP_SEQ_STATE_ESTABLISHED;
2240		rc	  = established_get_idx(seq, pos);
2241	}
2242
2243	return rc;
2244}
2245
2246static void *tcp_seek_last_pos(struct seq_file *seq)
2247{
2248	struct tcp_iter_state *st = seq->private;
2249	int offset = st->offset;
2250	int orig_num = st->num;
2251	void *rc = NULL;
2252
2253	switch (st->state) {
2254	case TCP_SEQ_STATE_OPENREQ:
2255	case TCP_SEQ_STATE_LISTENING:
2256		if (st->bucket >= INET_LHTABLE_SIZE)
2257			break;
2258		st->state = TCP_SEQ_STATE_LISTENING;
2259		rc = listening_get_next(seq, NULL);
2260		while (offset-- && rc)
2261			rc = listening_get_next(seq, rc);
2262		if (rc)
2263			break;
2264		st->bucket = 0;
2265		/* Fallthrough */
2266	case TCP_SEQ_STATE_ESTABLISHED:
2267	case TCP_SEQ_STATE_TIME_WAIT:
2268		st->state = TCP_SEQ_STATE_ESTABLISHED;
2269		if (st->bucket > tcp_hashinfo.ehash_mask)
2270			break;
2271		rc = established_get_first(seq);
2272		while (offset-- && rc)
2273			rc = established_get_next(seq, rc);
2274	}
2275
2276	st->num = orig_num;
2277
2278	return rc;
2279}
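
/*
 * tcp_seq_start() below stores the position of the previous read in
 * st->last_pos.  When userspace continues a sequential read of
 * /proc/net/tcp in several chunks, *pos matches last_pos and
 * tcp_seek_last_pos() resumes from the saved bucket and offset instead
 * of rescanning the hash tables from the beginning.
 */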
2280
2281static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2282{
2283	struct tcp_iter_state *st = seq->private;
2284	void *rc;
2285
2286	if (*pos && *pos == st->last_pos) {
2287		rc = tcp_seek_last_pos(seq);
2288		if (rc)
2289			goto out;
2290	}
2291
2292	st->state = TCP_SEQ_STATE_LISTENING;
2293	st->num = 0;
2294	st->bucket = 0;
2295	st->offset = 0;
2296	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2297
2298out:
2299	st->last_pos = *pos;
2300	return rc;
2301}
2302
2303static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2304{
2305	struct tcp_iter_state *st = seq->private;
2306	void *rc = NULL;
2307
2308	if (v == SEQ_START_TOKEN) {
2309		rc = tcp_get_idx(seq, 0);
2310		goto out;
2311	}
2312
2313	switch (st->state) {
2314	case TCP_SEQ_STATE_OPENREQ:
2315	case TCP_SEQ_STATE_LISTENING:
2316		rc = listening_get_next(seq, v);
2317		if (!rc) {
2318			st->state = TCP_SEQ_STATE_ESTABLISHED;
2319			st->bucket = 0;
2320			st->offset = 0;
2321			rc	  = established_get_first(seq);
2322		}
2323		break;
2324	case TCP_SEQ_STATE_ESTABLISHED:
2325	case TCP_SEQ_STATE_TIME_WAIT:
2326		rc = established_get_next(seq, v);
2327		break;
2328	}
2329out:
2330	++*pos;
2331	st->last_pos = *pos;
2332	return rc;
2333}
2334
2335static void tcp_seq_stop(struct seq_file *seq, void *v)
2336{
2337	struct tcp_iter_state *st = seq->private;
2338
2339	switch (st->state) {
2340	case TCP_SEQ_STATE_OPENREQ:
2341		if (v) {
2342			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2343			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2344		}
2345	case TCP_SEQ_STATE_LISTENING:
2346		if (v != SEQ_START_TOKEN)
2347			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2348		break;
2349	case TCP_SEQ_STATE_TIME_WAIT:
2350	case TCP_SEQ_STATE_ESTABLISHED:
2351		if (v)
2352			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2353		break;
2354	}
2355}
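
/*
 * Locking across ->start/->next/->stop: the iterator yields entries with
 * the relevant lock still held.  tcp_seq_stop() drops it according to
 * st->state: syn_wait_lock for TCP_SEQ_STATE_OPENREQ, the listening
 * bucket lock for TCP_SEQ_STATE_LISTENING, and the ehash bucket lock for
 * the established and TIME_WAIT states.
 */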
2356
2357int tcp_seq_open(struct inode *inode, struct file *file)
2358{
2359	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2360	struct tcp_iter_state *s;
2361	int err;
2362
2363	err = seq_open_net(inode, file, &afinfo->seq_ops,
2364			  sizeof(struct tcp_iter_state));
2365	if (err < 0)
2366		return err;
2367
2368	s = ((struct seq_file *)file->private_data)->private;
2369	s->family		= afinfo->family;
2370	s->last_pos 		= 0;
2371	return 0;
2372}
2373EXPORT_SYMBOL(tcp_seq_open);
2374
2375int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2376{
2377	int rc = 0;
2378	struct proc_dir_entry *p;
2379
2380	afinfo->seq_ops.start		= tcp_seq_start;
2381	afinfo->seq_ops.next		= tcp_seq_next;
2382	afinfo->seq_ops.stop		= tcp_seq_stop;
2383
2384	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2385			     afinfo->seq_fops, afinfo);
2386	if (!p)
2387		rc = -ENOMEM;
2388	return rc;
2389}
2390EXPORT_SYMBOL(tcp_proc_register);
2391
2392void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2393{
2394	proc_net_remove(net, afinfo->name);
2395}
2396EXPORT_SYMBOL(tcp_proc_unregister);
2397
2398static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2399			 struct seq_file *f, int i, int uid, int *len)
2400{
2401	const struct inet_request_sock *ireq = inet_rsk(req);
2402	int ttd = req->expires - jiffies;
2403
2404	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2405		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2406		i,
2407		ireq->loc_addr,
2408		ntohs(inet_sk(sk)->inet_sport),
2409		ireq->rmt_addr,
2410		ntohs(ireq->rmt_port),
2411		TCP_SYN_RECV,
2412		0, 0, /* could print option size, but that is af dependent. */
2413		1,    /* timers active (only the expire timer) */
2414		jiffies_to_clock_t(ttd),
2415		req->retrans,
2416		uid,
2417		0,  /* non standard timer */
2418		0,  /* non-standard timer */
2419		atomic_read(&sk->sk_refcnt),
2420		req,
2421		len);
2422}
2423
2424static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425{
2426	int timer_active;
2427	unsigned long timer_expires;
2428	const struct tcp_sock *tp = tcp_sk(sk);
2429	const struct inet_connection_sock *icsk = inet_csk(sk);
2430	const struct inet_sock *inet = inet_sk(sk);
2431	__be32 dest = inet->inet_daddr;
2432	__be32 src = inet->inet_rcv_saddr;
2433	__u16 destp = ntohs(inet->inet_dport);
2434	__u16 srcp = ntohs(inet->inet_sport);
2435	int rx_queue;
2436
2437	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2438		timer_active	= 1;
2439		timer_expires	= icsk->icsk_timeout;
2440	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2441		timer_active	= 4;
2442		timer_expires	= icsk->icsk_timeout;
2443	} else if (timer_pending(&sk->sk_timer)) {
2444		timer_active	= 2;
2445		timer_expires	= sk->sk_timer.expires;
2446	} else {
2447		timer_active	= 0;
2448		timer_expires = jiffies;
2449	}
2450
2451	if (sk->sk_state == TCP_LISTEN)
2452		rx_queue = sk->sk_ack_backlog;
2453	else
2454		/*
2455		 * Because we don't lock the socket, we might find a transient negative value.
2456		 */
2457		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2458
2459	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2460			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2461		i, src, srcp, dest, destp, sk->sk_state,
2462		tp->write_seq - tp->snd_una,
2463		rx_queue,
2464		timer_active,
2465		jiffies_to_clock_t(timer_expires - jiffies),
2466		icsk->icsk_retransmits,
2467		sock_i_uid(sk),
2468		icsk->icsk_probes_out,
2469		sock_i_ino(sk),
2470		atomic_read(&sk->sk_refcnt), sk,
2471		jiffies_to_clock_t(icsk->icsk_rto),
2472		jiffies_to_clock_t(icsk->icsk_ack.ato),
2473		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2474		tp->snd_cwnd,
2475		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2476		len);
2477}
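
/*
 * The fields above form one /proc/net/tcp line (see the header printed by
 * tcp4_seq_show()): addresses and ports are hexadecimal, tx_queue is
 * write_seq - snd_una, rx_queue is rcv_nxt - copied_seq (or the accept
 * backlog for listeners), and the timer code is 1 for the retransmit
 * timer, 2 when sk_timer is pending, 3 for TIME_WAIT sockets (see
 * get_timewait4_sock() below) and 4 for the zero-window probe timer.
 */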
2478
2479static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2480			       struct seq_file *f, int i, int *len)
2481{
2482	__be32 dest, src;
2483	__u16 destp, srcp;
2484	int ttd = tw->tw_ttd - jiffies;
2485
2486	if (ttd < 0)
2487		ttd = 0;
2488
2489	dest  = tw->tw_daddr;
2490	src   = tw->tw_rcv_saddr;
2491	destp = ntohs(tw->tw_dport);
2492	srcp  = ntohs(tw->tw_sport);
2493
2494	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2495		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2496		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2497		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2498		atomic_read(&tw->tw_refcnt), tw, len);
2499}
2500
2501#define TMPSZ 150
2502
2503static int tcp4_seq_show(struct seq_file *seq, void *v)
2504{
2505	struct tcp_iter_state *st;
2506	int len;
2507
2508	if (v == SEQ_START_TOKEN) {
2509		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2510			   "  sl  local_address rem_address   st tx_queue "
2511			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2512			   "inode");
2513		goto out;
2514	}
2515	st = seq->private;
2516
2517	switch (st->state) {
2518	case TCP_SEQ_STATE_LISTENING:
2519	case TCP_SEQ_STATE_ESTABLISHED:
2520		get_tcp4_sock(v, seq, st->num, &len);
2521		break;
2522	case TCP_SEQ_STATE_OPENREQ:
2523		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2524		break;
2525	case TCP_SEQ_STATE_TIME_WAIT:
2526		get_timewait4_sock(v, seq, st->num, &len);
2527		break;
2528	}
2529	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2530out:
2531	return 0;
2532}
2533
2534static const struct file_operations tcp_afinfo_seq_fops = {
2535	.owner   = THIS_MODULE,
2536	.open    = tcp_seq_open,
2537	.read    = seq_read,
2538	.llseek  = seq_lseek,
2539	.release = seq_release_net
2540};
2541
2542static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2543	.name		= "tcp",
2544	.family		= AF_INET,
2545	.seq_fops	= &tcp_afinfo_seq_fops,
2546	.seq_ops	= {
2547		.show		= tcp4_seq_show,
2548	},
2549};
2550
2551static int __net_init tcp4_proc_init_net(struct net *net)
2552{
2553	return tcp_proc_register(net, &tcp4_seq_afinfo);
2554}
2555
2556static void __net_exit tcp4_proc_exit_net(struct net *net)
2557{
2558	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2559}
2560
2561static struct pernet_operations tcp4_net_ops = {
2562	.init = tcp4_proc_init_net,
2563	.exit = tcp4_proc_exit_net,
2564};
2565
2566int __init tcp4_proc_init(void)
2567{
2568	return register_pernet_subsys(&tcp4_net_ops);
2569}
2570
2571void tcp4_proc_exit(void)
2572{
2573	unregister_pernet_subsys(&tcp4_net_ops);
2574}
2575#endif /* CONFIG_PROC_FS */
2576
2577struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2578{
2579	const struct iphdr *iph = skb_gro_network_header(skb);
2580
2581	switch (skb->ip_summed) {
2582	case CHECKSUM_COMPLETE:
2583		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2584				  skb->csum)) {
2585			skb->ip_summed = CHECKSUM_UNNECESSARY;
2586			break;
2587		}
2588
2589		/* fall through */
2590	case CHECKSUM_NONE:
2591		NAPI_GRO_CB(skb)->flush = 1;
2592		return NULL;
2593	}
2594
2595	return tcp_gro_receive(head, skb);
2596}
2597
2598int tcp4_gro_complete(struct sk_buff *skb)
2599{
2600	const struct iphdr *iph = ip_hdr(skb);
2601	struct tcphdr *th = tcp_hdr(skb);
2602
2603	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2604				  iph->saddr, iph->daddr, 0);
2605	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2606
2607	return tcp_gro_complete(skb);
2608}
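
/*
 * GRO glue for IPv4 TCP.  tcp4_gro_receive() only hands a segment to the
 * generic tcp_gro_receive() once its checksum is settled: a
 * CHECKSUM_COMPLETE value is verified against the pseudo-header here,
 * while CHECKSUM_NONE packets are flagged for flush and never merged.
 * tcp4_gro_complete() recomputes the pseudo-header part of th->check and
 * marks the aggregated skb as SKB_GSO_TCPV4 before handing it to
 * tcp_gro_complete(), so it can be resegmented later if necessary.
 */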
2609
2610struct proto tcp_prot = {
2611	.name			= "TCP",
2612	.owner			= THIS_MODULE,
2613	.close			= tcp_close,
2614	.connect		= tcp_v4_connect,
2615	.disconnect		= tcp_disconnect,
2616	.accept			= inet_csk_accept,
2617	.ioctl			= tcp_ioctl,
2618	.init			= tcp_v4_init_sock,
2619	.destroy		= tcp_v4_destroy_sock,
2620	.shutdown		= tcp_shutdown,
2621	.setsockopt		= tcp_setsockopt,
2622	.getsockopt		= tcp_getsockopt,
2623	.recvmsg		= tcp_recvmsg,
2624	.sendmsg		= tcp_sendmsg,
2625	.sendpage		= tcp_sendpage,
2626	.backlog_rcv		= tcp_v4_do_rcv,
2627	.hash			= inet_hash,
2628	.unhash			= inet_unhash,
2629	.get_port		= inet_csk_get_port,
2630	.enter_memory_pressure	= tcp_enter_memory_pressure,
2631	.sockets_allocated	= &tcp_sockets_allocated,
2632	.orphan_count		= &tcp_orphan_count,
2633	.memory_allocated	= &tcp_memory_allocated,
2634	.memory_pressure	= &tcp_memory_pressure,
2635	.sysctl_wmem		= sysctl_tcp_wmem,
2636	.sysctl_rmem		= sysctl_tcp_rmem,
2637	.max_header		= MAX_TCP_HEADER,
2638	.obj_size		= sizeof(struct tcp_sock),
2639	.slab_flags		= SLAB_DESTROY_BY_RCU,
2640	.twsk_prot		= &tcp_timewait_sock_ops,
2641	.rsk_prot		= &tcp_request_sock_ops,
2642	.h.hashinfo		= &tcp_hashinfo,
2643	.no_autobind		= true,
2644#ifdef CONFIG_COMPAT
2645	.compat_setsockopt	= compat_tcp_setsockopt,
2646	.compat_getsockopt	= compat_tcp_getsockopt,
2647#endif
2648#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2649	.init_cgroup		= tcp_init_cgroup,
2650	.destroy_cgroup		= tcp_destroy_cgroup,
2651	.proto_cgroup		= tcp_proto_cgroup,
2652#endif
2653};
2654EXPORT_SYMBOL(tcp_prot);
2655
2656static int __net_init tcp_sk_init(struct net *net)
2657{
2658	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2659				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2660}
2661
2662static void __net_exit tcp_sk_exit(struct net *net)
2663{
2664	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2665}
2666
2667static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2668{
2669	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2670}
2671
2672static struct pernet_operations __net_initdata tcp_sk_ops = {
2673	.init	   = tcp_sk_init,
2674	.exit	   = tcp_sk_exit,
2675	.exit_batch = tcp_sk_exit_batch,
2676};
2677
2678void __init tcp_v4_init(void)
2679{
2680	inet_hashinfo_init(&tcp_hashinfo);
2681	if (register_pernet_subsys(&tcp_sk_ops))
2682		panic("Failed to create the TCP control socket.\n");
2683}
2684