tcp_ipv4.c revision 4c507d2897bd9be810b3403ade73b04cf6fdfd4a
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75#include <net/secure_seq.h>
76#include <net/tcp_memcontrol.h>
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83
84#include <linux/crypto.h>
85#include <linux/scatterlist.h>
86
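/* Tunables exported as /proc/sys/net/ipv4/tcp_tw_reuse and
 * /proc/sys/net/ipv4/tcp_low_latency respectively.
 */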
87int sysctl_tcp_tw_reuse __read_mostly;
88int sysctl_tcp_low_latency __read_mostly;
89EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91
92#ifdef CONFIG_TCP_MD5SIG
93static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95#endif
96
97struct inet_hashinfo tcp_hashinfo;
98EXPORT_SYMBOL(tcp_hashinfo);
99
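/* Derive our initial sequence number for an incoming connection from the
 * address/port four-tuple of the received SYN, using the secure (hashed)
 * ISN generator.
 */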
100static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101{
102	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103					  ip_hdr(skb)->saddr,
104					  tcp_hdr(skb)->dest,
105					  tcp_hdr(skb)->source);
106}
107
108int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109{
110	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111	struct tcp_sock *tp = tcp_sk(sk);
112
113	/* With PAWS, it is safe from the viewpoint
114	   of data integrity. Even without PAWS it is safe provided sequence
115	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117	   Actually, the idea is close to VJ's: the timestamp cache is
118	   held not per host but per port pair, and the TW bucket is used
119	   as the state holder.
120
121	   If the TW bucket has already been destroyed we fall back to VJ's
122	   scheme and use the initial timestamp retrieved from the peer table.
123	 */
124	if (tcptw->tw_ts_recent_stamp &&
125	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128		if (tp->write_seq == 0)
129			tp->write_seq = 1;
130		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132		sock_hold(sktw);
133		return 1;
134	}
135
136	return 0;
137}
138EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140/* This will initiate an outgoing connection. */
141int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142{
143	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144	struct inet_sock *inet = inet_sk(sk);
145	struct tcp_sock *tp = tcp_sk(sk);
146	__be16 orig_sport, orig_dport;
147	__be32 daddr, nexthop;
148	struct flowi4 *fl4;
149	struct rtable *rt;
150	int err;
151	struct ip_options_rcu *inet_opt;
152
153	if (addr_len < sizeof(struct sockaddr_in))
154		return -EINVAL;
155
156	if (usin->sin_family != AF_INET)
157		return -EAFNOSUPPORT;
158
159	nexthop = daddr = usin->sin_addr.s_addr;
160	inet_opt = rcu_dereference_protected(inet->inet_opt,
161					     sock_owned_by_user(sk));
162	if (inet_opt && inet_opt->opt.srr) {
163		if (!daddr)
164			return -EINVAL;
165		nexthop = inet_opt->opt.faddr;
166	}
167
168	orig_sport = inet->inet_sport;
169	orig_dport = usin->sin_port;
170	fl4 = &inet->cork.fl.u.ip4;
171	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173			      IPPROTO_TCP,
174			      orig_sport, orig_dport, sk, true);
175	if (IS_ERR(rt)) {
176		err = PTR_ERR(rt);
177		if (err == -ENETUNREACH)
178			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179		return err;
180	}
181
182	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183		ip_rt_put(rt);
184		return -ENETUNREACH;
185	}
186
187	if (!inet_opt || !inet_opt->opt.srr)
188		daddr = fl4->daddr;
189
190	if (!inet->inet_saddr)
191		inet->inet_saddr = fl4->saddr;
192	inet->inet_rcv_saddr = inet->inet_saddr;
193
194	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195		/* Reset inherited state */
196		tp->rx_opt.ts_recent	   = 0;
197		tp->rx_opt.ts_recent_stamp = 0;
198		tp->write_seq		   = 0;
199	}
200
201	if (tcp_death_row.sysctl_tw_recycle &&
202	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
203		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
204		/*
205		 * VJ's idea. We save last timestamp seen from
206		 * the destination in peer table, when entering state
207		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
208		 * when trying new connection.
209		 */
210		if (peer) {
211			inet_peer_refcheck(peer);
212			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
213				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
214				tp->rx_opt.ts_recent = peer->tcp_ts;
215			}
216		}
217	}
218
219	inet->inet_dport = usin->sin_port;
220	inet->inet_daddr = daddr;
221
222	inet_csk(sk)->icsk_ext_hdr_len = 0;
223	if (inet_opt)
224		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
225
226	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
227
228	/* Socket identity is still unknown (sport may be zero).
229	 * However we set state to SYN-SENT and, without releasing the socket
230	 * lock, select a source port, enter ourselves into the hash tables and
231	 * complete initialization after this.
232	 */
233	tcp_set_state(sk, TCP_SYN_SENT);
234	err = inet_hash_connect(&tcp_death_row, sk);
235	if (err)
236		goto failure;
237
238	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
239			       inet->inet_sport, inet->inet_dport, sk);
240	if (IS_ERR(rt)) {
241		err = PTR_ERR(rt);
242		rt = NULL;
243		goto failure;
244	}
245	/* OK, now commit destination to socket.  */
246	sk->sk_gso_type = SKB_GSO_TCPV4;
247	sk_setup_caps(sk, &rt->dst);
248
249	if (!tp->write_seq)
250		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251							   inet->inet_daddr,
252							   inet->inet_sport,
253							   usin->sin_port);
254
255	inet->inet_id = tp->write_seq ^ jiffies;
256
257	err = tcp_connect(sk);
258	rt = NULL;
259	if (err)
260		goto failure;
261
262	return 0;
263
264failure:
265	/*
266	 * This unhashes the socket and releases the local port,
267	 * if necessary.
268	 */
269	tcp_set_state(sk, TCP_CLOSE);
270	ip_rt_put(rt);
271	sk->sk_route_caps = 0;
272	inet->inet_dport = 0;
273	return err;
274}
275EXPORT_SYMBOL(tcp_v4_connect);
276
277/*
278 * This routine does path mtu discovery as defined in RFC1191.
279 */
280static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
281{
282	struct dst_entry *dst;
283	struct inet_sock *inet = inet_sk(sk);
284
285	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286	 * sent out by Linux are always < 576 bytes, so they should go through
287	 * unfragmented).
288	 */
289	if (sk->sk_state == TCP_LISTEN)
290		return;
291
292	/* We don't check in the dst entry if pmtu discovery is forbidden
293	 * on this route. We just assume that no packet-too-big packets
294	 * are sent back when pmtu discovery is not active.
295	 * There is a small race when the user changes this flag in the
296	 * route, but I think that's acceptable.
297	 */
298	if ((dst = __sk_dst_check(sk, 0)) == NULL)
299		return;
300
301	dst->ops->update_pmtu(dst, mtu);
302
303	/* Something is about to go wrong... Remember the soft error
304	 * in case this connection is not able to recover.
305	 */
306	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307		sk->sk_err_soft = EMSGSIZE;
308
309	mtu = dst_mtu(dst);
310
311	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
312	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
313		tcp_sync_mss(sk, mtu);
314
315		/* Resend the TCP packet because it's
316		 * clear that the old packet has been
317		 * dropped. This is the new "fast" path mtu
318		 * discovery.
319		 */
320		tcp_simple_retransmit(sk);
321	} /* else let the usual retransmit timer handle it */
322}
323
324/*
325 * This routine is called by the ICMP module when it gets some
326 * sort of error condition.  If err < 0 then the socket should
327 * be closed and the error returned to the user.  If err > 0
328 * it's just the icmp type << 8 | icmp code.  After adjustment
329 * header points to the first 8 bytes of the tcp header.  We need
330 * to find the appropriate port.
331 *
332 * The locking strategy used here is very "optimistic". When
333 * someone else accesses the socket the ICMP is just dropped
334 * and for some paths there is no check at all.
335 * A more general error queue to queue errors for later handling
336 * is probably better.
337 *
338 */
339
340void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
341{
342	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
343	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
344	struct inet_connection_sock *icsk;
345	struct tcp_sock *tp;
346	struct inet_sock *inet;
347	const int type = icmp_hdr(icmp_skb)->type;
348	const int code = icmp_hdr(icmp_skb)->code;
349	struct sock *sk;
350	struct sk_buff *skb;
351	__u32 seq;
352	__u32 remaining;
353	int err;
354	struct net *net = dev_net(icmp_skb->dev);
355
356	if (icmp_skb->len < (iph->ihl << 2) + 8) {
357		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
358		return;
359	}
360
361	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
362			iph->saddr, th->source, inet_iif(icmp_skb));
363	if (!sk) {
364		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365		return;
366	}
367	if (sk->sk_state == TCP_TIME_WAIT) {
368		inet_twsk_put(inet_twsk(sk));
369		return;
370	}
371
372	bh_lock_sock(sk);
373	/* If too many ICMPs get dropped on busy
374	 * servers this needs to be solved differently.
375	 */
376	if (sock_owned_by_user(sk))
377		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
378
379	if (sk->sk_state == TCP_CLOSE)
380		goto out;
381
382	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
383		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
384		goto out;
385	}
386
387	icsk = inet_csk(sk);
388	tp = tcp_sk(sk);
389	seq = ntohl(th->seq);
390	if (sk->sk_state != TCP_LISTEN &&
391	    !between(seq, tp->snd_una, tp->snd_nxt)) {
392		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393		goto out;
394	}
395
396	switch (type) {
397	case ICMP_SOURCE_QUENCH:
398		/* Just silently ignore these. */
399		goto out;
400	case ICMP_PARAMETERPROB:
401		err = EPROTO;
402		break;
403	case ICMP_DEST_UNREACH:
404		if (code > NR_ICMP_UNREACH)
405			goto out;
406
407		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
408			if (!sock_owned_by_user(sk))
409				do_pmtu_discovery(sk, iph, info);
410			goto out;
411		}
412
413		err = icmp_err_convert[code].errno;
414		/* check if icmp_skb allows revert of backoff
415		 * (see draft-zimmermann-tcp-lcd) */
416		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
417			break;
418		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
419		    !icsk->icsk_backoff)
420			break;
421
422		if (sock_owned_by_user(sk))
423			break;
424
425		icsk->icsk_backoff--;
426		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
427			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
428		tcp_bound_rto(sk);
429
430		skb = tcp_write_queue_head(sk);
431		BUG_ON(!skb);
432
433		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
434				tcp_time_stamp - TCP_SKB_CB(skb)->when);
435
436		if (remaining) {
437			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438						  remaining, TCP_RTO_MAX);
439		} else {
440			/* RTO revert clocked out retransmission.
441			 * Will retransmit now */
442			tcp_retransmit_timer(sk);
443		}
444
445		break;
446	case ICMP_TIME_EXCEEDED:
447		err = EHOSTUNREACH;
448		break;
449	default:
450		goto out;
451	}
452
453	switch (sk->sk_state) {
454		struct request_sock *req, **prev;
455	case TCP_LISTEN:
456		if (sock_owned_by_user(sk))
457			goto out;
458
459		req = inet_csk_search_req(sk, &prev, th->dest,
460					  iph->daddr, iph->saddr);
461		if (!req)
462			goto out;
463
464		/* ICMPs are not backlogged, hence we cannot get
465		   an established socket here.
466		 */
467		WARN_ON(req->sk);
468
469		if (seq != tcp_rsk(req)->snt_isn) {
470			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
471			goto out;
472		}
473
474		/*
475		 * Still in SYN_RECV, just remove it silently.
476		 * There is no good way to pass the error to the newly
477		 * created socket, and POSIX does not want network
478		 * errors returned from accept().
479		 */
480		inet_csk_reqsk_queue_drop(sk, req, prev);
481		goto out;
482
483	case TCP_SYN_SENT:
484	case TCP_SYN_RECV:  /* Cannot happen normally.
485			       It can, e.g., if SYNs crossed.
486			     */
487		if (!sock_owned_by_user(sk)) {
488			sk->sk_err = err;
489
490			sk->sk_error_report(sk);
491
492			tcp_done(sk);
493		} else {
494			sk->sk_err_soft = err;
495		}
496		goto out;
497	}
498
499	/* If we've already connected we will keep trying
500	 * until we time out, or the user gives up.
501	 *
502	 * rfc1122 4.2.3.9 allows us to consider as hard errors
503	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
504	 * but it is obsoleted by pmtu discovery).
505	 *
506	 * Note that in the modern internet, where routing is unreliable
507	 * and broken firewalls sit in every dark corner sending random
508	 * errors as ordered by their masters, even these two messages
509	 * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
510	 *
511	 * Now we are in compliance with RFCs.
512	 *							--ANK (980905)
513	 */
514
515	inet = inet_sk(sk);
516	if (!sock_owned_by_user(sk) && inet->recverr) {
517		sk->sk_err = err;
518		sk->sk_error_report(sk);
519	} else	{ /* Only an error on timeout */
520		sk->sk_err_soft = err;
521	}
522
523out:
524	bh_unlock_sock(sk);
525	sock_put(sk);
526}
527
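/* Fill in the TCP checksum of @skb for the given source/destination
 * addresses.  With CHECKSUM_PARTIAL only the pseudo-header sum is stored
 * and the device (or software fallback) completes the rest.
 */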
528static void __tcp_v4_send_check(struct sk_buff *skb,
529				__be32 saddr, __be32 daddr)
530{
531	struct tcphdr *th = tcp_hdr(skb);
532
533	if (skb->ip_summed == CHECKSUM_PARTIAL) {
534		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
535		skb->csum_start = skb_transport_header(skb) - skb->head;
536		skb->csum_offset = offsetof(struct tcphdr, check);
537	} else {
538		th->check = tcp_v4_check(skb->len, saddr, daddr,
539					 csum_partial(th,
540						      th->doff << 2,
541						      skb->csum));
542	}
543}
544
545/* This routine computes an IPv4 TCP checksum. */
546void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
547{
548	const struct inet_sock *inet = inet_sk(sk);
549
550	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
551}
552EXPORT_SYMBOL(tcp_v4_send_check);
553
554int tcp_v4_gso_send_check(struct sk_buff *skb)
555{
556	const struct iphdr *iph;
557	struct tcphdr *th;
558
559	if (!pskb_may_pull(skb, sizeof(*th)))
560		return -EINVAL;
561
562	iph = ip_hdr(skb);
563	th = tcp_hdr(skb);
564
565	th->check = 0;
566	skb->ip_summed = CHECKSUM_PARTIAL;
567	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
568	return 0;
569}
570
571/*
572 *	This routine will send an RST to the other tcp.
573 *
574 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
575 *		      for the reset?
576 *	Answer: if a packet caused an RST, it is not for a socket
577 *		existing in our system; if it is matched to a socket,
578 *		it is just a duplicate segment or a bug in the other side's TCP.
579 *		So we build the reply based only on the parameters
580 *		that arrived with the segment.
581 *	Exception: precedence violation. We do not implement it in any case.
582 */
583
584static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
585{
586	const struct tcphdr *th = tcp_hdr(skb);
587	struct {
588		struct tcphdr th;
589#ifdef CONFIG_TCP_MD5SIG
590		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591#endif
592	} rep;
593	struct ip_reply_arg arg;
594#ifdef CONFIG_TCP_MD5SIG
595	struct tcp_md5sig_key *key;
596	const __u8 *hash_location = NULL;
597	unsigned char newhash[16];
598	int genhash;
599	struct sock *sk1 = NULL;
600#endif
601	struct net *net;
602
603	/* Never send a reset in response to a reset. */
604	if (th->rst)
605		return;
606
607	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
608		return;
609
610	/* Swap the send and the receive. */
611	memset(&rep, 0, sizeof(rep));
612	rep.th.dest   = th->source;
613	rep.th.source = th->dest;
614	rep.th.doff   = sizeof(struct tcphdr) / 4;
615	rep.th.rst    = 1;
616
617	if (th->ack) {
618		rep.th.seq = th->ack_seq;
619	} else {
620		rep.th.ack = 1;
621		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622				       skb->len - (th->doff << 2));
623	}
624
625	memset(&arg, 0, sizeof(arg));
626	arg.iov[0].iov_base = (unsigned char *)&rep;
627	arg.iov[0].iov_len  = sizeof(rep.th);
628
629#ifdef CONFIG_TCP_MD5SIG
630	hash_location = tcp_parse_md5sig_option(th);
631	if (!sk && hash_location) {
632		/*
633		 * The active side is gone. Try to find the listening socket
634		 * through the source port, and then the md5 key through that
635		 * listening socket. We do not lose security here:
636		 * the incoming packet is checked with the md5 hash of the key
637		 * we find, and no RST is generated if the md5 hash doesn't match.
638		 */
639		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
640					     &tcp_hashinfo, ip_hdr(skb)->daddr,
641					     ntohs(th->source), inet_iif(skb));
642		/* don't send an RST if we can't find the key */
643		if (!sk1)
644			return;
645		rcu_read_lock();
646		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
647					&ip_hdr(skb)->saddr, AF_INET);
648		if (!key)
649			goto release_sk1;
650
651		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
652		if (genhash || memcmp(hash_location, newhash, 16) != 0)
653			goto release_sk1;
654	} else {
655		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
656					     &ip_hdr(skb)->saddr,
657					     AF_INET) : NULL;
658	}
659
660	if (key) {
661		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
662				   (TCPOPT_NOP << 16) |
663				   (TCPOPT_MD5SIG << 8) |
664				   TCPOLEN_MD5SIG);
665		/* Update length and the length the header thinks exists */
666		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
667		rep.th.doff = arg.iov[0].iov_len / 4;
668
669		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
670				     key, ip_hdr(skb)->saddr,
671				     ip_hdr(skb)->daddr, &rep.th);
672	}
673#endif
674	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
675				      ip_hdr(skb)->saddr, /* XXX */
676				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
677	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
678	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
679	/* When the socket is gone, all binding information is lost and
680	 * routing might fail. Use the incoming interface (iif) as oif to
681	 * make sure we can deliver the reply.
682	 */
683	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
684
685	net = dev_net(skb_dst(skb)->dev);
686	arg.tos = ip_hdr(skb)->tos;
687	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
688		      &arg, arg.iov[0].iov_len);
689
690	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
691	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
692
693#ifdef CONFIG_TCP_MD5SIG
694release_sk1:
695	if (sk1) {
696		rcu_read_unlock();
697		sock_put(sk1);
698	}
699#endif
700}
701
702/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
703   outside of socket context, is certainly ugly. What can I do?
704 */
705
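/* Build and send a bare ACK, derived from the headers of a received
 * segment, without a full socket: used for the TIME-WAIT and SYN-RECV
 * replies below.  Optionally carries a timestamp and an MD5 option.
 */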
706static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
707			    u32 win, u32 ts, int oif,
708			    struct tcp_md5sig_key *key,
709			    int reply_flags, u8 tos)
710{
711	const struct tcphdr *th = tcp_hdr(skb);
712	struct {
713		struct tcphdr th;
714		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
715#ifdef CONFIG_TCP_MD5SIG
716			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
717#endif
718			];
719	} rep;
720	struct ip_reply_arg arg;
721	struct net *net = dev_net(skb_dst(skb)->dev);
722
723	memset(&rep.th, 0, sizeof(struct tcphdr));
724	memset(&arg, 0, sizeof(arg));
725
726	arg.iov[0].iov_base = (unsigned char *)&rep;
727	arg.iov[0].iov_len  = sizeof(rep.th);
728	if (ts) {
729		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
730				   (TCPOPT_TIMESTAMP << 8) |
731				   TCPOLEN_TIMESTAMP);
732		rep.opt[1] = htonl(tcp_time_stamp);
733		rep.opt[2] = htonl(ts);
734		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
735	}
736
737	/* Swap the send and the receive. */
738	rep.th.dest    = th->source;
739	rep.th.source  = th->dest;
740	rep.th.doff    = arg.iov[0].iov_len / 4;
741	rep.th.seq     = htonl(seq);
742	rep.th.ack_seq = htonl(ack);
743	rep.th.ack     = 1;
744	rep.th.window  = htons(win);
745
746#ifdef CONFIG_TCP_MD5SIG
747	if (key) {
748		int offset = (ts) ? 3 : 0;
749
750		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
751					  (TCPOPT_NOP << 16) |
752					  (TCPOPT_MD5SIG << 8) |
753					  TCPOLEN_MD5SIG);
754		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
755		rep.th.doff = arg.iov[0].iov_len/4;
756
757		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
758				    key, ip_hdr(skb)->saddr,
759				    ip_hdr(skb)->daddr, &rep.th);
760	}
761#endif
762	arg.flags = reply_flags;
763	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
764				      ip_hdr(skb)->saddr, /* XXX */
765				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
766	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
767	if (oif)
768		arg.bound_dev_if = oif;
769	arg.tos = tos;
770	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
771		      &arg, arg.iov[0].iov_len);
772
773	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
774}
775
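/* Answer a segment that hit a TIME-WAIT socket with an ACK carrying the
 * remembered sequence and timestamp state, then drop our reference on
 * the timewait socket.
 */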
776static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
777{
778	struct inet_timewait_sock *tw = inet_twsk(sk);
779	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
780
781	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
782			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
783			tcptw->tw_ts_recent,
784			tw->tw_bound_dev_if,
785			tcp_twsk_md5_key(tcptw),
786			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
787			tw->tw_tos
788			);
789
790	inet_twsk_put(tw);
791}
792
793static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
794				  struct request_sock *req)
795{
796	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
797			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
798			req->ts_recent,
799			0,
800			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
801					  AF_INET),
802			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
803			ip_hdr(skb)->tos);
804}
805
806/*
807 *	Send a SYN-ACK after having received a SYN.
808 *	This still operates on a request_sock only, not on a big
809 *	socket.
810 */
811static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
812			      struct request_sock *req,
813			      struct request_values *rvp)
814{
815	const struct inet_request_sock *ireq = inet_rsk(req);
816	struct flowi4 fl4;
817	int err = -1;
818	struct sk_buff * skb;
819
820	/* First, grab a route. */
821	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
822		return -1;
823
824	skb = tcp_make_synack(sk, dst, req, rvp);
825
826	if (skb) {
827		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
828
829		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
830					    ireq->rmt_addr,
831					    ireq->opt);
832		err = net_xmit_eval(err);
833	}
834
835	dst_release(dst);
836	return err;
837}
838
839static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
840			      struct request_values *rvp)
841{
842	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
843	return tcp_v4_send_synack(sk, NULL, req, rvp);
844}
845
846/*
847 *	IPv4 request_sock destructor.
848 */
849static void tcp_v4_reqsk_destructor(struct request_sock *req)
850{
851	kfree(inet_rsk(req)->opt);
852}
853
854/*
855 * Return 1 if a syncookie should be sent
856 */
857int tcp_syn_flood_action(struct sock *sk,
858			 const struct sk_buff *skb,
859			 const char *proto)
860{
861	const char *msg = "Dropping request";
862	int want_cookie = 0;
863	struct listen_sock *lopt;
864
867#ifdef CONFIG_SYN_COOKIES
868	if (sysctl_tcp_syncookies) {
869		msg = "Sending cookies";
870		want_cookie = 1;
871		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
872	} else
873#endif
874		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
875
876	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
877	if (!lopt->synflood_warned) {
878		lopt->synflood_warned = 1;
879		pr_info("%s: Possible SYN flooding on port %d. %s. "
880			"Check SNMP counters.\n",
881			proto, ntohs(tcp_hdr(skb)->dest), msg);
882	}
883	return want_cookie;
884}
885EXPORT_SYMBOL(tcp_syn_flood_action);
886
887/*
888 * Save and compile IPv4 options into the request_sock if needed.
889 */
890static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
891						  struct sk_buff *skb)
892{
893	const struct ip_options *opt = &(IPCB(skb)->opt);
894	struct ip_options_rcu *dopt = NULL;
895
896	if (opt && opt->optlen) {
897		int opt_size = sizeof(*dopt) + opt->optlen;
898
899		dopt = kmalloc(opt_size, GFP_ATOMIC);
900		if (dopt) {
901			if (ip_options_echo(&dopt->opt, skb)) {
902				kfree(dopt);
903				dopt = NULL;
904			}
905		}
906	}
907	return dopt;
908}
909
910#ifdef CONFIG_TCP_MD5SIG
911/*
912 * RFC2385 MD5 checksumming requires a mapping of
913 * IP address->MD5 Key.
914 * We need to maintain these in the sk structure.
915 */
916
917/* Find the Key structure for an address.  */
918struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
919					 const union tcp_md5_addr *addr,
920					 int family)
921{
922	struct tcp_sock *tp = tcp_sk(sk);
923	struct tcp_md5sig_key *key;
924	struct hlist_node *pos;
925	unsigned int size = sizeof(struct in_addr);
926	struct tcp_md5sig_info *md5sig;
927
928	/* caller either holds rcu_read_lock() or socket lock */
929	md5sig = rcu_dereference_check(tp->md5sig_info,
930				       sock_owned_by_user(sk));
931	if (!md5sig)
932		return NULL;
933#if IS_ENABLED(CONFIG_IPV6)
934	if (family == AF_INET6)
935		size = sizeof(struct in6_addr);
936#endif
937	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
938		if (key->family != family)
939			continue;
940		if (!memcmp(&key->addr, addr, size))
941			return key;
942	}
943	return NULL;
944}
945EXPORT_SYMBOL(tcp_md5_do_lookup);
946
947struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
948					 struct sock *addr_sk)
949{
950	union tcp_md5_addr *addr;
951
952	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
953	return tcp_md5_do_lookup(sk, addr, AF_INET);
954}
955EXPORT_SYMBOL(tcp_v4_md5_lookup);
956
957static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
958						      struct request_sock *req)
959{
960	union tcp_md5_addr *addr;
961
962	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
963	return tcp_md5_do_lookup(sk, addr, AF_INET);
964}
965
966/* This can be called on a newly created socket, from other files */
967int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
968		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
969{
970	/* Add Key to the list */
971	struct tcp_md5sig_key *key;
972	struct tcp_sock *tp = tcp_sk(sk);
973	struct tcp_md5sig_info *md5sig;
974
975	key = tcp_md5_do_lookup(sk, addr, family);
976	if (key) {
977		/* Pre-existing entry - just update that one. */
978		memcpy(key->key, newkey, newkeylen);
979		key->keylen = newkeylen;
980		return 0;
981	}
982
983	md5sig = rcu_dereference_protected(tp->md5sig_info,
984					   sock_owned_by_user(sk));
985	if (!md5sig) {
986		md5sig = kmalloc(sizeof(*md5sig), gfp);
987		if (!md5sig)
988			return -ENOMEM;
989
990		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
991		INIT_HLIST_HEAD(&md5sig->head);
992		rcu_assign_pointer(tp->md5sig_info, md5sig);
993	}
994
995	key = sock_kmalloc(sk, sizeof(*key), gfp);
996	if (!key)
997		return -ENOMEM;
998	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
999		sock_kfree_s(sk, key, sizeof(*key));
1000		return -ENOMEM;
1001	}
1002
1003	memcpy(key->key, newkey, newkeylen);
1004	key->keylen = newkeylen;
1005	key->family = family;
1006	memcpy(&key->addr, addr,
1007	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1008				      sizeof(struct in_addr));
1009	hlist_add_head_rcu(&key->node, &md5sig->head);
1010	return 0;
1011}
1012EXPORT_SYMBOL(tcp_md5_do_add);
1013
1014int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1015{
1016	struct tcp_sock *tp = tcp_sk(sk);
1017	struct tcp_md5sig_key *key;
1018	struct tcp_md5sig_info *md5sig;
1019
1020	key = tcp_md5_do_lookup(sk, addr, family);
1021	if (!key)
1022		return -ENOENT;
1023	hlist_del_rcu(&key->node);
1024	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1025	kfree_rcu(key, rcu);
1026	md5sig = rcu_dereference_protected(tp->md5sig_info,
1027					   sock_owned_by_user(sk));
1028	if (hlist_empty(&md5sig->head))
1029		tcp_free_md5sig_pool();
1030	return 0;
1031}
1032EXPORT_SYMBOL(tcp_md5_do_del);
1033
1034void tcp_clear_md5_list(struct sock *sk)
1035{
1036	struct tcp_sock *tp = tcp_sk(sk);
1037	struct tcp_md5sig_key *key;
1038	struct hlist_node *pos, *n;
1039	struct tcp_md5sig_info *md5sig;
1040
1041	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1042
1043	if (!hlist_empty(&md5sig->head))
1044		tcp_free_md5sig_pool();
1045	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1046		hlist_del_rcu(&key->node);
1047		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048		kfree_rcu(key, rcu);
1049	}
1050}
1051
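/* Userspace reaches this handler through setsockopt(TCP_MD5SIG).  As a
 * rough illustration (fd, peer and secret are placeholders, error
 * handling omitted):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr = peer;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */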
1052static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1053				 int optlen)
1054{
1055	struct tcp_md5sig cmd;
1056	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1057
1058	if (optlen < sizeof(cmd))
1059		return -EINVAL;
1060
1061	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1062		return -EFAULT;
1063
1064	if (sin->sin_family != AF_INET)
1065		return -EINVAL;
1066
1067	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1068		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1069				      AF_INET);
1070
1071	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1072		return -EINVAL;
1073
1074	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1075			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1076			      GFP_KERNEL);
1077}
1078
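/* Feed the IPv4 TCP pseudo-header (saddr, daddr, zero pad, protocol,
 * segment length) into the MD5 hash state, as RFC 2385 requires.
 */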
1079static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1080					__be32 daddr, __be32 saddr, int nbytes)
1081{
1082	struct tcp4_pseudohdr *bp;
1083	struct scatterlist sg;
1084
1085	bp = &hp->md5_blk.ip4;
1086
1087	/*
1088	 * 1. the TCP pseudo-header (in the order: source IP address,
1089	 * destination IP address, zero-padded protocol number, and
1090	 * segment length)
1091	 */
1092	bp->saddr = saddr;
1093	bp->daddr = daddr;
1094	bp->pad = 0;
1095	bp->protocol = IPPROTO_TCP;
1096	bp->len = cpu_to_be16(nbytes);
1097
1098	sg_init_one(&sg, bp, sizeof(*bp));
1099	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1100}
1101
1102static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1103			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1104{
1105	struct tcp_md5sig_pool *hp;
1106	struct hash_desc *desc;
1107
1108	hp = tcp_get_md5sig_pool();
1109	if (!hp)
1110		goto clear_hash_noput;
1111	desc = &hp->md5_desc;
1112
1113	if (crypto_hash_init(desc))
1114		goto clear_hash;
1115	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1116		goto clear_hash;
1117	if (tcp_md5_hash_header(hp, th))
1118		goto clear_hash;
1119	if (tcp_md5_hash_key(hp, key))
1120		goto clear_hash;
1121	if (crypto_hash_final(desc, md5_hash))
1122		goto clear_hash;
1123
1124	tcp_put_md5sig_pool();
1125	return 0;
1126
1127clear_hash:
1128	tcp_put_md5sig_pool();
1129clear_hash_noput:
1130	memset(md5_hash, 0, 16);
1131	return 1;
1132}
1133
1134int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1135			const struct sock *sk, const struct request_sock *req,
1136			const struct sk_buff *skb)
1137{
1138	struct tcp_md5sig_pool *hp;
1139	struct hash_desc *desc;
1140	const struct tcphdr *th = tcp_hdr(skb);
1141	__be32 saddr, daddr;
1142
1143	if (sk) {
1144		saddr = inet_sk(sk)->inet_saddr;
1145		daddr = inet_sk(sk)->inet_daddr;
1146	} else if (req) {
1147		saddr = inet_rsk(req)->loc_addr;
1148		daddr = inet_rsk(req)->rmt_addr;
1149	} else {
1150		const struct iphdr *iph = ip_hdr(skb);
1151		saddr = iph->saddr;
1152		daddr = iph->daddr;
1153	}
1154
1155	hp = tcp_get_md5sig_pool();
1156	if (!hp)
1157		goto clear_hash_noput;
1158	desc = &hp->md5_desc;
1159
1160	if (crypto_hash_init(desc))
1161		goto clear_hash;
1162
1163	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1164		goto clear_hash;
1165	if (tcp_md5_hash_header(hp, th))
1166		goto clear_hash;
1167	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1168		goto clear_hash;
1169	if (tcp_md5_hash_key(hp, key))
1170		goto clear_hash;
1171	if (crypto_hash_final(desc, md5_hash))
1172		goto clear_hash;
1173
1174	tcp_put_md5sig_pool();
1175	return 0;
1176
1177clear_hash:
1178	tcp_put_md5sig_pool();
1179clear_hash_noput:
1180	memset(md5_hash, 0, 16);
1181	return 1;
1182}
1183EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1184
1185static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1186{
1187	/*
1188	 * This gets called for each TCP segment that arrives
1189	 * so we want to be efficient.
1190	 * We have 3 drop cases:
1191	 * o No MD5 hash and one expected.
1192	 * o MD5 hash and we're not expecting one.
1193	 * o MD5 hash and it's wrong.
1194	 */
1195	const __u8 *hash_location = NULL;
1196	struct tcp_md5sig_key *hash_expected;
1197	const struct iphdr *iph = ip_hdr(skb);
1198	const struct tcphdr *th = tcp_hdr(skb);
1199	int genhash;
1200	unsigned char newhash[16];
1201
1202	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1203					  AF_INET);
1204	hash_location = tcp_parse_md5sig_option(th);
1205
1206	/* We've parsed the options - do we have a hash? */
1207	if (!hash_expected && !hash_location)
1208		return 0;
1209
1210	if (hash_expected && !hash_location) {
1211		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1212		return 1;
1213	}
1214
1215	if (!hash_expected && hash_location) {
1216		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1217		return 1;
1218	}
1219
1220	/* Okay, we have both hash_expected and hash_location -
1221	 * so we need to calculate the hash and compare.
1222	 */
1223	genhash = tcp_v4_md5_hash_skb(newhash,
1224				      hash_expected,
1225				      NULL, NULL, skb);
1226
1227	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1228		if (net_ratelimit()) {
1229			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1230			       &iph->saddr, ntohs(th->source),
1231			       &iph->daddr, ntohs(th->dest),
1232			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1233		}
1234		return 1;
1235	}
1236	return 0;
1237}
1238
1239#endif
1240
1241struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1242	.family		=	PF_INET,
1243	.obj_size	=	sizeof(struct tcp_request_sock),
1244	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1245	.send_ack	=	tcp_v4_reqsk_send_ack,
1246	.destructor	=	tcp_v4_reqsk_destructor,
1247	.send_reset	=	tcp_v4_send_reset,
1248	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1249};
1250
1251#ifdef CONFIG_TCP_MD5SIG
1252static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1253	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1254	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1255};
1256#endif
1257
1258int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1259{
1260	struct tcp_extend_values tmp_ext;
1261	struct tcp_options_received tmp_opt;
1262	const u8 *hash_location;
1263	struct request_sock *req;
1264	struct inet_request_sock *ireq;
1265	struct tcp_sock *tp = tcp_sk(sk);
1266	struct dst_entry *dst = NULL;
1267	__be32 saddr = ip_hdr(skb)->saddr;
1268	__be32 daddr = ip_hdr(skb)->daddr;
1269	__u32 isn = TCP_SKB_CB(skb)->when;
1270	int want_cookie = 0;
1271
1272	/* Never answer SYNs sent to broadcast or multicast addresses */
1273	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1274		goto drop;
1275
1276	/* TW buckets are converted to open requests without
1277	 * limitation: they conserve resources and the peer is
1278	 * evidently a real one.
1279	 */
1280	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1281		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1282		if (!want_cookie)
1283			goto drop;
1284	}
1285
1286	/* The accept backlog is full. If we have already queued enough
1287	 * warm entries in the syn queue, drop the request. That is better than
1288	 * clogging the syn queue with openreqs with exponentially increasing
1289	 * timeout.
1290	 */
1291	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1292		goto drop;
1293
1294	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1295	if (!req)
1296		goto drop;
1297
1298#ifdef CONFIG_TCP_MD5SIG
1299	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1300#endif
1301
1302	tcp_clear_options(&tmp_opt);
1303	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1304	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1305	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1306
1307	if (tmp_opt.cookie_plus > 0 &&
1308	    tmp_opt.saw_tstamp &&
1309	    !tp->rx_opt.cookie_out_never &&
1310	    (sysctl_tcp_cookie_size > 0 ||
1311	     (tp->cookie_values != NULL &&
1312	      tp->cookie_values->cookie_desired > 0))) {
1313		u8 *c;
1314		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1315		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1316
1317		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1318			goto drop_and_release;
1319
1320		/* Secret recipe starts with IP addresses */
1321		*mess++ ^= (__force u32)daddr;
1322		*mess++ ^= (__force u32)saddr;
1323
1324		/* plus variable length Initiator Cookie */
1325		c = (u8 *)mess;
1326		while (l-- > 0)
1327			*c++ ^= *hash_location++;
1328
1329		want_cookie = 0;	/* not our kind of cookie */
1330		tmp_ext.cookie_out_never = 0; /* false */
1331		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1332	} else if (!tp->rx_opt.cookie_in_always) {
1333		/* redundant indications, but ensure initialization. */
1334		tmp_ext.cookie_out_never = 1; /* true */
1335		tmp_ext.cookie_plus = 0;
1336	} else {
1337		goto drop_and_release;
1338	}
1339	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1340
1341	if (want_cookie && !tmp_opt.saw_tstamp)
1342		tcp_clear_options(&tmp_opt);
1343
1344	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1345	tcp_openreq_init(req, &tmp_opt, skb);
1346
1347	ireq = inet_rsk(req);
1348	ireq->loc_addr = daddr;
1349	ireq->rmt_addr = saddr;
1350	ireq->no_srccheck = inet_sk(sk)->transparent;
1351	ireq->opt = tcp_v4_save_options(sk, skb);
1352
1353	if (security_inet_conn_request(sk, skb, req))
1354		goto drop_and_free;
1355
1356	if (!want_cookie || tmp_opt.tstamp_ok)
1357		TCP_ECN_create_request(req, tcp_hdr(skb));
1358
1359	if (want_cookie) {
1360		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1361		req->cookie_ts = tmp_opt.tstamp_ok;
1362	} else if (!isn) {
1363		struct inet_peer *peer = NULL;
1364		struct flowi4 fl4;
1365
1366		/* VJ's idea. We save last timestamp seen
1367		 * from the destination in peer table, when entering
1368		 * state TIME-WAIT, and check against it before
1369		 * accepting new connection request.
1370		 *
1371		 * If "isn" is not zero, this request hit an alive
1372		 * timewait bucket, so that all the necessary checks
1373		 * are made in the function processing timewait state.
1374		 */
1375		if (tmp_opt.saw_tstamp &&
1376		    tcp_death_row.sysctl_tw_recycle &&
1377		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1378		    fl4.daddr == saddr &&
1379		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1380			inet_peer_refcheck(peer);
1381			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1382			    (s32)(peer->tcp_ts - req->ts_recent) >
1383							TCP_PAWS_WINDOW) {
1384				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1385				goto drop_and_release;
1386			}
1387		}
1388		/* Kill the following clause, if you dislike this way. */
1389		else if (!sysctl_tcp_syncookies &&
1390			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1391			  (sysctl_max_syn_backlog >> 2)) &&
1392			 (!peer || !peer->tcp_ts_stamp) &&
1393			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1394			/* Without syncookies the last quarter of the
1395			 * backlog is reserved for destinations
1396			 * proven to be alive.
1397			 * It means that we continue to communicate only
1398			 * with destinations already remembered
1399			 * by the moment of the synflood.
1400			 */
1401			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1402				       &saddr, ntohs(tcp_hdr(skb)->source));
1403			goto drop_and_release;
1404		}
1405
1406		isn = tcp_v4_init_sequence(skb);
1407	}
1408	tcp_rsk(req)->snt_isn = isn;
1409	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1410
1411	if (tcp_v4_send_synack(sk, dst, req,
1412			       (struct request_values *)&tmp_ext) ||
1413	    want_cookie)
1414		goto drop_and_free;
1415
1416	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1417	return 0;
1418
1419drop_and_release:
1420	dst_release(dst);
1421drop_and_free:
1422	reqsk_free(req);
1423drop:
1424	return 0;
1425}
1426EXPORT_SYMBOL(tcp_v4_conn_request);
1427
1428
1429/*
1430 * The three way handshake has completed - we got a valid ACK -
1431 * now create the new socket.
1432 */
1433struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1434				  struct request_sock *req,
1435				  struct dst_entry *dst)
1436{
1437	struct inet_request_sock *ireq;
1438	struct inet_sock *newinet;
1439	struct tcp_sock *newtp;
1440	struct sock *newsk;
1441#ifdef CONFIG_TCP_MD5SIG
1442	struct tcp_md5sig_key *key;
1443#endif
1444	struct ip_options_rcu *inet_opt;
1445
1446	if (sk_acceptq_is_full(sk))
1447		goto exit_overflow;
1448
1449	newsk = tcp_create_openreq_child(sk, req, skb);
1450	if (!newsk)
1451		goto exit_nonewsk;
1452
1453	newsk->sk_gso_type = SKB_GSO_TCPV4;
1454
1455	newtp		      = tcp_sk(newsk);
1456	newinet		      = inet_sk(newsk);
1457	ireq		      = inet_rsk(req);
1458	newinet->inet_daddr   = ireq->rmt_addr;
1459	newinet->inet_rcv_saddr = ireq->loc_addr;
1460	newinet->inet_saddr	      = ireq->loc_addr;
1461	inet_opt	      = ireq->opt;
1462	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1463	ireq->opt	      = NULL;
1464	newinet->mc_index     = inet_iif(skb);
1465	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1466	newinet->rcv_tos      = ip_hdr(skb)->tos;
1467	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1468	if (inet_opt)
1469		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1470	newinet->inet_id = newtp->write_seq ^ jiffies;
1471
1472	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1473		goto put_and_exit;
1474
1475	sk_setup_caps(newsk, dst);
1476
1477	tcp_mtup_init(newsk);
1478	tcp_sync_mss(newsk, dst_mtu(dst));
1479	newtp->advmss = dst_metric_advmss(dst);
1480	if (tcp_sk(sk)->rx_opt.user_mss &&
1481	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1482		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1483
1484	tcp_initialize_rcv_mss(newsk);
1485	if (tcp_rsk(req)->snt_synack)
1486		tcp_valid_rtt_meas(newsk,
1487		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1488	newtp->total_retrans = req->retrans;
1489
1490#ifdef CONFIG_TCP_MD5SIG
1491	/* Copy over the MD5 key from the original socket */
1492	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1493				AF_INET);
1494	if (key != NULL) {
1495		/*
1496		 * We're using one, so create a matching key
1497		 * on the newsk structure. If we fail to get
1498		 * memory, then we end up not copying the key
1499		 * across. Shucks.
1500		 */
1501		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1502			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1503		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1504	}
1505#endif
1506
1507	if (__inet_inherit_port(sk, newsk) < 0)
1508		goto put_and_exit;
1509	__inet_hash_nolisten(newsk, NULL);
1510
1511	return newsk;
1512
1513exit_overflow:
1514	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1515exit_nonewsk:
1516	dst_release(dst);
1517exit:
1518	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1519	return NULL;
1520put_and_exit:
1521	tcp_clear_xmit_timers(newsk);
1522	tcp_cleanup_congestion_control(newsk);
1523	bh_unlock_sock(newsk);
1524	sock_put(newsk);
1525	goto exit;
1526}
1527EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1528
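/* Handle a segment that arrived on a listening socket: first try to
 * match a pending connection request and let tcp_check_req() finish the
 * handshake, then look for an already established child socket, and
 * finally, if syncookies are compiled in and this is not a SYN, try to
 * validate it as a cookie ACK.
 */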
1529static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1530{
1531	struct tcphdr *th = tcp_hdr(skb);
1532	const struct iphdr *iph = ip_hdr(skb);
1533	struct sock *nsk;
1534	struct request_sock **prev;
1535	/* Find possible connection requests. */
1536	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1537						       iph->saddr, iph->daddr);
1538	if (req)
1539		return tcp_check_req(sk, skb, req, prev);
1540
1541	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1542			th->source, iph->daddr, th->dest, inet_iif(skb));
1543
1544	if (nsk) {
1545		if (nsk->sk_state != TCP_TIME_WAIT) {
1546			bh_lock_sock(nsk);
1547			return nsk;
1548		}
1549		inet_twsk_put(inet_twsk(nsk));
1550		return NULL;
1551	}
1552
1553#ifdef CONFIG_SYN_COOKIES
1554	if (!th->syn)
1555		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1556#endif
1557	return sk;
1558}
1559
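/* Validate the TCP checksum on receive.  Hardware-verified sums are
 * accepted as-is; otherwise the pseudo-header sum is set up and short
 * segments (<= 76 bytes) are verified immediately, while longer ones are
 * checked later when the data is copied.
 */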
1560static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1561{
1562	const struct iphdr *iph = ip_hdr(skb);
1563
1564	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1565		if (!tcp_v4_check(skb->len, iph->saddr,
1566				  iph->daddr, skb->csum)) {
1567			skb->ip_summed = CHECKSUM_UNNECESSARY;
1568			return 0;
1569		}
1570	}
1571
1572	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1573				       skb->len, IPPROTO_TCP, 0);
1574
1575	if (skb->len <= 76) {
1576		return __skb_checksum_complete(skb);
1577	}
1578	return 0;
1579}
1580
1581
1582/* The socket must have its spinlock held when we get
1583 * here.
1584 *
1585 * We have a potential double-lock case here, so even when
1586 * doing backlog processing we use the BH locking scheme.
1587 * This is because we cannot sleep with the original spinlock
1588 * held.
1589 */
1590int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1591{
1592	struct sock *rsk;
1593#ifdef CONFIG_TCP_MD5SIG
1594	/*
1595	 * We really want to reject the packet as early as possible
1596	 * if:
1597	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1598	 *  o There is an MD5 option and we're not expecting one
1599	 */
1600	if (tcp_v4_inbound_md5_hash(sk, skb))
1601		goto discard;
1602#endif
1603
1604	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1605		sock_rps_save_rxhash(sk, skb);
1606		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1607			rsk = sk;
1608			goto reset;
1609		}
1610		return 0;
1611	}
1612
1613	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1614		goto csum_err;
1615
1616	if (sk->sk_state == TCP_LISTEN) {
1617		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1618		if (!nsk)
1619			goto discard;
1620
1621		if (nsk != sk) {
1622			sock_rps_save_rxhash(nsk, skb);
1623			if (tcp_child_process(sk, nsk, skb)) {
1624				rsk = nsk;
1625				goto reset;
1626			}
1627			return 0;
1628		}
1629	} else
1630		sock_rps_save_rxhash(sk, skb);
1631
1632	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1633		rsk = sk;
1634		goto reset;
1635	}
1636	return 0;
1637
1638reset:
1639	tcp_v4_send_reset(rsk, skb);
1640discard:
1641	kfree_skb(skb);
1642	/* Be careful here. If this function gets more complicated and
1643	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1644	 * might be destroyed here. This current version compiles correctly,
1645	 * but you have been warned.
1646	 */
1647	return 0;
1648
1649csum_err:
1650	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1651	goto discard;
1652}
1653EXPORT_SYMBOL(tcp_v4_do_rcv);
1654
1655/*
1656 *	From tcp_input.c
1657 */
1658
1659int tcp_v4_rcv(struct sk_buff *skb)
1660{
1661	const struct iphdr *iph;
1662	const struct tcphdr *th;
1663	struct sock *sk;
1664	int ret;
1665	struct net *net = dev_net(skb->dev);
1666
1667	if (skb->pkt_type != PACKET_HOST)
1668		goto discard_it;
1669
1670	/* Count it even if it's bad */
1671	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1672
1673	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1674		goto discard_it;
1675
1676	th = tcp_hdr(skb);
1677
1678	if (th->doff < sizeof(struct tcphdr) / 4)
1679		goto bad_packet;
1680	if (!pskb_may_pull(skb, th->doff * 4))
1681		goto discard_it;
1682
1683	/* An explanation is required here, I think.
1684	 * Packet length and doff are validated by header prediction,
1685	 * provided the case of th->doff == 0 is eliminated.
1686	 * So, we defer the checks. */
1687	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1688		goto bad_packet;
1689
1690	th = tcp_hdr(skb);
1691	iph = ip_hdr(skb);
1692	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1693	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1694				    skb->len - th->doff * 4);
1695	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1696	TCP_SKB_CB(skb)->when	 = 0;
1697	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1698	TCP_SKB_CB(skb)->sacked	 = 0;
1699
1700	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1701	if (!sk)
1702		goto no_tcp_socket;
1703
1704process:
1705	if (sk->sk_state == TCP_TIME_WAIT)
1706		goto do_time_wait;
1707
1708	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1709		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1710		goto discard_and_relse;
1711	}
1712
1713	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1714		goto discard_and_relse;
1715	nf_reset(skb);
1716
1717	if (sk_filter(sk, skb))
1718		goto discard_and_relse;
1719
1720	skb->dev = NULL;
1721
1722	bh_lock_sock_nested(sk);
1723	ret = 0;
1724	if (!sock_owned_by_user(sk)) {
1725#ifdef CONFIG_NET_DMA
1726		struct tcp_sock *tp = tcp_sk(sk);
1727		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1728			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1729		if (tp->ucopy.dma_chan)
1730			ret = tcp_v4_do_rcv(sk, skb);
1731		else
1732#endif
1733		{
1734			if (!tcp_prequeue(sk, skb))
1735				ret = tcp_v4_do_rcv(sk, skb);
1736		}
1737	} else if (unlikely(sk_add_backlog(sk, skb))) {
1738		bh_unlock_sock(sk);
1739		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1740		goto discard_and_relse;
1741	}
1742	bh_unlock_sock(sk);
1743
1744	sock_put(sk);
1745
1746	return ret;
1747
1748no_tcp_socket:
1749	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1750		goto discard_it;
1751
1752	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1753bad_packet:
1754		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1755	} else {
1756		tcp_v4_send_reset(NULL, skb);
1757	}
1758
1759discard_it:
1760	/* Discard frame. */
1761	kfree_skb(skb);
1762	return 0;
1763
1764discard_and_relse:
1765	sock_put(sk);
1766	goto discard_it;
1767
1768do_time_wait:
1769	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1770		inet_twsk_put(inet_twsk(sk));
1771		goto discard_it;
1772	}
1773
1774	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1775		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1776		inet_twsk_put(inet_twsk(sk));
1777		goto discard_it;
1778	}
1779	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1780	case TCP_TW_SYN: {
1781		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1782							&tcp_hashinfo,
1783							iph->daddr, th->dest,
1784							inet_iif(skb));
1785		if (sk2) {
1786			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1787			inet_twsk_put(inet_twsk(sk));
1788			sk = sk2;
1789			goto process;
1790		}
1791		/* Fall through to ACK */
1792	}
1793	case TCP_TW_ACK:
1794		tcp_v4_timewait_ack(sk, skb);
1795		break;
1796	case TCP_TW_RST:
1797		goto no_tcp_socket;
1798	case TCP_TW_SUCCESS:;
1799	}
1800	goto discard_it;
1801}
1802
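/* Return the inet_peer entry for this connection's destination,
 * preferring the one cached on the route.  *release_it tells the caller
 * whether it must drop the reference it was given.
 */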
1803struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1804{
1805	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1806	struct inet_sock *inet = inet_sk(sk);
1807	struct inet_peer *peer;
1808
1809	if (!rt ||
1810	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1811		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1812		*release_it = true;
1813	} else {
1814		if (!rt->peer)
1815			rt_bind_peer(rt, inet->inet_daddr, 1);
1816		peer = rt->peer;
1817		*release_it = false;
1818	}
1819
1820	return peer;
1821}
1822EXPORT_SYMBOL(tcp_v4_get_peer);
1823
1824void *tcp_v4_tw_get_peer(struct sock *sk)
1825{
1826	const struct inet_timewait_sock *tw = inet_twsk(sk);
1827
1828	return inet_getpeer_v4(tw->tw_daddr, 1);
1829}
1830EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1831
1832static struct timewait_sock_ops tcp_timewait_sock_ops = {
1833	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1834	.twsk_unique	= tcp_twsk_unique,
1835	.twsk_destructor= tcp_twsk_destructor,
1836	.twsk_getpeer	= tcp_v4_tw_get_peer,
1837};
1838
1839const struct inet_connection_sock_af_ops ipv4_specific = {
1840	.queue_xmit	   = ip_queue_xmit,
1841	.send_check	   = tcp_v4_send_check,
1842	.rebuild_header	   = inet_sk_rebuild_header,
1843	.conn_request	   = tcp_v4_conn_request,
1844	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1845	.get_peer	   = tcp_v4_get_peer,
1846	.net_header_len	   = sizeof(struct iphdr),
1847	.setsockopt	   = ip_setsockopt,
1848	.getsockopt	   = ip_getsockopt,
1849	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1850	.sockaddr_len	   = sizeof(struct sockaddr_in),
1851	.bind_conflict	   = inet_csk_bind_conflict,
1852#ifdef CONFIG_COMPAT
1853	.compat_setsockopt = compat_ip_setsockopt,
1854	.compat_getsockopt = compat_ip_getsockopt,
1855#endif
1856};
1857EXPORT_SYMBOL(ipv4_specific);
1858
1859#ifdef CONFIG_TCP_MD5SIG
1860static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1861	.md5_lookup		= tcp_v4_md5_lookup,
1862	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1863	.md5_parse		= tcp_v4_parse_md5_keys,
1864};
1865#endif
1866
1867/* NOTE: A lot of things are set to zero explicitly by the call to
1868 *       sk_alloc(), so they need not be done here.
1869 */
1870static int tcp_v4_init_sock(struct sock *sk)
1871{
1872	struct inet_connection_sock *icsk = inet_csk(sk);
1873	struct tcp_sock *tp = tcp_sk(sk);
1874
1875	skb_queue_head_init(&tp->out_of_order_queue);
1876	tcp_init_xmit_timers(sk);
1877	tcp_prequeue_init(tp);
1878
1879	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1880	tp->mdev = TCP_TIMEOUT_INIT;
1881
1882	/* So many TCP implementations out there (incorrectly) count the
1883	 * initial SYN frame in their delayed-ACK and congestion control
1884	 * algorithms that we must have the following bandaid to talk
1885	 * efficiently to them.  -DaveM
1886	 */
1887	tp->snd_cwnd = TCP_INIT_CWND;
1888
1889	/* See draft-stevens-tcpca-spec-01 for discussion of the
1890	 * initialization of these values.
1891	 */
1892	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1893	tp->snd_cwnd_clamp = ~0;
1894	tp->mss_cache = TCP_MSS_DEFAULT;
1895
1896	tp->reordering = sysctl_tcp_reordering;
1897	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1898
1899	sk->sk_state = TCP_CLOSE;
1900
1901	sk->sk_write_space = sk_stream_write_space;
1902	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1903
1904	icsk->icsk_af_ops = &ipv4_specific;
1905	icsk->icsk_sync_mss = tcp_sync_mss;
1906#ifdef CONFIG_TCP_MD5SIG
1907	tp->af_specific = &tcp_sock_ipv4_specific;
1908#endif
1909
1910	/* TCP Cookie Transactions */
1911	if (sysctl_tcp_cookie_size > 0) {
1912		/* Default, cookies without s_data_payload. */
1913		tp->cookie_values =
1914			kzalloc(sizeof(*tp->cookie_values),
1915				sk->sk_allocation);
1916		if (tp->cookie_values != NULL)
1917			kref_init(&tp->cookie_values->kref);
1918	}
1919	/* Presumed zeroed, in order of appearance:
1920	 *	cookie_in_always, cookie_out_never,
1921	 *	s_data_constant, s_data_in, s_data_out
1922	 */
1923	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1924	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1925
1926	local_bh_disable();
1927	sock_update_memcg(sk);
1928	sk_sockets_allocated_inc(sk);
1929	local_bh_enable();
1930
1931	return 0;
1932}
1933
1934void tcp_v4_destroy_sock(struct sock *sk)
1935{
1936	struct tcp_sock *tp = tcp_sk(sk);
1937
1938	tcp_clear_xmit_timers(sk);
1939
1940	tcp_cleanup_congestion_control(sk);
1941
1942	/* Clean up the write buffer. */
1943	tcp_write_queue_purge(sk);
1944
1945	/* Cleans up our, hopefully empty, out_of_order_queue. */
1946	__skb_queue_purge(&tp->out_of_order_queue);
1947
1948#ifdef CONFIG_TCP_MD5SIG
1949	/* Clean up the MD5 key list, if any */
1950	if (tp->md5sig_info) {
1951		tcp_clear_md5_list(sk);
1952		kfree_rcu(tp->md5sig_info, rcu);
1953		tp->md5sig_info = NULL;
1954	}
1955#endif
1956
1957#ifdef CONFIG_NET_DMA
1958	/* Cleans up our sk_async_wait_queue */
1959	__skb_queue_purge(&sk->sk_async_wait_queue);
1960#endif
1961
1962	/* Clean up the prequeue; it should already be empty. */
1963	__skb_queue_purge(&tp->ucopy.prequeue);
1964
1965	/* Clean up a referenced TCP bind bucket. */
1966	if (inet_csk(sk)->icsk_bind_hash)
1967		inet_put_port(sk);
1968
1969	/*
1970	 * If a cached sendmsg() page exists, free it.
1971	 */
1972	if (sk->sk_sndmsg_page) {
1973		__free_page(sk->sk_sndmsg_page);
1974		sk->sk_sndmsg_page = NULL;
1975	}
1976
1977	/* TCP Cookie Transactions */
1978	if (tp->cookie_values != NULL) {
1979		kref_put(&tp->cookie_values->kref,
1980			 tcp_cookie_values_release);
1981		tp->cookie_values = NULL;
1982	}
1983
1984	sk_sockets_allocated_dec(sk);
1985	sock_release_memcg(sk);
1986}
1987EXPORT_SYMBOL(tcp_v4_destroy_sock);
1988
1989#ifdef CONFIG_PROC_FS
1990/* Proc filesystem TCP sock list dumping. */
1991
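/*
 * Helpers for walking the nulls-terminated TIME_WAIT chain (twchain) of an
 * ehash bucket.
 */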
1992static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1993{
1994	return hlist_nulls_empty(head) ? NULL :
1995		list_entry(head->first, struct inet_timewait_sock, tw_node);
1996}
1997
1998static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1999{
2000	return !is_a_nulls(tw->tw_node.next) ?
2001		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2002}
2003
2004/*
2005 * Get the next listener socket following cur.  If cur is NULL, get the first
2006 * socket starting from the bucket given in st->bucket; when st->bucket is
2007 * zero the very first socket in the hash table is returned.
2008 */
2009static void *listening_get_next(struct seq_file *seq, void *cur)
2010{
2011	struct inet_connection_sock *icsk;
2012	struct hlist_nulls_node *node;
2013	struct sock *sk = cur;
2014	struct inet_listen_hashbucket *ilb;
2015	struct tcp_iter_state *st = seq->private;
2016	struct net *net = seq_file_net(seq);
2017
2018	if (!sk) {
2019		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2020		spin_lock_bh(&ilb->lock);
2021		sk = sk_nulls_head(&ilb->head);
2022		st->offset = 0;
2023		goto get_sk;
2024	}
2025	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2026	++st->num;
2027	++st->offset;
2028
2029	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2030		struct request_sock *req = cur;
2031
2032		icsk = inet_csk(st->syn_wait_sk);
2033		req = req->dl_next;
2034		while (1) {
2035			while (req) {
2036				if (req->rsk_ops->family == st->family) {
2037					cur = req;
2038					goto out;
2039				}
2040				req = req->dl_next;
2041			}
2042			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2043				break;
2044get_req:
2045			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2046		}
2047		sk	  = sk_nulls_next(st->syn_wait_sk);
2048		st->state = TCP_SEQ_STATE_LISTENING;
2049		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050	} else {
2051		icsk = inet_csk(sk);
2052		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2053		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2054			goto start_req;
2055		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2056		sk = sk_nulls_next(sk);
2057	}
2058get_sk:
2059	sk_nulls_for_each_from(sk, node) {
2060		if (!net_eq(sock_net(sk), net))
2061			continue;
2062		if (sk->sk_family == st->family) {
2063			cur = sk;
2064			goto out;
2065		}
2066		icsk = inet_csk(sk);
2067		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2068		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2069start_req:
2070			st->uid		= sock_i_uid(sk);
2071			st->syn_wait_sk = sk;
2072			st->state	= TCP_SEQ_STATE_OPENREQ;
2073			st->sbucket	= 0;
2074			goto get_req;
2075		}
2076		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2077	}
2078	spin_unlock_bh(&ilb->lock);
2079	st->offset = 0;
2080	if (++st->bucket < INET_LHTABLE_SIZE) {
2081		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2082		spin_lock_bh(&ilb->lock);
2083		sk = sk_nulls_head(&ilb->head);
2084		goto get_sk;
2085	}
2086	cur = NULL;
2087out:
2088	return cur;
2089}
2090
2091static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2092{
2093	struct tcp_iter_state *st = seq->private;
2094	void *rc;
2095
2096	st->bucket = 0;
2097	st->offset = 0;
2098	rc = listening_get_next(seq, NULL);
2099
2100	while (rc && *pos) {
2101		rc = listening_get_next(seq, rc);
2102		--*pos;
2103	}
2104	return rc;
2105}
2106
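/*
 * True when both the established chain and the TIME_WAIT chain of the
 * current ehash bucket are empty.
 */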
2107static inline int empty_bucket(struct tcp_iter_state *st)
2108{
2109	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2110		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2111}
2112
2113/*
2114 * Get the first established socket starting from the bucket given in st->bucket.
2115 * If st->bucket is zero, the very first socket in the hash is returned.
2116 */
2117static void *established_get_first(struct seq_file *seq)
2118{
2119	struct tcp_iter_state *st = seq->private;
2120	struct net *net = seq_file_net(seq);
2121	void *rc = NULL;
2122
2123	st->offset = 0;
2124	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2125		struct sock *sk;
2126		struct hlist_nulls_node *node;
2127		struct inet_timewait_sock *tw;
2128		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2129
2130		/* Lockless fast path for the common case of empty buckets */
2131		if (empty_bucket(st))
2132			continue;
2133
2134		spin_lock_bh(lock);
2135		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2136			if (sk->sk_family != st->family ||
2137			    !net_eq(sock_net(sk), net)) {
2138				continue;
2139			}
2140			rc = sk;
2141			goto out;
2142		}
2143		st->state = TCP_SEQ_STATE_TIME_WAIT;
2144		inet_twsk_for_each(tw, node,
2145				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2146			if (tw->tw_family != st->family ||
2147			    !net_eq(twsk_net(tw), net)) {
2148				continue;
2149			}
2150			rc = tw;
2151			goto out;
2152		}
2153		spin_unlock_bh(lock);
2154		st->state = TCP_SEQ_STATE_ESTABLISHED;
2155	}
2156out:
2157	return rc;
2158}
2159
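/*
 * Get the next established or TIME_WAIT socket after cur, advancing to the
 * next non-empty ehash bucket when the current one is exhausted.
 */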
2160static void *established_get_next(struct seq_file *seq, void *cur)
2161{
2162	struct sock *sk = cur;
2163	struct inet_timewait_sock *tw;
2164	struct hlist_nulls_node *node;
2165	struct tcp_iter_state *st = seq->private;
2166	struct net *net = seq_file_net(seq);
2167
2168	++st->num;
2169	++st->offset;
2170
2171	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2172		tw = cur;
2173		tw = tw_next(tw);
2174get_tw:
2175		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2176			tw = tw_next(tw);
2177		}
2178		if (tw) {
2179			cur = tw;
2180			goto out;
2181		}
2182		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183		st->state = TCP_SEQ_STATE_ESTABLISHED;
2184
2185		/* Look for the next non-empty bucket */
2186		st->offset = 0;
2187		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2188				empty_bucket(st))
2189			;
2190		if (st->bucket > tcp_hashinfo.ehash_mask)
2191			return NULL;
2192
2193		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2194		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2195	} else
2196		sk = sk_nulls_next(sk);
2197
2198	sk_nulls_for_each_from(sk, node) {
2199		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2200			goto found;
2201	}
2202
2203	st->state = TCP_SEQ_STATE_TIME_WAIT;
2204	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2205	goto get_tw;
2206found:
2207	cur = sk;
2208out:
2209	return cur;
2210}
2211
2212static void *established_get_idx(struct seq_file *seq, loff_t pos)
2213{
2214	struct tcp_iter_state *st = seq->private;
2215	void *rc;
2216
2217	st->bucket = 0;
2218	rc = established_get_first(seq);
2219
2220	while (rc && pos) {
2221		rc = established_get_next(seq, rc);
2222		--pos;
2223	}
2224	return rc;
2225}
2226
2227static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2228{
2229	void *rc;
2230	struct tcp_iter_state *st = seq->private;
2231
2232	st->state = TCP_SEQ_STATE_LISTENING;
2233	rc	  = listening_get_idx(seq, &pos);
2234
2235	if (!rc) {
2236		st->state = TCP_SEQ_STATE_ESTABLISHED;
2237		rc	  = established_get_idx(seq, pos);
2238	}
2239
2240	return rc;
2241}
2242
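/*
 * Resume iteration at the bucket/offset remembered in the iterator state so
 * that successive reads of the seq_file need not rescan from the beginning.
 */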
2243static void *tcp_seek_last_pos(struct seq_file *seq)
2244{
2245	struct tcp_iter_state *st = seq->private;
2246	int offset = st->offset;
2247	int orig_num = st->num;
2248	void *rc = NULL;
2249
2250	switch (st->state) {
2251	case TCP_SEQ_STATE_OPENREQ:
2252	case TCP_SEQ_STATE_LISTENING:
2253		if (st->bucket >= INET_LHTABLE_SIZE)
2254			break;
2255		st->state = TCP_SEQ_STATE_LISTENING;
2256		rc = listening_get_next(seq, NULL);
2257		while (offset-- && rc)
2258			rc = listening_get_next(seq, rc);
2259		if (rc)
2260			break;
2261		st->bucket = 0;
2262		/* Fallthrough */
2263	case TCP_SEQ_STATE_ESTABLISHED:
2264	case TCP_SEQ_STATE_TIME_WAIT:
2265		st->state = TCP_SEQ_STATE_ESTABLISHED;
2266		if (st->bucket > tcp_hashinfo.ehash_mask)
2267			break;
2268		rc = established_get_first(seq);
2269		while (offset-- && rc)
2270			rc = established_get_next(seq, rc);
2271	}
2272
2273	st->num = orig_num;
2274
2275	return rc;
2276}
2277
2278static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2279{
2280	struct tcp_iter_state *st = seq->private;
2281	void *rc;
2282
2283	if (*pos && *pos == st->last_pos) {
2284		rc = tcp_seek_last_pos(seq);
2285		if (rc)
2286			goto out;
2287	}
2288
2289	st->state = TCP_SEQ_STATE_LISTENING;
2290	st->num = 0;
2291	st->bucket = 0;
2292	st->offset = 0;
2293	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2294
2295out:
2296	st->last_pos = *pos;
2297	return rc;
2298}
2299
2300static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2301{
2302	struct tcp_iter_state *st = seq->private;
2303	void *rc = NULL;
2304
2305	if (v == SEQ_START_TOKEN) {
2306		rc = tcp_get_idx(seq, 0);
2307		goto out;
2308	}
2309
2310	switch (st->state) {
2311	case TCP_SEQ_STATE_OPENREQ:
2312	case TCP_SEQ_STATE_LISTENING:
2313		rc = listening_get_next(seq, v);
2314		if (!rc) {
2315			st->state = TCP_SEQ_STATE_ESTABLISHED;
2316			st->bucket = 0;
2317			st->offset = 0;
2318			rc	  = established_get_first(seq);
2319		}
2320		break;
2321	case TCP_SEQ_STATE_ESTABLISHED:
2322	case TCP_SEQ_STATE_TIME_WAIT:
2323		rc = established_get_next(seq, v);
2324		break;
2325	}
2326out:
2327	++*pos;
2328	st->last_pos = *pos;
2329	return rc;
2330}
2331
2332static void tcp_seq_stop(struct seq_file *seq, void *v)
2333{
2334	struct tcp_iter_state *st = seq->private;
2335
2336	switch (st->state) {
2337	case TCP_SEQ_STATE_OPENREQ:
2338		if (v) {
2339			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2340			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2341		}
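		/* Fall through: the listening hash bucket lock is held as well */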
2342	case TCP_SEQ_STATE_LISTENING:
2343		if (v != SEQ_START_TOKEN)
2344			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2345		break;
2346	case TCP_SEQ_STATE_TIME_WAIT:
2347	case TCP_SEQ_STATE_ESTABLISHED:
2348		if (v)
2349			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2350		break;
2351	}
2352}
2353
2354int tcp_seq_open(struct inode *inode, struct file *file)
2355{
2356	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2357	struct tcp_iter_state *s;
2358	int err;
2359
2360	err = seq_open_net(inode, file, &afinfo->seq_ops,
2361			  sizeof(struct tcp_iter_state));
2362	if (err < 0)
2363		return err;
2364
2365	s = ((struct seq_file *)file->private_data)->private;
2366	s->family		= afinfo->family;
2367	s->last_pos 		= 0;
2368	return 0;
2369}
2370EXPORT_SYMBOL(tcp_seq_open);
2371
2372int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2373{
2374	int rc = 0;
2375	struct proc_dir_entry *p;
2376
2377	afinfo->seq_ops.start		= tcp_seq_start;
2378	afinfo->seq_ops.next		= tcp_seq_next;
2379	afinfo->seq_ops.stop		= tcp_seq_stop;
2380
2381	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2382			     afinfo->seq_fops, afinfo);
2383	if (!p)
2384		rc = -ENOMEM;
2385	return rc;
2386}
2387EXPORT_SYMBOL(tcp_proc_register);
2388
2389void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2390{
2391	proc_net_remove(net, afinfo->name);
2392}
2393EXPORT_SYMBOL(tcp_proc_unregister);
2394
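/* Format one SYN_RECV (open request) socket as a /proc/net/tcp line. */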
2395static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396			 struct seq_file *f, int i, int uid, int *len)
2397{
2398	const struct inet_request_sock *ireq = inet_rsk(req);
2399	int ttd = req->expires - jiffies;
2400
2401	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2402		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2403		i,
2404		ireq->loc_addr,
2405		ntohs(inet_sk(sk)->inet_sport),
2406		ireq->rmt_addr,
2407		ntohs(ireq->rmt_port),
2408		TCP_SYN_RECV,
2409		0, 0, /* could print option size, but that is af dependent. */
2410		1,    /* timers active (only the expire timer) */
2411		jiffies_to_clock_t(ttd),
2412		req->retrans,
2413		uid,
2414		0,  /* non-standard timer */
2415		0, /* open_requests have no inode */
2416		atomic_read(&sk->sk_refcnt),
2417		req,
2418		len);
2419}
2420
2421static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2422{
2423	int timer_active;
2424	unsigned long timer_expires;
2425	const struct tcp_sock *tp = tcp_sk(sk);
2426	const struct inet_connection_sock *icsk = inet_csk(sk);
2427	const struct inet_sock *inet = inet_sk(sk);
2428	__be32 dest = inet->inet_daddr;
2429	__be32 src = inet->inet_rcv_saddr;
2430	__u16 destp = ntohs(inet->inet_dport);
2431	__u16 srcp = ntohs(inet->inet_sport);
2432	int rx_queue;
2433
2434	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2435		timer_active	= 1;
2436		timer_expires	= icsk->icsk_timeout;
2437	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2438		timer_active	= 4;
2439		timer_expires	= icsk->icsk_timeout;
2440	} else if (timer_pending(&sk->sk_timer)) {
2441		timer_active	= 2;
2442		timer_expires	= sk->sk_timer.expires;
2443	} else {
2444		timer_active	= 0;
2445		timer_expires = jiffies;
2446	}
2447
2448	if (sk->sk_state == TCP_LISTEN)
2449		rx_queue = sk->sk_ack_backlog;
2450	else
2451		/*
2452		 * Because we don't lock the socket, we might see a transient negative value.
2453		 */
2454		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2455
2456	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2457			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2458		i, src, srcp, dest, destp, sk->sk_state,
2459		tp->write_seq - tp->snd_una,
2460		rx_queue,
2461		timer_active,
2462		jiffies_to_clock_t(timer_expires - jiffies),
2463		icsk->icsk_retransmits,
2464		sock_i_uid(sk),
2465		icsk->icsk_probes_out,
2466		sock_i_ino(sk),
2467		atomic_read(&sk->sk_refcnt), sk,
2468		jiffies_to_clock_t(icsk->icsk_rto),
2469		jiffies_to_clock_t(icsk->icsk_ack.ato),
2470		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471		tp->snd_cwnd,
2472		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2473		len);
2474}
2475
2476static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2477			       struct seq_file *f, int i, int *len)
2478{
2479	__be32 dest, src;
2480	__u16 destp, srcp;
2481	int ttd = tw->tw_ttd - jiffies;
2482
2483	if (ttd < 0)
2484		ttd = 0;
2485
2486	dest  = tw->tw_daddr;
2487	src   = tw->tw_rcv_saddr;
2488	destp = ntohs(tw->tw_dport);
2489	srcp  = ntohs(tw->tw_sport);
2490
2491	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2492		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2493		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2494		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2495		atomic_read(&tw->tw_refcnt), tw, len);
2496}
2497
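/*
 * Each /proc/net/tcp record below is padded to TMPSZ - 1 characters plus a
 * trailing newline.  A made-up LISTEN entry looks roughly like:
 *   0: 0100007F:0050 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff880012345678 100 0 0 10 -1
 */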
2498#define TMPSZ 150
2499
2500static int tcp4_seq_show(struct seq_file *seq, void *v)
2501{
2502	struct tcp_iter_state *st;
2503	int len;
2504
2505	if (v == SEQ_START_TOKEN) {
2506		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2507			   "  sl  local_address rem_address   st tx_queue "
2508			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2509			   "inode");
2510		goto out;
2511	}
2512	st = seq->private;
2513
2514	switch (st->state) {
2515	case TCP_SEQ_STATE_LISTENING:
2516	case TCP_SEQ_STATE_ESTABLISHED:
2517		get_tcp4_sock(v, seq, st->num, &len);
2518		break;
2519	case TCP_SEQ_STATE_OPENREQ:
2520		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2521		break;
2522	case TCP_SEQ_STATE_TIME_WAIT:
2523		get_timewait4_sock(v, seq, st->num, &len);
2524		break;
2525	}
2526	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2527out:
2528	return 0;
2529}
2530
2531static const struct file_operations tcp_afinfo_seq_fops = {
2532	.owner   = THIS_MODULE,
2533	.open    = tcp_seq_open,
2534	.read    = seq_read,
2535	.llseek  = seq_lseek,
2536	.release = seq_release_net
2537};
2538
2539static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2540	.name		= "tcp",
2541	.family		= AF_INET,
2542	.seq_fops	= &tcp_afinfo_seq_fops,
2543	.seq_ops	= {
2544		.show		= tcp4_seq_show,
2545	},
2546};
2547
2548static int __net_init tcp4_proc_init_net(struct net *net)
2549{
2550	return tcp_proc_register(net, &tcp4_seq_afinfo);
2551}
2552
2553static void __net_exit tcp4_proc_exit_net(struct net *net)
2554{
2555	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2556}
2557
2558static struct pernet_operations tcp4_net_ops = {
2559	.init = tcp4_proc_init_net,
2560	.exit = tcp4_proc_exit_net,
2561};
2562
2563int __init tcp4_proc_init(void)
2564{
2565	return register_pernet_subsys(&tcp4_net_ops);
2566}
2567
2568void tcp4_proc_exit(void)
2569{
2570	unregister_pernet_subsys(&tcp4_net_ops);
2571}
2572#endif /* CONFIG_PROC_FS */
2573
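/*
 * GRO receive hook for IPv4 TCP: verify the checksum when the NIC supplied a
 * full one (CHECKSUM_COMPLETE), refuse to aggregate segments whose checksum
 * cannot be verified, and hand everything else to the generic TCP GRO code.
 */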
2574struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2575{
2576	const struct iphdr *iph = skb_gro_network_header(skb);
2577
2578	switch (skb->ip_summed) {
2579	case CHECKSUM_COMPLETE:
2580		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2581				  skb->csum)) {
2582			skb->ip_summed = CHECKSUM_UNNECESSARY;
2583			break;
2584		}
2585
2586		/* fall through */
2587	case CHECKSUM_NONE:
2588		NAPI_GRO_CB(skb)->flush = 1;
2589		return NULL;
2590	}
2591
2592	return tcp_gro_receive(head, skb);
2593}
2594
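/*
 * GRO completion hook: seed th->check with the pseudo-header checksum of the
 * merged packet and mark it as TCPv4 GSO before the generic completion step.
 */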
2595int tcp4_gro_complete(struct sk_buff *skb)
2596{
2597	const struct iphdr *iph = ip_hdr(skb);
2598	struct tcphdr *th = tcp_hdr(skb);
2599
2600	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2601				  iph->saddr, iph->daddr, 0);
2602	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2603
2604	return tcp_gro_complete(skb);
2605}
2606
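/* Protocol method table for IPv4 TCP sockets; registered by the AF_INET init code. */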
2607struct proto tcp_prot = {
2608	.name			= "TCP",
2609	.owner			= THIS_MODULE,
2610	.close			= tcp_close,
2611	.connect		= tcp_v4_connect,
2612	.disconnect		= tcp_disconnect,
2613	.accept			= inet_csk_accept,
2614	.ioctl			= tcp_ioctl,
2615	.init			= tcp_v4_init_sock,
2616	.destroy		= tcp_v4_destroy_sock,
2617	.shutdown		= tcp_shutdown,
2618	.setsockopt		= tcp_setsockopt,
2619	.getsockopt		= tcp_getsockopt,
2620	.recvmsg		= tcp_recvmsg,
2621	.sendmsg		= tcp_sendmsg,
2622	.sendpage		= tcp_sendpage,
2623	.backlog_rcv		= tcp_v4_do_rcv,
2624	.hash			= inet_hash,
2625	.unhash			= inet_unhash,
2626	.get_port		= inet_csk_get_port,
2627	.enter_memory_pressure	= tcp_enter_memory_pressure,
2628	.sockets_allocated	= &tcp_sockets_allocated,
2629	.orphan_count		= &tcp_orphan_count,
2630	.memory_allocated	= &tcp_memory_allocated,
2631	.memory_pressure	= &tcp_memory_pressure,
2632	.sysctl_wmem		= sysctl_tcp_wmem,
2633	.sysctl_rmem		= sysctl_tcp_rmem,
2634	.max_header		= MAX_TCP_HEADER,
2635	.obj_size		= sizeof(struct tcp_sock),
2636	.slab_flags		= SLAB_DESTROY_BY_RCU,
2637	.twsk_prot		= &tcp_timewait_sock_ops,
2638	.rsk_prot		= &tcp_request_sock_ops,
2639	.h.hashinfo		= &tcp_hashinfo,
2640	.no_autobind		= true,
2641#ifdef CONFIG_COMPAT
2642	.compat_setsockopt	= compat_tcp_setsockopt,
2643	.compat_getsockopt	= compat_tcp_getsockopt,
2644#endif
2645#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2646	.init_cgroup		= tcp_init_cgroup,
2647	.destroy_cgroup		= tcp_destroy_cgroup,
2648	.proto_cgroup		= tcp_proto_cgroup,
2649#endif
2650};
2651EXPORT_SYMBOL(tcp_prot);
2652
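/*
 * Per-namespace init/exit: the control socket created here is used to send
 * RSTs and ACKs that are not associated with any local socket.
 */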
2653static int __net_init tcp_sk_init(struct net *net)
2654{
2655	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2656				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2657}
2658
2659static void __net_exit tcp_sk_exit(struct net *net)
2660{
2661	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2662}
2663
2664static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2665{
2666	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2667}
2668
2669static struct pernet_operations __net_initdata tcp_sk_ops = {
2670	.init	   = tcp_sk_init,
2671	.exit	   = tcp_sk_exit,
2672	.exit_batch = tcp_sk_exit_batch,
2673};
2674
2675void __init tcp_v4_init(void)
2676{
2677	inet_hashinfo_init(&tcp_hashinfo);
2678	if (register_pernet_subsys(&tcp_sk_ops))
2679		panic("Failed to create the TCP control socket.\n");
2680}
2681