tcp_ipv4.c revision 0e734419923bd8e599858f8fc196c7804bb85564
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90#ifdef CONFIG_TCP_MD5SIG
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92						   __be32 addr);
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99	return NULL;
100}
101#endif
102
103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
105
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
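/* A note on the helper above: tcp_v4_init_sequence() derives the initial
 * sequence number for a connection answered from an incoming SYN out of the
 * four-tuple in the IP and TCP headers, delegating the mixing to
 * secure_tcp_sequence_number(). The intent is, roughly, RFC 1948 style ISN
 * selection so sequence numbers are hard to predict across connections.
 */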
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, it is safe from the viewpoint
120	   of data integrity. Even without PAWS it is safe provided sequence
121	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123	   Actually, the idea is close to VJ's, only the timestamp cache is
124	   held not per host but per port pair, and the TW bucket is used as
125	   the state holder.
126
127	   If the TW bucket has already been destroyed we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
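/* Reading the condition above: the TIME-WAIT socket may be reused for a new
 * outgoing connection either when the caller passed twp == NULL, or when
 * sysctl_tcp_tw_reuse is set and more than a second has passed since the
 * last timestamp was recorded, so fresh timestamps sort strictly after the
 * recorded one and PAWS still rejects stray segments from the old
 * incarnation. The new write_seq is placed 65535 + 2 beyond tw_snd_nxt so
 * the sequence spaces of the old and new connection do not overlap.
 */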
145
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	__be16 orig_sport, orig_dport;
153	__be32 daddr, nexthop;
154	struct flowi4 *fl4;
155	struct rtable *rt;
156	int err;
157	struct ip_options_rcu *inet_opt;
158
159	if (addr_len < sizeof(struct sockaddr_in))
160		return -EINVAL;
161
162	if (usin->sin_family != AF_INET)
163		return -EAFNOSUPPORT;
164
165	nexthop = daddr = usin->sin_addr.s_addr;
166	inet_opt = rcu_dereference_protected(inet->inet_opt,
167					     sock_owned_by_user(sk));
168	if (inet_opt && inet_opt->opt.srr) {
169		if (!daddr)
170			return -EINVAL;
171		nexthop = inet_opt->opt.faddr;
172	}
173
174	orig_sport = inet->inet_sport;
175	orig_dport = usin->sin_port;
176	fl4 = &inet->cork.fl.u.ip4;
177	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
178			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
179			      IPPROTO_TCP,
180			      orig_sport, orig_dport, sk, true);
181	if (IS_ERR(rt)) {
182		err = PTR_ERR(rt);
183		if (err == -ENETUNREACH)
184			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
185		return err;
186	}
187
188	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189		ip_rt_put(rt);
190		return -ENETUNREACH;
191	}
192
193	if (!inet_opt || !inet_opt->opt.srr)
194		daddr = fl4->daddr;
195
196	if (!inet->inet_saddr)
197		inet->inet_saddr = fl4->saddr;
198	inet->inet_rcv_saddr = inet->inet_saddr;
199
200	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
201		/* Reset inherited state */
202		tp->rx_opt.ts_recent	   = 0;
203		tp->rx_opt.ts_recent_stamp = 0;
204		tp->write_seq		   = 0;
205	}
206
207	if (tcp_death_row.sysctl_tw_recycle &&
208	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
209		struct inet_peer *peer = rt_get_peer(rt);
210		/*
211		 * VJ's idea. We save the last timestamp seen from
212		 * the destination in the peer table when entering state
213		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
214		 * when trying a new connection.
215		 */
216		if (peer) {
217			inet_peer_refcheck(peer);
218			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
219				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
220				tp->rx_opt.ts_recent = peer->tcp_ts;
221			}
222		}
223	}
224
225	inet->inet_dport = usin->sin_port;
226	inet->inet_daddr = daddr;
227
228	inet_csk(sk)->icsk_ext_hdr_len = 0;
229	if (inet_opt)
230		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
231
232	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
233
234	/* Socket identity is still unknown (sport may be zero).
235	 * However, we set the state to SYN-SENT and, without releasing the
236	 * socket lock, select a source port, enter ourselves into the hash
237	 * tables and complete initialization afterwards.
238	 */
239	tcp_set_state(sk, TCP_SYN_SENT);
240	err = inet_hash_connect(&tcp_death_row, sk);
241	if (err)
242		goto failure;
243
244	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
245			       inet->inet_sport, inet->inet_dport, sk);
246	if (IS_ERR(rt)) {
247		err = PTR_ERR(rt);
248		rt = NULL;
249		goto failure;
250	}
251	/* OK, now commit destination to socket.  */
252	sk->sk_gso_type = SKB_GSO_TCPV4;
253	sk_setup_caps(sk, &rt->dst);
254
255	if (!tp->write_seq)
256		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
257							   inet->inet_daddr,
258							   inet->inet_sport,
259							   usin->sin_port);
260
261	inet->inet_id = tp->write_seq ^ jiffies;
262
263	err = tcp_connect(sk);
264	rt = NULL;
265	if (err)
266		goto failure;
267
268	return 0;
269
270failure:
271	/*
272	 * This unhashes the socket and releases the local port,
273	 * if necessary.
274	 */
275	tcp_set_state(sk, TCP_CLOSE);
276	ip_rt_put(rt);
277	sk->sk_route_caps = 0;
278	inet->inet_dport = 0;
279	return err;
280}
281EXPORT_SYMBOL(tcp_v4_connect);
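/* In outline, tcp_v4_connect() above: resolves a route for the destination
 * (honouring a source-routed first hop when IP options request it), moves
 * the socket to SYN-SENT, lets inet_hash_connect() pick a source port and
 * hash the socket, rechecks the route with the final port pair, commits the
 * destination with sk_setup_caps(), picks a secure initial sequence number
 * if none is set, and finally calls tcp_connect() to build and send the SYN.
 * On any failure it falls back to TCP_CLOSE and releases the route.
 */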
282
283/*
284 * This routine does path mtu discovery as defined in RFC1191.
285 */
286static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
287{
288	struct dst_entry *dst;
289	struct inet_sock *inet = inet_sk(sk);
290
291	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
292	 * sent out by Linux are always < 576 bytes so they should go through
293	 * unfragmented).
294	 */
295	if (sk->sk_state == TCP_LISTEN)
296		return;
297
298	/* We don't check in the dst entry if pmtu discovery is forbidden
299	 * on this route. We just assume that no packet-too-big packets
300	 * are sent back when pmtu discovery is not active.
301	 * There is a small race when the user changes this flag in the
302	 * route, but I think that's acceptable.
303	 */
304	if ((dst = __sk_dst_check(sk, 0)) == NULL)
305		return;
306
307	dst->ops->update_pmtu(dst, mtu);
308
309	/* Something is about to go wrong... Remember the soft error
310	 * in case this connection is not able to recover.
311	 */
312	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
313		sk->sk_err_soft = EMSGSIZE;
314
315	mtu = dst_mtu(dst);
316
317	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
318	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
319		tcp_sync_mss(sk, mtu);
320
321		/* Resend the TCP packet because it's
322		 * clear that the old packet has been
323		 * dropped. This is the new "fast" path mtu
324		 * discovery.
325		 */
326		tcp_simple_retransmit(sk);
327	} /* else let the usual retransmit timer handle it */
328}
329
330/*
331 * This routine is called by the ICMP module when it gets some
332 * sort of error condition.  If err < 0 then the socket should
333 * be closed and the error returned to the user.  If err > 0
334 * it's just the icmp type << 8 | icmp code.  After adjustment
335 * header points to the first 8 bytes of the tcp header.  We need
336 * to find the appropriate port.
337 *
338 * The locking strategy used here is very "optimistic". When
339 * someone else accesses the socket, the ICMP is just dropped
340 * and for some paths there is no check at all.
341 * A more general error queue for queuing errors for later handling
342 * is probably better.
343 *
344 */
345
346void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
347{
348	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
349	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
350	struct inet_connection_sock *icsk;
351	struct tcp_sock *tp;
352	struct inet_sock *inet;
353	const int type = icmp_hdr(icmp_skb)->type;
354	const int code = icmp_hdr(icmp_skb)->code;
355	struct sock *sk;
356	struct sk_buff *skb;
357	__u32 seq;
358	__u32 remaining;
359	int err;
360	struct net *net = dev_net(icmp_skb->dev);
361
362	if (icmp_skb->len < (iph->ihl << 2) + 8) {
363		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
364		return;
365	}
366
367	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
368			iph->saddr, th->source, inet_iif(icmp_skb));
369	if (!sk) {
370		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371		return;
372	}
373	if (sk->sk_state == TCP_TIME_WAIT) {
374		inet_twsk_put(inet_twsk(sk));
375		return;
376	}
377
378	bh_lock_sock(sk);
379	/* If too many ICMPs get dropped on busy
380	 * servers this needs to be solved differently.
381	 */
382	if (sock_owned_by_user(sk))
383		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
384
385	if (sk->sk_state == TCP_CLOSE)
386		goto out;
387
388	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
389		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
390		goto out;
391	}
392
393	icsk = inet_csk(sk);
394	tp = tcp_sk(sk);
395	seq = ntohl(th->seq);
396	if (sk->sk_state != TCP_LISTEN &&
397	    !between(seq, tp->snd_una, tp->snd_nxt)) {
398		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
399		goto out;
400	}
401
402	switch (type) {
403	case ICMP_SOURCE_QUENCH:
404		/* Just silently ignore these. */
405		goto out;
406	case ICMP_PARAMETERPROB:
407		err = EPROTO;
408		break;
409	case ICMP_DEST_UNREACH:
410		if (code > NR_ICMP_UNREACH)
411			goto out;
412
413		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
414			if (!sock_owned_by_user(sk))
415				do_pmtu_discovery(sk, iph, info);
416			goto out;
417		}
418
419		err = icmp_err_convert[code].errno;
420		/* check if icmp_skb allows revert of backoff
421		 * (see draft-zimmermann-tcp-lcd) */
422		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423			break;
424		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425		    !icsk->icsk_backoff)
426			break;
427
428		if (sock_owned_by_user(sk))
429			break;
430
431		icsk->icsk_backoff--;
432		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
433					 icsk->icsk_backoff;
434		tcp_bound_rto(sk);
435
436		skb = tcp_write_queue_head(sk);
437		BUG_ON(!skb);
438
439		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
440				tcp_time_stamp - TCP_SKB_CB(skb)->when);
441
442		if (remaining) {
443			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
444						  remaining, TCP_RTO_MAX);
445		} else {
446			/* RTO revert clocked out retransmission.
447			 * Will retransmit now */
448			tcp_retransmit_timer(sk);
449		}
450
451		break;
452	case ICMP_TIME_EXCEEDED:
453		err = EHOSTUNREACH;
454		break;
455	default:
456		goto out;
457	}
458
459	switch (sk->sk_state) {
460		struct request_sock *req, **prev;
461	case TCP_LISTEN:
462		if (sock_owned_by_user(sk))
463			goto out;
464
465		req = inet_csk_search_req(sk, &prev, th->dest,
466					  iph->daddr, iph->saddr);
467		if (!req)
468			goto out;
469
470		/* ICMPs are not backlogged, hence we cannot get
471		   an established socket here.
472		 */
473		WARN_ON(req->sk);
474
475		if (seq != tcp_rsk(req)->snt_isn) {
476			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
477			goto out;
478		}
479
480		/*
481		 * Still in SYN_RECV, just remove it silently.
482		 * There is no good way to pass the error to the newly
483		 * created socket, and POSIX does not want network
484		 * errors returned from accept().
485		 */
486		inet_csk_reqsk_queue_drop(sk, req, prev);
487		goto out;
488
489	case TCP_SYN_SENT:
490	case TCP_SYN_RECV:  /* Cannot happen normally.
491			       It can, e.g., if SYNs crossed.
492			     */
493		if (!sock_owned_by_user(sk)) {
494			sk->sk_err = err;
495
496			sk->sk_error_report(sk);
497
498			tcp_done(sk);
499		} else {
500			sk->sk_err_soft = err;
501		}
502		goto out;
503	}
504
505	/* If we've already connected we will keep trying
506	 * until we time out, or the user gives up.
507	 *
508	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
509	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
510	 * obsoleted by pmtu discovery).
511	 *
512	 * Note that in the modern internet, where routing is unreliable
513	 * and broken firewalls sit in every dark corner sending random
514	 * errors ordered by their masters, even these two messages have
515	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
516	 *
517	 * Now we are in compliance with RFCs.
518	 *							--ANK (980905)
519	 */
520
521	inet = inet_sk(sk);
522	if (!sock_owned_by_user(sk) && inet->recverr) {
523		sk->sk_err = err;
524		sk->sk_error_report(sk);
525	} else	{ /* Only an error on timeout */
526		sk->sk_err_soft = err;
527	}
528
529out:
530	bh_unlock_sock(sk);
531	sock_put(sk);
532}
533
534static void __tcp_v4_send_check(struct sk_buff *skb,
535				__be32 saddr, __be32 daddr)
536{
537	struct tcphdr *th = tcp_hdr(skb);
538
539	if (skb->ip_summed == CHECKSUM_PARTIAL) {
540		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
541		skb->csum_start = skb_transport_header(skb) - skb->head;
542		skb->csum_offset = offsetof(struct tcphdr, check);
543	} else {
544		th->check = tcp_v4_check(skb->len, saddr, daddr,
545					 csum_partial(th,
546						      th->doff << 2,
547						      skb->csum));
548	}
549}
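/* Two cases above: with CHECKSUM_PARTIAL the hardware (or a later software
 * fallback) finishes the checksum, so only the pseudo-header sum is stored
 * in th->check and csum_start/csum_offset tell the device where to write
 * the result; otherwise the full checksum over the header and the already
 * accumulated skb->csum is computed here in software.
 */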
550
551/* This routine computes an IPv4 TCP checksum. */
552void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
553{
554	struct inet_sock *inet = inet_sk(sk);
555
556	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
557}
558EXPORT_SYMBOL(tcp_v4_send_check);
559
560int tcp_v4_gso_send_check(struct sk_buff *skb)
561{
562	const struct iphdr *iph;
563	struct tcphdr *th;
564
565	if (!pskb_may_pull(skb, sizeof(*th)))
566		return -EINVAL;
567
568	iph = ip_hdr(skb);
569	th = tcp_hdr(skb);
570
571	th->check = 0;
572	skb->ip_summed = CHECKSUM_PARTIAL;
573	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
574	return 0;
575}
576
577/*
578 *	This routine will send an RST to the other tcp.
579 *
580 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
581 *		      for the reset?
582 *	Answer: if a packet caused an RST, it is not for a socket
583 *		existing in our system; if it is matched to a socket,
584 *		it is just a duplicate segment or a bug in the other side's TCP.
585 *		So we build the reply based only on the parameters
586 *		that arrived with the segment.
587 *	Exception: precedence violation. We do not implement it in any case.
588 */
589
590static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
591{
592	struct tcphdr *th = tcp_hdr(skb);
593	struct {
594		struct tcphdr th;
595#ifdef CONFIG_TCP_MD5SIG
596		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597#endif
598	} rep;
599	struct ip_reply_arg arg;
600#ifdef CONFIG_TCP_MD5SIG
601	struct tcp_md5sig_key *key;
602#endif
603	struct net *net;
604
605	/* Never send a reset in response to a reset. */
606	if (th->rst)
607		return;
608
609	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
610		return;
611
612	/* Swap the send and the receive. */
613	memset(&rep, 0, sizeof(rep));
614	rep.th.dest   = th->source;
615	rep.th.source = th->dest;
616	rep.th.doff   = sizeof(struct tcphdr) / 4;
617	rep.th.rst    = 1;
618
619	if (th->ack) {
620		rep.th.seq = th->ack_seq;
621	} else {
622		rep.th.ack = 1;
623		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624				       skb->len - (th->doff << 2));
625	}
626
627	memset(&arg, 0, sizeof(arg));
628	arg.iov[0].iov_base = (unsigned char *)&rep;
629	arg.iov[0].iov_len  = sizeof(rep.th);
630
631#ifdef CONFIG_TCP_MD5SIG
632	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
633	if (key) {
634		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
635				   (TCPOPT_NOP << 16) |
636				   (TCPOPT_MD5SIG << 8) |
637				   TCPOLEN_MD5SIG);
638		/* Update length and the length the header thinks exists */
639		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
640		rep.th.doff = arg.iov[0].iov_len / 4;
641
642		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
643				     key, ip_hdr(skb)->saddr,
644				     ip_hdr(skb)->daddr, &rep.th);
645	}
646#endif
647	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
648				      ip_hdr(skb)->saddr, /* XXX */
649				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
650	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
651	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
652
653	net = dev_net(skb_dst(skb)->dev);
654	ip_send_reply(net->ipv4.tcp_sock, skb,
655		      &arg, arg.iov[0].iov_len);
656
657	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
658	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
659}
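/* The sequence numbers of the RST above follow RFC 793 reset generation:
 * if the offending segment carried an ACK, the reset reuses that ack_seq as
 * its own sequence number and carries no ACK; otherwise the reset has the
 * ACK bit set and acknowledges everything the segment occupied (data length
 * plus the SYN and FIN flags), so the other end can match it to its
 * connection.
 */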
660
661/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
662   outside of socket context, is certainly ugly. What can I do?
663 */
664
665static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
666			    u32 win, u32 ts, int oif,
667			    struct tcp_md5sig_key *key,
668			    int reply_flags)
669{
670	struct tcphdr *th = tcp_hdr(skb);
671	struct {
672		struct tcphdr th;
673		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
674#ifdef CONFIG_TCP_MD5SIG
675			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
676#endif
677			];
678	} rep;
679	struct ip_reply_arg arg;
680	struct net *net = dev_net(skb_dst(skb)->dev);
681
682	memset(&rep.th, 0, sizeof(struct tcphdr));
683	memset(&arg, 0, sizeof(arg));
684
685	arg.iov[0].iov_base = (unsigned char *)&rep;
686	arg.iov[0].iov_len  = sizeof(rep.th);
687	if (ts) {
688		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
689				   (TCPOPT_TIMESTAMP << 8) |
690				   TCPOLEN_TIMESTAMP);
691		rep.opt[1] = htonl(tcp_time_stamp);
692		rep.opt[2] = htonl(ts);
693		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
694	}
695
696	/* Swap the send and the receive. */
697	rep.th.dest    = th->source;
698	rep.th.source  = th->dest;
699	rep.th.doff    = arg.iov[0].iov_len / 4;
700	rep.th.seq     = htonl(seq);
701	rep.th.ack_seq = htonl(ack);
702	rep.th.ack     = 1;
703	rep.th.window  = htons(win);
704
705#ifdef CONFIG_TCP_MD5SIG
706	if (key) {
707		int offset = (ts) ? 3 : 0;
708
709		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
710					  (TCPOPT_NOP << 16) |
711					  (TCPOPT_MD5SIG << 8) |
712					  TCPOLEN_MD5SIG);
713		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
714		rep.th.doff = arg.iov[0].iov_len/4;
715
716		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
717				    key, ip_hdr(skb)->saddr,
718				    ip_hdr(skb)->daddr, &rep.th);
719	}
720#endif
721	arg.flags = reply_flags;
722	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
723				      ip_hdr(skb)->saddr, /* XXX */
724				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
725	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
726	if (oif)
727		arg.bound_dev_if = oif;
728
729	ip_send_reply(net->ipv4.tcp_sock, skb,
730		      &arg, arg.iov[0].iov_len);
731
732	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
733}
734
735static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
736{
737	struct inet_timewait_sock *tw = inet_twsk(sk);
738	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
739
740	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
741			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
742			tcptw->tw_ts_recent,
743			tw->tw_bound_dev_if,
744			tcp_twsk_md5_key(tcptw),
745			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
746			);
747
748	inet_twsk_put(tw);
749}
750
751static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
752				  struct request_sock *req)
753{
754	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
755			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
756			req->ts_recent,
757			0,
758			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
759			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
760}
761
762/*
763 *	Send a SYN-ACK after having received a SYN.
764 *	This still operates on a request_sock only, not on a big
765 *	socket.
766 */
767static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768			      struct request_sock *req,
769			      struct request_values *rvp)
770{
771	const struct inet_request_sock *ireq = inet_rsk(req);
772	int err = -1;
773	struct sk_buff * skb;
774
775	/* First, grab a route. */
776	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
777		return -1;
778
779	skb = tcp_make_synack(sk, dst, req, rvp);
780
781	if (skb) {
782		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
783
784		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
785					    ireq->rmt_addr,
786					    ireq->opt);
787		err = net_xmit_eval(err);
788	}
789
790	dst_release(dst);
791	return err;
792}
793
794static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
795			      struct request_values *rvp)
796{
797	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
798	return tcp_v4_send_synack(sk, NULL, req, rvp);
799}
800
801/*
802 *	IPv4 request_sock destructor.
803 */
804static void tcp_v4_reqsk_destructor(struct request_sock *req)
805{
806	kfree(inet_rsk(req)->opt);
807}
808
809static void syn_flood_warning(const struct sk_buff *skb)
810{
811	const char *msg;
812
813#ifdef CONFIG_SYN_COOKIES
814	if (sysctl_tcp_syncookies)
815		msg = "Sending cookies";
816	else
817#endif
818		msg = "Dropping request";
819
820	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
821				ntohs(tcp_hdr(skb)->dest), msg);
822}
823
824/*
825 * Save and compile IPv4 options into the request_sock if needed.
826 */
827static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
828						  struct sk_buff *skb)
829{
830	const struct ip_options *opt = &(IPCB(skb)->opt);
831	struct ip_options_rcu *dopt = NULL;
832
833	if (opt && opt->optlen) {
834		int opt_size = sizeof(*dopt) + opt->optlen;
835
836		dopt = kmalloc(opt_size, GFP_ATOMIC);
837		if (dopt) {
838			if (ip_options_echo(&dopt->opt, skb)) {
839				kfree(dopt);
840				dopt = NULL;
841			}
842		}
843	}
844	return dopt;
845}
846
847#ifdef CONFIG_TCP_MD5SIG
848/*
849 * RFC2385 MD5 checksumming requires a mapping of
850 * IP address->MD5 Key.
851 * We need to maintain these in the sk structure.
852 */
853
854/* Find the Key structure for an address.  */
855static struct tcp_md5sig_key *
856			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
857{
858	struct tcp_sock *tp = tcp_sk(sk);
859	int i;
860
861	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
862		return NULL;
863	for (i = 0; i < tp->md5sig_info->entries4; i++) {
864		if (tp->md5sig_info->keys4[i].addr == addr)
865			return &tp->md5sig_info->keys4[i].base;
866	}
867	return NULL;
868}
869
870struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
871					 struct sock *addr_sk)
872{
873	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
874}
875EXPORT_SYMBOL(tcp_v4_md5_lookup);
876
877static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
878						      struct request_sock *req)
879{
880	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
881}
882
883/* This can be called on a newly created socket, from other files */
884int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
885		      u8 *newkey, u8 newkeylen)
886{
887	/* Add Key to the list */
888	struct tcp_md5sig_key *key;
889	struct tcp_sock *tp = tcp_sk(sk);
890	struct tcp4_md5sig_key *keys;
891
892	key = tcp_v4_md5_do_lookup(sk, addr);
893	if (key) {
894		/* Pre-existing entry - just update that one. */
895		kfree(key->key);
896		key->key = newkey;
897		key->keylen = newkeylen;
898	} else {
899		struct tcp_md5sig_info *md5sig;
900
901		if (!tp->md5sig_info) {
902			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
903						  GFP_ATOMIC);
904			if (!tp->md5sig_info) {
905				kfree(newkey);
906				return -ENOMEM;
907			}
908			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
909		}
910		if (tcp_alloc_md5sig_pool(sk) == NULL) {
911			kfree(newkey);
912			return -ENOMEM;
913		}
914		md5sig = tp->md5sig_info;
915
916		if (md5sig->alloced4 == md5sig->entries4) {
917			keys = kmalloc((sizeof(*keys) *
918					(md5sig->entries4 + 1)), GFP_ATOMIC);
919			if (!keys) {
920				kfree(newkey);
921				tcp_free_md5sig_pool();
922				return -ENOMEM;
923			}
924
925			if (md5sig->entries4)
926				memcpy(keys, md5sig->keys4,
927				       sizeof(*keys) * md5sig->entries4);
928
929			/* Free old key list, and reference new one */
930			kfree(md5sig->keys4);
931			md5sig->keys4 = keys;
932			md5sig->alloced4++;
933		}
934		md5sig->entries4++;
935		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
936		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
937		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
938	}
939	return 0;
940}
941EXPORT_SYMBOL(tcp_v4_md5_do_add);
942
943static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
944			       u8 *newkey, u8 newkeylen)
945{
946	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
947				 newkey, newkeylen);
948}
949
950int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
951{
952	struct tcp_sock *tp = tcp_sk(sk);
953	int i;
954
955	for (i = 0; i < tp->md5sig_info->entries4; i++) {
956		if (tp->md5sig_info->keys4[i].addr == addr) {
957			/* Free the key */
958			kfree(tp->md5sig_info->keys4[i].base.key);
959			tp->md5sig_info->entries4--;
960
961			if (tp->md5sig_info->entries4 == 0) {
962				kfree(tp->md5sig_info->keys4);
963				tp->md5sig_info->keys4 = NULL;
964				tp->md5sig_info->alloced4 = 0;
965			} else if (tp->md5sig_info->entries4 != i) {
966				/* Shift the remaining entries down */
967				memmove(&tp->md5sig_info->keys4[i],
968					&tp->md5sig_info->keys4[i+1],
969					(tp->md5sig_info->entries4 - i) *
970					 sizeof(struct tcp4_md5sig_key));
971			}
972			tcp_free_md5sig_pool();
973			return 0;
974		}
975	}
976	return -ENOENT;
977}
978EXPORT_SYMBOL(tcp_v4_md5_do_del);
979
980static void tcp_v4_clear_md5_list(struct sock *sk)
981{
982	struct tcp_sock *tp = tcp_sk(sk);
983
984	/* Free each key, then the set of keys,
985	 * the crypto element, and then decrement our
986	 * hold on the last resort crypto.
987	 */
988	if (tp->md5sig_info->entries4) {
989		int i;
990		for (i = 0; i < tp->md5sig_info->entries4; i++)
991			kfree(tp->md5sig_info->keys4[i].base.key);
992		tp->md5sig_info->entries4 = 0;
993		tcp_free_md5sig_pool();
994	}
995	if (tp->md5sig_info->keys4) {
996		kfree(tp->md5sig_info->keys4);
997		tp->md5sig_info->keys4 = NULL;
998		tp->md5sig_info->alloced4  = 0;
999	}
1000}
1001
1002static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1003				 int optlen)
1004{
1005	struct tcp_md5sig cmd;
1006	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1007	u8 *newkey;
1008
1009	if (optlen < sizeof(cmd))
1010		return -EINVAL;
1011
1012	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1013		return -EFAULT;
1014
1015	if (sin->sin_family != AF_INET)
1016		return -EINVAL;
1017
1018	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1019		if (!tcp_sk(sk)->md5sig_info)
1020			return -ENOENT;
1021		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1022	}
1023
1024	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1025		return -EINVAL;
1026
1027	if (!tcp_sk(sk)->md5sig_info) {
1028		struct tcp_sock *tp = tcp_sk(sk);
1029		struct tcp_md5sig_info *p;
1030
1031		p = kzalloc(sizeof(*p), sk->sk_allocation);
1032		if (!p)
1033			return -EINVAL;
1034
1035		tp->md5sig_info = p;
1036		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1037	}
1038
1039	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1040	if (!newkey)
1041		return -ENOMEM;
1042	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1043				 newkey, cmd.tcpm_keylen);
1044}
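/* For reference, the option parsed above is set from user space roughly as
 * follows (illustrative sketch only, not part of this file; error handling
 * omitted, "peer_addr" and "secret" are placeholder names):
 *
 *	struct tcp_md5sig md5sig;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5sig.tcpm_addr;
 *
 *	memset(&md5sig, 0, sizeof(md5sig));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;
 *	md5sig.tcpm_keylen = strlen(secret);	// must be <= TCP_MD5SIG_MAXKEYLEN
 *	memcpy(md5sig.tcpm_key, secret, md5sig.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 *
 * Passing a zero key length deletes the key for that address, matching the
 * tcp_v4_md5_do_del() path above.
 */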
1045
1046static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1047					__be32 daddr, __be32 saddr, int nbytes)
1048{
1049	struct tcp4_pseudohdr *bp;
1050	struct scatterlist sg;
1051
1052	bp = &hp->md5_blk.ip4;
1053
1054	/*
1055	 * 1. the TCP pseudo-header (in the order: source IP address,
1056	 * destination IP address, zero-padded protocol number, and
1057	 * segment length)
1058	 */
1059	bp->saddr = saddr;
1060	bp->daddr = daddr;
1061	bp->pad = 0;
1062	bp->protocol = IPPROTO_TCP;
1063	bp->len = cpu_to_be16(nbytes);
1064
1065	sg_init_one(&sg, bp, sizeof(*bp));
1066	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1067}
1068
1069static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1070			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1071{
1072	struct tcp_md5sig_pool *hp;
1073	struct hash_desc *desc;
1074
1075	hp = tcp_get_md5sig_pool();
1076	if (!hp)
1077		goto clear_hash_noput;
1078	desc = &hp->md5_desc;
1079
1080	if (crypto_hash_init(desc))
1081		goto clear_hash;
1082	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1083		goto clear_hash;
1084	if (tcp_md5_hash_header(hp, th))
1085		goto clear_hash;
1086	if (tcp_md5_hash_key(hp, key))
1087		goto clear_hash;
1088	if (crypto_hash_final(desc, md5_hash))
1089		goto clear_hash;
1090
1091	tcp_put_md5sig_pool();
1092	return 0;
1093
1094clear_hash:
1095	tcp_put_md5sig_pool();
1096clear_hash_noput:
1097	memset(md5_hash, 0, 16);
1098	return 1;
1099}
1100
1101int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1102			struct sock *sk, struct request_sock *req,
1103			struct sk_buff *skb)
1104{
1105	struct tcp_md5sig_pool *hp;
1106	struct hash_desc *desc;
1107	struct tcphdr *th = tcp_hdr(skb);
1108	__be32 saddr, daddr;
1109
1110	if (sk) {
1111		saddr = inet_sk(sk)->inet_saddr;
1112		daddr = inet_sk(sk)->inet_daddr;
1113	} else if (req) {
1114		saddr = inet_rsk(req)->loc_addr;
1115		daddr = inet_rsk(req)->rmt_addr;
1116	} else {
1117		const struct iphdr *iph = ip_hdr(skb);
1118		saddr = iph->saddr;
1119		daddr = iph->daddr;
1120	}
1121
1122	hp = tcp_get_md5sig_pool();
1123	if (!hp)
1124		goto clear_hash_noput;
1125	desc = &hp->md5_desc;
1126
1127	if (crypto_hash_init(desc))
1128		goto clear_hash;
1129
1130	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1131		goto clear_hash;
1132	if (tcp_md5_hash_header(hp, th))
1133		goto clear_hash;
1134	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1135		goto clear_hash;
1136	if (tcp_md5_hash_key(hp, key))
1137		goto clear_hash;
1138	if (crypto_hash_final(desc, md5_hash))
1139		goto clear_hash;
1140
1141	tcp_put_md5sig_pool();
1142	return 0;
1143
1144clear_hash:
1145	tcp_put_md5sig_pool();
1146clear_hash_noput:
1147	memset(md5_hash, 0, 16);
1148	return 1;
1149}
1150EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
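/* As the two hashing helpers above show, the RFC 2385 digest is computed
 * over the IPv4 pseudo-header (source, destination, protocol, segment
 * length), the TCP header with the checksum field treated as zero, the
 * payload when hashing a full skb, and finally the key itself.
 */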
1151
1152static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1153{
1154	/*
1155	 * This gets called for each TCP segment that arrives
1156	 * so we want to be efficient.
1157	 * We have 3 drop cases:
1158	 * o No MD5 hash and one expected.
1159	 * o MD5 hash and we're not expecting one.
1160	 * o MD5 hash and it's wrong.
1161	 */
1162	__u8 *hash_location = NULL;
1163	struct tcp_md5sig_key *hash_expected;
1164	const struct iphdr *iph = ip_hdr(skb);
1165	struct tcphdr *th = tcp_hdr(skb);
1166	int genhash;
1167	unsigned char newhash[16];
1168
1169	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1170	hash_location = tcp_parse_md5sig_option(th);
1171
1172	/* We've parsed the options - do we have a hash? */
1173	if (!hash_expected && !hash_location)
1174		return 0;
1175
1176	if (hash_expected && !hash_location) {
1177		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1178		return 1;
1179	}
1180
1181	if (!hash_expected && hash_location) {
1182		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1183		return 1;
1184	}
1185
1186	/* Okay, so this is hash_expected and hash_location -
1187	 * so we need to calculate the checksum.
1188	 */
1189	genhash = tcp_v4_md5_hash_skb(newhash,
1190				      hash_expected,
1191				      NULL, NULL, skb);
1192
1193	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1194		if (net_ratelimit()) {
1195			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1196			       &iph->saddr, ntohs(th->source),
1197			       &iph->daddr, ntohs(th->dest),
1198			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1199		}
1200		return 1;
1201	}
1202	return 0;
1203}
1204
1205#endif
1206
1207struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1208	.family		=	PF_INET,
1209	.obj_size	=	sizeof(struct tcp_request_sock),
1210	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1211	.send_ack	=	tcp_v4_reqsk_send_ack,
1212	.destructor	=	tcp_v4_reqsk_destructor,
1213	.send_reset	=	tcp_v4_send_reset,
1214	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1215};
1216
1217#ifdef CONFIG_TCP_MD5SIG
1218static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1219	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1220	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1221};
1222#endif
1223
1224int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1225{
1226	struct tcp_extend_values tmp_ext;
1227	struct tcp_options_received tmp_opt;
1228	u8 *hash_location;
1229	struct request_sock *req;
1230	struct inet_request_sock *ireq;
1231	struct tcp_sock *tp = tcp_sk(sk);
1232	struct dst_entry *dst = NULL;
1233	__be32 saddr = ip_hdr(skb)->saddr;
1234	__be32 daddr = ip_hdr(skb)->daddr;
1235	__u32 isn = TCP_SKB_CB(skb)->when;
1236#ifdef CONFIG_SYN_COOKIES
1237	int want_cookie = 0;
1238#else
1239#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1240#endif
1241
1242	/* Never answer SYNs sent to broadcast or multicast */
1243	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1244		goto drop;
1245
1246	/* TW buckets are converted to open requests without
1247	 * limitations; they conserve resources and the peer is
1248	 * evidently a real one.
1249	 */
1250	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1251		if (net_ratelimit())
1252			syn_flood_warning(skb);
1253#ifdef CONFIG_SYN_COOKIES
1254		if (sysctl_tcp_syncookies) {
1255			want_cookie = 1;
1256		} else
1257#endif
1258		goto drop;
1259	}
1260
1261	/* The accept backlog is full. If we have already queued enough
1262	 * warm entries in the syn queue, drop the request. It is better than
1263	 * clogging the syn queue with openreqs with exponentially increasing
1264	 * timeouts.
1265	 */
1266	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1267		goto drop;
1268
1269	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1270	if (!req)
1271		goto drop;
1272
1273#ifdef CONFIG_TCP_MD5SIG
1274	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1275#endif
1276
1277	tcp_clear_options(&tmp_opt);
1278	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1279	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1280	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1281
1282	if (tmp_opt.cookie_plus > 0 &&
1283	    tmp_opt.saw_tstamp &&
1284	    !tp->rx_opt.cookie_out_never &&
1285	    (sysctl_tcp_cookie_size > 0 ||
1286	     (tp->cookie_values != NULL &&
1287	      tp->cookie_values->cookie_desired > 0))) {
1288		u8 *c;
1289		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1290		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1291
1292		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1293			goto drop_and_release;
1294
1295		/* Secret recipe starts with IP addresses */
1296		*mess++ ^= (__force u32)daddr;
1297		*mess++ ^= (__force u32)saddr;
1298
1299		/* plus variable length Initiator Cookie */
1300		c = (u8 *)mess;
1301		while (l-- > 0)
1302			*c++ ^= *hash_location++;
1303
1304#ifdef CONFIG_SYN_COOKIES
1305		want_cookie = 0;	/* not our kind of cookie */
1306#endif
1307		tmp_ext.cookie_out_never = 0; /* false */
1308		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1309	} else if (!tp->rx_opt.cookie_in_always) {
1310		/* redundant indications, but ensure initialization. */
1311		tmp_ext.cookie_out_never = 1; /* true */
1312		tmp_ext.cookie_plus = 0;
1313	} else {
1314		goto drop_and_release;
1315	}
1316	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1317
1318	if (want_cookie && !tmp_opt.saw_tstamp)
1319		tcp_clear_options(&tmp_opt);
1320
1321	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1322	tcp_openreq_init(req, &tmp_opt, skb);
1323
1324	ireq = inet_rsk(req);
1325	ireq->loc_addr = daddr;
1326	ireq->rmt_addr = saddr;
1327	ireq->no_srccheck = inet_sk(sk)->transparent;
1328	ireq->opt = tcp_v4_save_options(sk, skb);
1329
1330	if (security_inet_conn_request(sk, skb, req))
1331		goto drop_and_free;
1332
1333	if (!want_cookie || tmp_opt.tstamp_ok)
1334		TCP_ECN_create_request(req, tcp_hdr(skb));
1335
1336	if (want_cookie) {
1337		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1338		req->cookie_ts = tmp_opt.tstamp_ok;
1339	} else if (!isn) {
1340		struct inet_peer *peer = NULL;
1341
1342		/* VJ's idea. We save the last timestamp seen
1343		 * from the destination in the peer table when entering
1344		 * state TIME-WAIT, and check against it before
1345		 * accepting a new connection request.
1346		 *
1347		 * If "isn" is not zero, this request hit an alive
1348		 * timewait bucket, so all the necessary checks
1349		 * are made in the function processing the timewait state.
1350		 */
1351		if (tmp_opt.saw_tstamp &&
1352		    tcp_death_row.sysctl_tw_recycle &&
1353		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1354		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1355		    peer->daddr.addr.a4 == saddr) {
1356			inet_peer_refcheck(peer);
1357			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1358			    (s32)(peer->tcp_ts - req->ts_recent) >
1359							TCP_PAWS_WINDOW) {
1360				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1361				goto drop_and_release;
1362			}
1363		}
1364		/* Kill the following clause, if you dislike this way. */
1365		else if (!sysctl_tcp_syncookies &&
1366			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1367			  (sysctl_max_syn_backlog >> 2)) &&
1368			 (!peer || !peer->tcp_ts_stamp) &&
1369			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1370			/* Without syncookies, the last quarter of the
1371			 * backlog is filled only with destinations
1372			 * proven to be alive.
1373			 * It means that we continue to communicate with
1374			 * destinations already remembered by the moment
1375			 * of the synflood.
1376			 */
1377			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1378				       &saddr, ntohs(tcp_hdr(skb)->source));
1379			goto drop_and_release;
1380		}
1381
1382		isn = tcp_v4_init_sequence(skb);
1383	}
1384	tcp_rsk(req)->snt_isn = isn;
1385
1386	if (tcp_v4_send_synack(sk, dst, req,
1387			       (struct request_values *)&tmp_ext) ||
1388	    want_cookie)
1389		goto drop_and_free;
1390
1391	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1392	return 0;
1393
1394drop_and_release:
1395	dst_release(dst);
1396drop_and_free:
1397	reqsk_free(req);
1398drop:
1399	return 0;
1400}
1401EXPORT_SYMBOL(tcp_v4_conn_request);
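/* Summary of the SYN handling above: broadcast/multicast SYNs are ignored,
 * a full request queue either triggers syncookies or drops the SYN, the
 * options (including any RFC 2385 MD5 state and TCP cookie transaction
 * data) are captured into the request_sock, the peer's cached timestamp is
 * consulted for tw_recycle-style PAWS rejection, an initial sequence number
 * is chosen (from the cookie generator or tcp_v4_init_sequence()), and a
 * SYN-ACK is sent; only non-cookie requests are added to the listener's
 * SYN queue.
 */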
1402
1403
1404/*
1405 * The three way handshake has completed - we got a valid synack -
1406 * now create the new socket.
1407 */
1408struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409				  struct request_sock *req,
1410				  struct dst_entry *dst)
1411{
1412	struct inet_request_sock *ireq;
1413	struct inet_sock *newinet;
1414	struct tcp_sock *newtp;
1415	struct sock *newsk;
1416#ifdef CONFIG_TCP_MD5SIG
1417	struct tcp_md5sig_key *key;
1418#endif
1419	struct ip_options_rcu *inet_opt;
1420
1421	if (sk_acceptq_is_full(sk))
1422		goto exit_overflow;
1423
1424	newsk = tcp_create_openreq_child(sk, req, skb);
1425	if (!newsk)
1426		goto exit_nonewsk;
1427
1428	newsk->sk_gso_type = SKB_GSO_TCPV4;
1429
1430	newtp		      = tcp_sk(newsk);
1431	newinet		      = inet_sk(newsk);
1432	ireq		      = inet_rsk(req);
1433	newinet->inet_daddr   = ireq->rmt_addr;
1434	newinet->inet_rcv_saddr = ireq->loc_addr;
1435	newinet->inet_saddr	      = ireq->loc_addr;
1436	inet_opt	      = ireq->opt;
1437	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1438	ireq->opt	      = NULL;
1439	newinet->mc_index     = inet_iif(skb);
1440	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1441	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1442	if (inet_opt)
1443		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1444	newinet->inet_id = newtp->write_seq ^ jiffies;
1445
1446	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1447		goto put_and_exit;
1448
1449	sk_setup_caps(newsk, dst);
1450
1451	tcp_mtup_init(newsk);
1452	tcp_sync_mss(newsk, dst_mtu(dst));
1453	newtp->advmss = dst_metric_advmss(dst);
1454	if (tcp_sk(sk)->rx_opt.user_mss &&
1455	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1456		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1457
1458	tcp_initialize_rcv_mss(newsk);
1459
1460#ifdef CONFIG_TCP_MD5SIG
1461	/* Copy over the MD5 key from the original socket */
1462	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1463	if (key != NULL) {
1464		/*
1465		 * We're using one, so create a matching key
1466		 * on the newsk structure. If we fail to get
1467		 * memory, then we end up not copying the key
1468		 * across. Shucks.
1469		 */
1470		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1471		if (newkey != NULL)
1472			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1473					  newkey, key->keylen);
1474		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1475	}
1476#endif
1477
1478	if (__inet_inherit_port(sk, newsk) < 0)
1479		goto put_and_exit;
1480	__inet_hash_nolisten(newsk, NULL);
1481
1482	return newsk;
1483
1484exit_overflow:
1485	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486exit_nonewsk:
1487	dst_release(dst);
1488exit:
1489	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1490	return NULL;
1491put_and_exit:
1492	sock_put(newsk);
1493	goto exit;
1494}
1495EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
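/* tcp_v4_syn_recv_sock() above turns a completed handshake into a real
 * socket: tcp_create_openreq_child() clones the listener, the request's
 * addresses and saved IP options are copied into the new inet_sock, a route
 * is attached and MSS/MTU state initialized from it, any MD5 key configured
 * for the peer is duplicated onto the child, and the child inherits the
 * listener's port before being hashed into the established table.
 */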
1496
1497static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1498{
1499	struct tcphdr *th = tcp_hdr(skb);
1500	const struct iphdr *iph = ip_hdr(skb);
1501	struct sock *nsk;
1502	struct request_sock **prev;
1503	/* Find possible connection requests. */
1504	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1505						       iph->saddr, iph->daddr);
1506	if (req)
1507		return tcp_check_req(sk, skb, req, prev);
1508
1509	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1510			th->source, iph->daddr, th->dest, inet_iif(skb));
1511
1512	if (nsk) {
1513		if (nsk->sk_state != TCP_TIME_WAIT) {
1514			bh_lock_sock(nsk);
1515			return nsk;
1516		}
1517		inet_twsk_put(inet_twsk(nsk));
1518		return NULL;
1519	}
1520
1521#ifdef CONFIG_SYN_COOKIES
1522	if (!th->syn)
1523		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1524#endif
1525	return sk;
1526}
1527
1528static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1529{
1530	const struct iphdr *iph = ip_hdr(skb);
1531
1532	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1533		if (!tcp_v4_check(skb->len, iph->saddr,
1534				  iph->daddr, skb->csum)) {
1535			skb->ip_summed = CHECKSUM_UNNECESSARY;
1536			return 0;
1537		}
1538	}
1539
1540	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1541				       skb->len, IPPROTO_TCP, 0);
1542
1543	if (skb->len <= 76) {
1544		return __skb_checksum_complete(skb);
1545	}
1546	return 0;
1547}
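/* Validation strategy above: if the device already verified the checksum
 * (CHECKSUM_COMPLETE matching the pseudo-header), mark it unnecessary and
 * accept; otherwise seed skb->csum with the pseudo-header sum and verify
 * short packets (<= 76 bytes) immediately, leaving longer ones to be
 * checked later, e.g. while the data is copied to user space.
 */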
1548
1549
1550/* The socket must have its spinlock held when we get
1551 * here.
1552 *
1553 * We have a potential double-lock case here, so even when
1554 * doing backlog processing we use the BH locking scheme.
1555 * This is because we cannot sleep with the original spinlock
1556 * held.
1557 */
1558int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1559{
1560	struct sock *rsk;
1561#ifdef CONFIG_TCP_MD5SIG
1562	/*
1563	 * We really want to reject the packet as early as possible
1564	 * if:
1565	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1566	 *  o There is an MD5 option and we're not expecting one
1567	 */
1568	if (tcp_v4_inbound_md5_hash(sk, skb))
1569		goto discard;
1570#endif
1571
1572	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1573		sock_rps_save_rxhash(sk, skb->rxhash);
1574		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1575			rsk = sk;
1576			goto reset;
1577		}
1578		return 0;
1579	}
1580
1581	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1582		goto csum_err;
1583
1584	if (sk->sk_state == TCP_LISTEN) {
1585		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1586		if (!nsk)
1587			goto discard;
1588
1589		if (nsk != sk) {
1590			if (tcp_child_process(sk, nsk, skb)) {
1591				rsk = nsk;
1592				goto reset;
1593			}
1594			return 0;
1595		}
1596	} else
1597		sock_rps_save_rxhash(sk, skb->rxhash);
1598
1599	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1600		rsk = sk;
1601		goto reset;
1602	}
1603	return 0;
1604
1605reset:
1606	tcp_v4_send_reset(rsk, skb);
1607discard:
1608	kfree_skb(skb);
1609	/* Be careful here. If this function gets more complicated and
1610	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1611	 * might be destroyed here. This current version compiles correctly,
1612	 * but you have been warned.
1613	 */
1614	return 0;
1615
1616csum_err:
1617	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1618	goto discard;
1619}
1620EXPORT_SYMBOL(tcp_v4_do_rcv);
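/* Dispatch in tcp_v4_do_rcv() above: established sockets take the fast path
 * through tcp_rcv_established(); listeners first try to turn the segment
 * into a child socket via tcp_v4_hnd_req() and feed it to
 * tcp_child_process(); everything else goes through the generic
 * tcp_rcv_state_process(). A failure in any of these sends a reset, and an
 * MD5 mismatch or checksum error discards the segment early.
 */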
1621
1622/*
1623 *	From tcp_input.c
1624 */
1625
1626int tcp_v4_rcv(struct sk_buff *skb)
1627{
1628	const struct iphdr *iph;
1629	struct tcphdr *th;
1630	struct sock *sk;
1631	int ret;
1632	struct net *net = dev_net(skb->dev);
1633
1634	if (skb->pkt_type != PACKET_HOST)
1635		goto discard_it;
1636
1637	/* Count it even if it's bad */
1638	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1639
1640	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1641		goto discard_it;
1642
1643	th = tcp_hdr(skb);
1644
1645	if (th->doff < sizeof(struct tcphdr) / 4)
1646		goto bad_packet;
1647	if (!pskb_may_pull(skb, th->doff * 4))
1648		goto discard_it;
1649
1650	/* An explanation is required here, I think.
1651	 * Packet length and doff are validated by header prediction,
1652	 * provided the case of th->doff == 0 is eliminated.
1653	 * So, we defer the checks. */
1654	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1655		goto bad_packet;
1656
1657	th = tcp_hdr(skb);
1658	iph = ip_hdr(skb);
1659	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1660	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1661				    skb->len - th->doff * 4);
1662	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1663	TCP_SKB_CB(skb)->when	 = 0;
1664	TCP_SKB_CB(skb)->flags	 = iph->tos;
1665	TCP_SKB_CB(skb)->sacked	 = 0;
1666
1667	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1668	if (!sk)
1669		goto no_tcp_socket;
1670
1671process:
1672	if (sk->sk_state == TCP_TIME_WAIT)
1673		goto do_time_wait;
1674
1675	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1676		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1677		goto discard_and_relse;
1678	}
1679
1680	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1681		goto discard_and_relse;
1682	nf_reset(skb);
1683
1684	if (sk_filter(sk, skb))
1685		goto discard_and_relse;
1686
1687	skb->dev = NULL;
1688
1689	bh_lock_sock_nested(sk);
1690	ret = 0;
1691	if (!sock_owned_by_user(sk)) {
1692#ifdef CONFIG_NET_DMA
1693		struct tcp_sock *tp = tcp_sk(sk);
1694		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1695			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1696		if (tp->ucopy.dma_chan)
1697			ret = tcp_v4_do_rcv(sk, skb);
1698		else
1699#endif
1700		{
1701			if (!tcp_prequeue(sk, skb))
1702				ret = tcp_v4_do_rcv(sk, skb);
1703		}
1704	} else if (unlikely(sk_add_backlog(sk, skb))) {
1705		bh_unlock_sock(sk);
1706		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1707		goto discard_and_relse;
1708	}
1709	bh_unlock_sock(sk);
1710
1711	sock_put(sk);
1712
1713	return ret;
1714
1715no_tcp_socket:
1716	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1717		goto discard_it;
1718
1719	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1720bad_packet:
1721		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1722	} else {
1723		tcp_v4_send_reset(NULL, skb);
1724	}
1725
1726discard_it:
1727	/* Discard frame. */
1728	kfree_skb(skb);
1729	return 0;
1730
1731discard_and_relse:
1732	sock_put(sk);
1733	goto discard_it;
1734
1735do_time_wait:
1736	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1737		inet_twsk_put(inet_twsk(sk));
1738		goto discard_it;
1739	}
1740
1741	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1742		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1743		inet_twsk_put(inet_twsk(sk));
1744		goto discard_it;
1745	}
1746	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1747	case TCP_TW_SYN: {
1748		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1749							&tcp_hashinfo,
1750							iph->daddr, th->dest,
1751							inet_iif(skb));
1752		if (sk2) {
1753			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1754			inet_twsk_put(inet_twsk(sk));
1755			sk = sk2;
1756			goto process;
1757		}
1758		/* Fall through to ACK */
1759	}
1760	case TCP_TW_ACK:
1761		tcp_v4_timewait_ack(sk, skb);
1762		break;
1763	case TCP_TW_RST:
1764		goto no_tcp_socket;
1765	case TCP_TW_SUCCESS:;
1766	}
1767	goto discard_it;
1768}
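/* tcp_v4_rcv() above is the protocol entry point: it sanity-checks and
 * (when needed) verifies the header and checksum, fills the TCP control
 * block, looks the socket up in the established/listening hashes, and then
 * either processes the segment directly, queues it on the prequeue, or
 * appends it to the backlog when the socket is owned by user context.
 * TIME-WAIT sockets are handed to tcp_timewait_state_process(), which may
 * accept a new SYN, demand an ACK, send a reset, or do nothing at all.
 */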
1769
1770struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1771{
1772	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1773	struct inet_sock *inet = inet_sk(sk);
1774	struct inet_peer *peer;
1775
1776	if (!rt || rt->rt_dst != inet->inet_daddr) {
1777		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1778		*release_it = true;
1779	} else {
1780		if (!rt->peer)
1781			rt_bind_peer(rt, 1);
1782		peer = rt->peer;
1783		*release_it = false;
1784	}
1785
1786	return peer;
1787}
1788EXPORT_SYMBOL(tcp_v4_get_peer);
1789
1790void *tcp_v4_tw_get_peer(struct sock *sk)
1791{
1792	struct inet_timewait_sock *tw = inet_twsk(sk);
1793
1794	return inet_getpeer_v4(tw->tw_daddr, 1);
1795}
1796EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1797
1798static struct timewait_sock_ops tcp_timewait_sock_ops = {
1799	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1800	.twsk_unique	= tcp_twsk_unique,
1801	.twsk_destructor= tcp_twsk_destructor,
1802	.twsk_getpeer	= tcp_v4_tw_get_peer,
1803};
1804
1805const struct inet_connection_sock_af_ops ipv4_specific = {
1806	.queue_xmit	   = ip_queue_xmit,
1807	.send_check	   = tcp_v4_send_check,
1808	.rebuild_header	   = inet_sk_rebuild_header,
1809	.conn_request	   = tcp_v4_conn_request,
1810	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1811	.get_peer	   = tcp_v4_get_peer,
1812	.net_header_len	   = sizeof(struct iphdr),
1813	.setsockopt	   = ip_setsockopt,
1814	.getsockopt	   = ip_getsockopt,
1815	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1816	.sockaddr_len	   = sizeof(struct sockaddr_in),
1817	.bind_conflict	   = inet_csk_bind_conflict,
1818#ifdef CONFIG_COMPAT
1819	.compat_setsockopt = compat_ip_setsockopt,
1820	.compat_getsockopt = compat_ip_getsockopt,
1821#endif
1822};
1823EXPORT_SYMBOL(ipv4_specific);
1824
1825#ifdef CONFIG_TCP_MD5SIG
1826static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1827	.md5_lookup		= tcp_v4_md5_lookup,
1828	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1829	.md5_add		= tcp_v4_md5_add_func,
1830	.md5_parse		= tcp_v4_parse_md5_keys,
1831};
1832#endif
1833
1834/* NOTE: A lot of things are set to zero explicitly by the call to
1835 *       sk_alloc(), so they need not be done here.
1836 */
1837static int tcp_v4_init_sock(struct sock *sk)
1838{
1839	struct inet_connection_sock *icsk = inet_csk(sk);
1840	struct tcp_sock *tp = tcp_sk(sk);
1841
1842	skb_queue_head_init(&tp->out_of_order_queue);
1843	tcp_init_xmit_timers(sk);
1844	tcp_prequeue_init(tp);
1845
1846	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1847	tp->mdev = TCP_TIMEOUT_INIT;
1848
1849	/* So many TCP implementations out there (incorrectly) count the
1850	 * initial SYN frame in their delayed-ACK and congestion control
1851	 * algorithms that we must have the following bandaid to talk
1852	 * efficiently to them.  -DaveM
1853	 */
1854	tp->snd_cwnd = 2;
1855
1856	/* See draft-stevens-tcpca-spec-01 for discussion of the
1857	 * initialization of these values.
1858	 */
1859	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1860	tp->snd_cwnd_clamp = ~0;
1861	tp->mss_cache = TCP_MSS_DEFAULT;
1862
1863	tp->reordering = sysctl_tcp_reordering;
1864	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1865
1866	sk->sk_state = TCP_CLOSE;
1867
1868	sk->sk_write_space = sk_stream_write_space;
1869	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1870
1871	icsk->icsk_af_ops = &ipv4_specific;
1872	icsk->icsk_sync_mss = tcp_sync_mss;
1873#ifdef CONFIG_TCP_MD5SIG
1874	tp->af_specific = &tcp_sock_ipv4_specific;
1875#endif
1876
1877	/* TCP Cookie Transactions */
1878	if (sysctl_tcp_cookie_size > 0) {
1879		/* Default, cookies without s_data_payload. */
1880		tp->cookie_values =
1881			kzalloc(sizeof(*tp->cookie_values),
1882				sk->sk_allocation);
1883		if (tp->cookie_values != NULL)
1884			kref_init(&tp->cookie_values->kref);
1885	}
1886	/* Presumed zeroed, in order of appearance:
1887	 *	cookie_in_always, cookie_out_never,
1888	 *	s_data_constant, s_data_in, s_data_out
1889	 */
1890	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1891	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1892
1893	local_bh_disable();
1894	percpu_counter_inc(&tcp_sockets_allocated);
1895	local_bh_enable();
1896
1897	return 0;
1898}
1899
1900void tcp_v4_destroy_sock(struct sock *sk)
1901{
1902	struct tcp_sock *tp = tcp_sk(sk);
1903
1904	tcp_clear_xmit_timers(sk);
1905
1906	tcp_cleanup_congestion_control(sk);
1907
1908	/* Clean up the write buffer. */
1909	tcp_write_queue_purge(sk);
1910
1911	/* Cleans up our, hopefully empty, out_of_order_queue. */
1912	__skb_queue_purge(&tp->out_of_order_queue);
1913
1914#ifdef CONFIG_TCP_MD5SIG
1915	/* Clean up the MD5 key list, if any */
1916	if (tp->md5sig_info) {
1917		tcp_v4_clear_md5_list(sk);
1918		kfree(tp->md5sig_info);
1919		tp->md5sig_info = NULL;
1920	}
1921#endif
1922
1923#ifdef CONFIG_NET_DMA
1924	/* Cleans up our sk_async_wait_queue */
1925	__skb_queue_purge(&sk->sk_async_wait_queue);
1926#endif
1927
1928	/* Clean up the prequeue; it should already be empty. */
1929	__skb_queue_purge(&tp->ucopy.prequeue);
1930
1931	/* Clean up a referenced TCP bind bucket. */
1932	if (inet_csk(sk)->icsk_bind_hash)
1933		inet_put_port(sk);
1934
1935	/*
1936	 * If a cached sendmsg page exists, free it.
1937	 */
1938	if (sk->sk_sndmsg_page) {
1939		__free_page(sk->sk_sndmsg_page);
1940		sk->sk_sndmsg_page = NULL;
1941	}
1942
1943	/* TCP Cookie Transactions */
1944	if (tp->cookie_values != NULL) {
1945		kref_put(&tp->cookie_values->kref,
1946			 tcp_cookie_values_release);
1947		tp->cookie_values = NULL;
1948	}
1949
1950	percpu_counter_dec(&tcp_sockets_allocated);
1951}
1952EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953
1954#ifdef CONFIG_PROC_FS
1955/* Proc filesystem TCP sock list dumping. */
1956
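/* Helpers for walking the nulls-terminated TIME_WAIT chain (twchain)
 * of an ehash bucket.
 */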
1957static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1958{
1959	return hlist_nulls_empty(head) ? NULL :
1960		hlist_nulls_entry(head->first, struct inet_timewait_sock, tw_node);
1961}
1962
1963static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1964{
1965	return !is_a_nulls(tw->tw_node.next) ?
1966		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1967}
1968
1969/*
1970 * Get the next listening socket following cur.  If cur is NULL, get the
1971 * first socket starting from the bucket given in st->bucket; when
1972 * st->bucket is zero, the very first socket in the hash table is returned.
1973 */
1974static void *listening_get_next(struct seq_file *seq, void *cur)
1975{
1976	struct inet_connection_sock *icsk;
1977	struct hlist_nulls_node *node;
1978	struct sock *sk = cur;
1979	struct inet_listen_hashbucket *ilb;
1980	struct tcp_iter_state *st = seq->private;
1981	struct net *net = seq_file_net(seq);
1982
1983	if (!sk) {
1984		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1985		spin_lock_bh(&ilb->lock);
1986		sk = sk_nulls_head(&ilb->head);
1987		st->offset = 0;
1988		goto get_sk;
1989	}
1990	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1991	++st->num;
1992	++st->offset;
1993
1994	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1995		struct request_sock *req = cur;
1996
1997		icsk = inet_csk(st->syn_wait_sk);
1998		req = req->dl_next;
1999		while (1) {
2000			while (req) {
2001				if (req->rsk_ops->family == st->family) {
2002					cur = req;
2003					goto out;
2004				}
2005				req = req->dl_next;
2006			}
2007			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2008				break;
2009get_req:
2010			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2011		}
2012		sk	  = sk_nulls_next(st->syn_wait_sk);
2013		st->state = TCP_SEQ_STATE_LISTENING;
2014		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2015	} else {
2016		icsk = inet_csk(sk);
2017		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2018		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2019			goto start_req;
2020		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021		sk = sk_nulls_next(sk);
2022	}
2023get_sk:
2024	sk_nulls_for_each_from(sk, node) {
2025		if (!net_eq(sock_net(sk), net))
2026			continue;
2027		if (sk->sk_family == st->family) {
2028			cur = sk;
2029			goto out;
2030		}
2031		icsk = inet_csk(sk);
2032		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2034start_req:
2035			st->uid		= sock_i_uid(sk);
2036			st->syn_wait_sk = sk;
2037			st->state	= TCP_SEQ_STATE_OPENREQ;
2038			st->sbucket	= 0;
2039			goto get_req;
2040		}
2041		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2042	}
2043	spin_unlock_bh(&ilb->lock);
2044	st->offset = 0;
2045	if (++st->bucket < INET_LHTABLE_SIZE) {
2046		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2047		spin_lock_bh(&ilb->lock);
2048		sk = sk_nulls_head(&ilb->head);
2049		goto get_sk;
2050	}
2051	cur = NULL;
2052out:
2053	return cur;
2054}
2055
2056static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2057{
2058	struct tcp_iter_state *st = seq->private;
2059	void *rc;
2060
2061	st->bucket = 0;
2062	st->offset = 0;
2063	rc = listening_get_next(seq, NULL);
2064
2065	while (rc && *pos) {
2066		rc = listening_get_next(seq, rc);
2067		--*pos;
2068	}
2069	return rc;
2070}
2071
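/* True when both the established chain and the TIME_WAIT chain of the
 * ehash bucket selected by st->bucket are empty.
 */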
2072static inline int empty_bucket(struct tcp_iter_state *st)
2073{
2074	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2075		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2076}
2077
2078/*
2079 * Get the first established socket, starting from the bucket given in st->bucket.
2080 * If st->bucket is zero, the very first socket in the hash is returned.
2081 */
2082static void *established_get_first(struct seq_file *seq)
2083{
2084	struct tcp_iter_state *st = seq->private;
2085	struct net *net = seq_file_net(seq);
2086	void *rc = NULL;
2087
2088	st->offset = 0;
2089	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2090		struct sock *sk;
2091		struct hlist_nulls_node *node;
2092		struct inet_timewait_sock *tw;
2093		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2094
2095		/* Lockless fast path for the common case of empty buckets */
2096		if (empty_bucket(st))
2097			continue;
2098
2099		spin_lock_bh(lock);
2100		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2101			if (sk->sk_family != st->family ||
2102			    !net_eq(sock_net(sk), net)) {
2103				continue;
2104			}
2105			rc = sk;
2106			goto out;
2107		}
2108		st->state = TCP_SEQ_STATE_TIME_WAIT;
2109		inet_twsk_for_each(tw, node,
2110				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2111			if (tw->tw_family != st->family ||
2112			    !net_eq(twsk_net(tw), net)) {
2113				continue;
2114			}
2115			rc = tw;
2116			goto out;
2117		}
2118		spin_unlock_bh(lock);
2119		st->state = TCP_SEQ_STATE_ESTABLISHED;
2120	}
2121out:
2122	return rc;
2123}
2124
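/* Advance from cur to the next matching established or TIME_WAIT socket,
 * moving on to the next non-empty bucket once the current one is exhausted.
 */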
2125static void *established_get_next(struct seq_file *seq, void *cur)
2126{
2127	struct sock *sk = cur;
2128	struct inet_timewait_sock *tw;
2129	struct hlist_nulls_node *node;
2130	struct tcp_iter_state *st = seq->private;
2131	struct net *net = seq_file_net(seq);
2132
2133	++st->num;
2134	++st->offset;
2135
2136	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2137		tw = cur;
2138		tw = tw_next(tw);
2139get_tw:
2140		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2141			tw = tw_next(tw);
2142		}
2143		if (tw) {
2144			cur = tw;
2145			goto out;
2146		}
2147		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2148		st->state = TCP_SEQ_STATE_ESTABLISHED;
2149
2150		/* Look for the next non-empty bucket */
2151		st->offset = 0;
2152		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2153				empty_bucket(st))
2154			;
2155		if (st->bucket > tcp_hashinfo.ehash_mask)
2156			return NULL;
2157
2158		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2159		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2160	} else
2161		sk = sk_nulls_next(sk);
2162
2163	sk_nulls_for_each_from(sk, node) {
2164		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2165			goto found;
2166	}
2167
2168	st->state = TCP_SEQ_STATE_TIME_WAIT;
2169	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2170	goto get_tw;
2171found:
2172	cur = sk;
2173out:
2174	return cur;
2175}
2176
2177static void *established_get_idx(struct seq_file *seq, loff_t pos)
2178{
2179	struct tcp_iter_state *st = seq->private;
2180	void *rc;
2181
2182	st->bucket = 0;
2183	rc = established_get_first(seq);
2184
2185	while (rc && pos) {
2186		rc = established_get_next(seq, rc);
2187		--pos;
2188	}
2189	return rc;
2190}
2191
2192static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2193{
2194	void *rc;
2195	struct tcp_iter_state *st = seq->private;
2196
2197	st->state = TCP_SEQ_STATE_LISTENING;
2198	rc	  = listening_get_idx(seq, &pos);
2199
2200	if (!rc) {
2201		st->state = TCP_SEQ_STATE_ESTABLISHED;
2202		rc	  = established_get_idx(seq, pos);
2203	}
2204
2205	return rc;
2206}
2207
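/* Resume the walk at the bucket and in-bucket offset remembered from the
 * previous pass, so seeking back to last_pos does not rescan earlier
 * buckets; st->num is restored afterwards.
 */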
2208static void *tcp_seek_last_pos(struct seq_file *seq)
2209{
2210	struct tcp_iter_state *st = seq->private;
2211	int offset = st->offset;
2212	int orig_num = st->num;
2213	void *rc = NULL;
2214
2215	switch (st->state) {
2216	case TCP_SEQ_STATE_OPENREQ:
2217	case TCP_SEQ_STATE_LISTENING:
2218		if (st->bucket >= INET_LHTABLE_SIZE)
2219			break;
2220		st->state = TCP_SEQ_STATE_LISTENING;
2221		rc = listening_get_next(seq, NULL);
2222		while (offset-- && rc)
2223			rc = listening_get_next(seq, rc);
2224		if (rc)
2225			break;
2226		st->bucket = 0;
2227		/* Fallthrough */
2228	case TCP_SEQ_STATE_ESTABLISHED:
2229	case TCP_SEQ_STATE_TIME_WAIT:
2230		st->state = TCP_SEQ_STATE_ESTABLISHED;
2231		if (st->bucket > tcp_hashinfo.ehash_mask)
2232			break;
2233		rc = established_get_first(seq);
2234		while (offset-- && rc)
2235			rc = established_get_next(seq, rc);
2236	}
2237
2238	st->num = orig_num;
2239
2240	return rc;
2241}
2242
2243static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2244{
2245	struct tcp_iter_state *st = seq->private;
2246	void *rc;
2247
2248	if (*pos && *pos == st->last_pos) {
2249		rc = tcp_seek_last_pos(seq);
2250		if (rc)
2251			goto out;
2252	}
2253
2254	st->state = TCP_SEQ_STATE_LISTENING;
2255	st->num = 0;
2256	st->bucket = 0;
2257	st->offset = 0;
2258	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2259
2260out:
2261	st->last_pos = *pos;
2262	return rc;
2263}
2264
2265static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2266{
2267	struct tcp_iter_state *st = seq->private;
2268	void *rc = NULL;
2269
2270	if (v == SEQ_START_TOKEN) {
2271		rc = tcp_get_idx(seq, 0);
2272		goto out;
2273	}
2274
2275	switch (st->state) {
2276	case TCP_SEQ_STATE_OPENREQ:
2277	case TCP_SEQ_STATE_LISTENING:
2278		rc = listening_get_next(seq, v);
2279		if (!rc) {
2280			st->state = TCP_SEQ_STATE_ESTABLISHED;
2281			st->bucket = 0;
2282			st->offset = 0;
2283			rc	  = established_get_first(seq);
2284		}
2285		break;
2286	case TCP_SEQ_STATE_ESTABLISHED:
2287	case TCP_SEQ_STATE_TIME_WAIT:
2288		rc = established_get_next(seq, v);
2289		break;
2290	}
2291out:
2292	++*pos;
2293	st->last_pos = *pos;
2294	return rc;
2295}
2296
2297static void tcp_seq_stop(struct seq_file *seq, void *v)
2298{
2299	struct tcp_iter_state *st = seq->private;
2300
2301	switch (st->state) {
2302	case TCP_SEQ_STATE_OPENREQ:
2303		if (v) {
2304			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2305			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2306		}
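		/* Fall through: the listening bucket lock is still held. */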
2307	case TCP_SEQ_STATE_LISTENING:
2308		if (v != SEQ_START_TOKEN)
2309			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2310		break;
2311	case TCP_SEQ_STATE_TIME_WAIT:
2312	case TCP_SEQ_STATE_ESTABLISHED:
2313		if (v)
2314			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2315		break;
2316	}
2317}
2318
2319static int tcp_seq_open(struct inode *inode, struct file *file)
2320{
2321	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2322	struct tcp_iter_state *s;
2323	int err;
2324
2325	err = seq_open_net(inode, file, &afinfo->seq_ops,
2326			  sizeof(struct tcp_iter_state));
2327	if (err < 0)
2328		return err;
2329
2330	s = ((struct seq_file *)file->private_data)->private;
2331	s->family		= afinfo->family;
2332	s->last_pos		= 0;
2333	return 0;
2334}
2335
2336int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2337{
2338	int rc = 0;
2339	struct proc_dir_entry *p;
2340
2341	afinfo->seq_fops.open		= tcp_seq_open;
2342	afinfo->seq_fops.read		= seq_read;
2343	afinfo->seq_fops.llseek		= seq_lseek;
2344	afinfo->seq_fops.release	= seq_release_net;
2345
2346	afinfo->seq_ops.start		= tcp_seq_start;
2347	afinfo->seq_ops.next		= tcp_seq_next;
2348	afinfo->seq_ops.stop		= tcp_seq_stop;
2349
2350	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2351			     &afinfo->seq_fops, afinfo);
2352	if (!p)
2353		rc = -ENOMEM;
2354	return rc;
2355}
2356EXPORT_SYMBOL(tcp_proc_register);
2357
2358void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2359{
2360	proc_net_remove(net, afinfo->name);
2361}
2362EXPORT_SYMBOL(tcp_proc_unregister);
2363
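/* Print one half-open connection (SYN_RECV request socket) in
 * /proc/net/tcp format.
 */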
2364static void get_openreq4(struct sock *sk, struct request_sock *req,
2365			 struct seq_file *f, int i, int uid, int *len)
2366{
2367	const struct inet_request_sock *ireq = inet_rsk(req);
2368	int ttd = req->expires - jiffies;
2369
2370	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2372		i,
2373		ireq->loc_addr,
2374		ntohs(inet_sk(sk)->inet_sport),
2375		ireq->rmt_addr,
2376		ntohs(ireq->rmt_port),
2377		TCP_SYN_RECV,
2378		0, 0, /* could print option size, but that is af dependent. */
2379		1,    /* timers active (only the expire timer) */
2380		jiffies_to_clock_t(ttd),
2381		req->retrans,
2382		uid,
2383		0,  /* non standard timer */
2384		0, /* open_requests have no inode */
2385		atomic_read(&sk->sk_refcnt),
2386		req,
2387		len);
2388}
2389
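/* Print one listening or established TCP socket in /proc/net/tcp format. */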
2390static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2391{
2392	int timer_active;
2393	unsigned long timer_expires;
2394	struct tcp_sock *tp = tcp_sk(sk);
2395	const struct inet_connection_sock *icsk = inet_csk(sk);
2396	struct inet_sock *inet = inet_sk(sk);
2397	__be32 dest = inet->inet_daddr;
2398	__be32 src = inet->inet_rcv_saddr;
2399	__u16 destp = ntohs(inet->inet_dport);
2400	__u16 srcp = ntohs(inet->inet_sport);
2401	int rx_queue;
2402
2403	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2404		timer_active	= 1;
2405		timer_expires	= icsk->icsk_timeout;
2406	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2407		timer_active	= 4;
2408		timer_expires	= icsk->icsk_timeout;
2409	} else if (timer_pending(&sk->sk_timer)) {
2410		timer_active	= 2;
2411		timer_expires	= sk->sk_timer.expires;
2412	} else {
2413		timer_active	= 0;
2414		timer_expires = jiffies;
2415	}
2416
2417	if (sk->sk_state == TCP_LISTEN)
2418		rx_queue = sk->sk_ack_backlog;
2419	else
2420		/*
2421		 * Because we don't lock the socket, we might find a transient negative value.
2422		 */
2423		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2424
2425	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2426			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2427		i, src, srcp, dest, destp, sk->sk_state,
2428		tp->write_seq - tp->snd_una,
2429		rx_queue,
2430		timer_active,
2431		jiffies_to_clock_t(timer_expires - jiffies),
2432		icsk->icsk_retransmits,
2433		sock_i_uid(sk),
2434		icsk->icsk_probes_out,
2435		sock_i_ino(sk),
2436		atomic_read(&sk->sk_refcnt), sk,
2437		jiffies_to_clock_t(icsk->icsk_rto),
2438		jiffies_to_clock_t(icsk->icsk_ack.ato),
2439		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2440		tp->snd_cwnd,
2441		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2442		len);
2443}
2444
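/* Print one TIME_WAIT socket in /proc/net/tcp format. */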
2445static void get_timewait4_sock(struct inet_timewait_sock *tw,
2446			       struct seq_file *f, int i, int *len)
2447{
2448	__be32 dest, src;
2449	__u16 destp, srcp;
2450	int ttd = tw->tw_ttd - jiffies;
2451
2452	if (ttd < 0)
2453		ttd = 0;
2454
2455	dest  = tw->tw_daddr;
2456	src   = tw->tw_rcv_saddr;
2457	destp = ntohs(tw->tw_dport);
2458	srcp  = ntohs(tw->tw_sport);
2459
2460	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2461		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2462		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2463		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2464		atomic_read(&tw->tw_refcnt), tw, len);
2465}
2466
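/* Fixed line width: tcp4_seq_show() pads every /proc/net/tcp entry to this length. */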
2467#define TMPSZ 150
2468
2469static int tcp4_seq_show(struct seq_file *seq, void *v)
2470{
2471	struct tcp_iter_state *st;
2472	int len;
2473
2474	if (v == SEQ_START_TOKEN) {
2475		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2476			   "  sl  local_address rem_address   st tx_queue "
2477			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2478			   "inode");
2479		goto out;
2480	}
2481	st = seq->private;
2482
2483	switch (st->state) {
2484	case TCP_SEQ_STATE_LISTENING:
2485	case TCP_SEQ_STATE_ESTABLISHED:
2486		get_tcp4_sock(v, seq, st->num, &len);
2487		break;
2488	case TCP_SEQ_STATE_OPENREQ:
2489		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2490		break;
2491	case TCP_SEQ_STATE_TIME_WAIT:
2492		get_timewait4_sock(v, seq, st->num, &len);
2493		break;
2494	}
2495	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2496out:
2497	return 0;
2498}
2499
2500static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2501	.name		= "tcp",
2502	.family		= AF_INET,
2503	.seq_fops	= {
2504		.owner		= THIS_MODULE,
2505	},
2506	.seq_ops	= {
2507		.show		= tcp4_seq_show,
2508	},
2509};
2510
2511static int __net_init tcp4_proc_init_net(struct net *net)
2512{
2513	return tcp_proc_register(net, &tcp4_seq_afinfo);
2514}
2515
2516static void __net_exit tcp4_proc_exit_net(struct net *net)
2517{
2518	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2519}
2520
2521static struct pernet_operations tcp4_net_ops = {
2522	.init = tcp4_proc_init_net,
2523	.exit = tcp4_proc_exit_net,
2524};
2525
2526int __init tcp4_proc_init(void)
2527{
2528	return register_pernet_subsys(&tcp4_net_ops);
2529}
2530
2531void tcp4_proc_exit(void)
2532{
2533	unregister_pernet_subsys(&tcp4_net_ops);
2534}
2535#endif /* CONFIG_PROC_FS */
2536
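/* GRO receive hook: validate the TCP checksum when a full checksum is
 * available; packets that cannot be verified are flagged for flush instead
 * of being merged, otherwise the segment is handed to tcp_gro_receive().
 */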
2537struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2538{
2539	const struct iphdr *iph = skb_gro_network_header(skb);
2540
2541	switch (skb->ip_summed) {
2542	case CHECKSUM_COMPLETE:
2543		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2544				  skb->csum)) {
2545			skb->ip_summed = CHECKSUM_UNNECESSARY;
2546			break;
2547		}
2548
2549		/* fall through */
2550	case CHECKSUM_NONE:
2551		NAPI_GRO_CB(skb)->flush = 1;
2552		return NULL;
2553	}
2554
2555	return tcp_gro_receive(head, skb);
2556}
2557
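/* GRO complete hook: store the pseudo-header checksum in th->check and
 * mark the merged skb as TCPv4 GSO.
 */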
2558int tcp4_gro_complete(struct sk_buff *skb)
2559{
2560	const struct iphdr *iph = ip_hdr(skb);
2561	struct tcphdr *th = tcp_hdr(skb);
2562
2563	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2564				  iph->saddr, iph->daddr, 0);
2565	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2566
2567	return tcp_gro_complete(skb);
2568}
2569
2570struct proto tcp_prot = {
2571	.name			= "TCP",
2572	.owner			= THIS_MODULE,
2573	.close			= tcp_close,
2574	.connect		= tcp_v4_connect,
2575	.disconnect		= tcp_disconnect,
2576	.accept			= inet_csk_accept,
2577	.ioctl			= tcp_ioctl,
2578	.init			= tcp_v4_init_sock,
2579	.destroy		= tcp_v4_destroy_sock,
2580	.shutdown		= tcp_shutdown,
2581	.setsockopt		= tcp_setsockopt,
2582	.getsockopt		= tcp_getsockopt,
2583	.recvmsg		= tcp_recvmsg,
2584	.sendmsg		= tcp_sendmsg,
2585	.sendpage		= tcp_sendpage,
2586	.backlog_rcv		= tcp_v4_do_rcv,
2587	.hash			= inet_hash,
2588	.unhash			= inet_unhash,
2589	.get_port		= inet_csk_get_port,
2590	.enter_memory_pressure	= tcp_enter_memory_pressure,
2591	.sockets_allocated	= &tcp_sockets_allocated,
2592	.orphan_count		= &tcp_orphan_count,
2593	.memory_allocated	= &tcp_memory_allocated,
2594	.memory_pressure	= &tcp_memory_pressure,
2595	.sysctl_mem		= sysctl_tcp_mem,
2596	.sysctl_wmem		= sysctl_tcp_wmem,
2597	.sysctl_rmem		= sysctl_tcp_rmem,
2598	.max_header		= MAX_TCP_HEADER,
2599	.obj_size		= sizeof(struct tcp_sock),
2600	.slab_flags		= SLAB_DESTROY_BY_RCU,
2601	.twsk_prot		= &tcp_timewait_sock_ops,
2602	.rsk_prot		= &tcp_request_sock_ops,
2603	.h.hashinfo		= &tcp_hashinfo,
2604	.no_autobind		= true,
2605#ifdef CONFIG_COMPAT
2606	.compat_setsockopt	= compat_tcp_setsockopt,
2607	.compat_getsockopt	= compat_tcp_getsockopt,
2608#endif
2609};
2610EXPORT_SYMBOL(tcp_prot);
2611
2612
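/* Per-namespace init/exit: create and destroy the kernel control socket
 * used when sending resets and ACKs that are not tied to a full socket.
 */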
2613static int __net_init tcp_sk_init(struct net *net)
2614{
2615	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2616				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2617}
2618
2619static void __net_exit tcp_sk_exit(struct net *net)
2620{
2621	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2622}
2623
2624static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2625{
2626	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2627}
2628
2629static struct pernet_operations __net_initdata tcp_sk_ops = {
2630	.init		= tcp_sk_init,
2631	.exit		= tcp_sk_exit,
2632	.exit_batch	= tcp_sk_exit_batch,
2633};
2634
2635void __init tcp_v4_init(void)
2636{
2637	inet_hashinfo_init(&tcp_hashinfo);
2638	if (register_pernet_subsys(&tcp_sk_ops))
2639		panic("Failed to create the TCP control socket.\n");
2640}
2641