tcp_ipv4.c revision a48eff128865aa20520fa6e0e0c5fbd2ac50d712
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90#ifdef CONFIG_TCP_MD5SIG
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92						   __be32 addr);
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99	return NULL;
100}
101#endif
102
103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
105
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: the timestamp cache is held
	   not per host but per port pair, and the TW bucket is used as the
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
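		/* Pick the new write_seq beyond anything the old incarnation
		 * of this connection could have used (its snd_nxt plus the
		 * largest unscaled window), so stray old segments cannot be
		 * mistaken for new data.
		 */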
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	__be16 orig_sport, orig_dport;
153	__be32 daddr, nexthop;
154	struct flowi4 *fl4;
155	struct rtable *rt;
156	int err;
157	struct ip_options_rcu *inet_opt;
158
159	if (addr_len < sizeof(struct sockaddr_in))
160		return -EINVAL;
161
162	if (usin->sin_family != AF_INET)
163		return -EAFNOSUPPORT;
164
165	nexthop = daddr = usin->sin_addr.s_addr;
166	inet_opt = rcu_dereference_protected(inet->inet_opt,
167					     sock_owned_by_user(sk));
168	if (inet_opt && inet_opt->opt.srr) {
169		if (!daddr)
170			return -EINVAL;
171		nexthop = inet_opt->opt.faddr;
172	}
173
174	orig_sport = inet->inet_sport;
175	orig_dport = usin->sin_port;
176	fl4 = &inet->cork.fl.u.ip4;
177	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
178			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
179			      IPPROTO_TCP,
180			      orig_sport, orig_dport, sk, true);
181	if (IS_ERR(rt)) {
182		err = PTR_ERR(rt);
183		if (err == -ENETUNREACH)
184			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
185		return err;
186	}
187
188	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189		ip_rt_put(rt);
190		return -ENETUNREACH;
191	}
192
193	if (!inet_opt || !inet_opt->opt.srr)
194		daddr = fl4->daddr;
195
196	if (!inet->inet_saddr)
197		inet->inet_saddr = fl4->saddr;
198	inet->inet_rcv_saddr = inet->inet_saddr;
199
200	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
201		/* Reset inherited state */
202		tp->rx_opt.ts_recent	   = 0;
203		tp->rx_opt.ts_recent_stamp = 0;
204		tp->write_seq		   = 0;
205	}
206
207	if (tcp_death_row.sysctl_tw_recycle &&
208	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
209		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
210		/*
211		 * VJ's idea. We save last timestamp seen from
212		 * the destination in peer table, when entering state
213		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
214		 * when trying new connection.
215		 */
216		if (peer) {
217			inet_peer_refcheck(peer);
218			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
219				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
220				tp->rx_opt.ts_recent = peer->tcp_ts;
221			}
222		}
223	}
224
225	inet->inet_dport = usin->sin_port;
226	inet->inet_daddr = daddr;
227
228	inet_csk(sk)->icsk_ext_hdr_len = 0;
229	if (inet_opt)
230		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
231
232	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
233
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
239	tcp_set_state(sk, TCP_SYN_SENT);
240	err = inet_hash_connect(&tcp_death_row, sk);
241	if (err)
242		goto failure;
243
244	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
245			       inet->inet_sport, inet->inet_dport, sk);
246	if (IS_ERR(rt)) {
247		err = PTR_ERR(rt);
248		rt = NULL;
249		goto failure;
250	}
251	/* OK, now commit destination to socket.  */
252	sk->sk_gso_type = SKB_GSO_TCPV4;
253	sk_setup_caps(sk, &rt->dst);
254
255	if (!tp->write_seq)
256		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
257							   inet->inet_daddr,
258							   inet->inet_sport,
259							   usin->sin_port);
260
261	inet->inet_id = tp->write_seq ^ jiffies;
262
263	err = tcp_connect(sk);
264	rt = NULL;
265	if (err)
266		goto failure;
267
268	return 0;
269
270failure:
271	/*
272	 * This unhashes the socket and releases the local port,
273	 * if necessary.
274	 */
275	tcp_set_state(sk, TCP_CLOSE);
276	ip_rt_put(rt);
277	sk->sk_route_caps = 0;
278	inet->inet_dport = 0;
279	return err;
280}
281EXPORT_SYMBOL(tcp_v4_connect);
282
283/*
284 * This routine does path mtu discovery as defined in RFC1191.
285 */
286static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
287{
288	struct dst_entry *dst;
289	struct inet_sock *inet = inet_sk(sk);
290
	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
295	if (sk->sk_state == TCP_LISTEN)
296		return;
297
	/* We don't check in the dst entry whether pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
304	if ((dst = __sk_dst_check(sk, 0)) == NULL)
305		return;
306
307	dst->ops->update_pmtu(dst, mtu);
308
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
312	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
313		sk->sk_err_soft = EMSGSIZE;
314
315	mtu = dst_mtu(dst);
316
317	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
318	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
319		tcp_sync_mss(sk, mtu);
320
321		/* Resend the TCP packet because it's
322		 * clear that the old packet has been
323		 * dropped. This is the new "fast" path mtu
324		 * discovery.
325		 */
326		tcp_simple_retransmit(sk);
327	} /* else let the usual retransmit timer handle it */
328}
329
330/*
331 * This routine is called by the ICMP module when it gets some
332 * sort of error condition.  If err < 0 then the socket should
333 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment the
 * header points to the first 8 bytes of the TCP header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket, the ICMP is just dropped
 * and for some paths there is no check at all.
341 * A more general error queue to queue errors for later handling
342 * is probably better.
343 *
344 */
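/* Note: for ICMP_DEST_UNREACH with code ICMP_FRAG_NEEDED, the "info"
 * argument below carries the next-hop MTU reported by the ICMP layer,
 * which is what gets fed into do_pmtu_discovery() above.
 */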
345
346void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
347{
348	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
349	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
350	struct inet_connection_sock *icsk;
351	struct tcp_sock *tp;
352	struct inet_sock *inet;
353	const int type = icmp_hdr(icmp_skb)->type;
354	const int code = icmp_hdr(icmp_skb)->code;
355	struct sock *sk;
356	struct sk_buff *skb;
357	__u32 seq;
358	__u32 remaining;
359	int err;
360	struct net *net = dev_net(icmp_skb->dev);
361
362	if (icmp_skb->len < (iph->ihl << 2) + 8) {
363		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
364		return;
365	}
366
367	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
368			iph->saddr, th->source, inet_iif(icmp_skb));
369	if (!sk) {
370		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371		return;
372	}
373	if (sk->sk_state == TCP_TIME_WAIT) {
374		inet_twsk_put(inet_twsk(sk));
375		return;
376	}
377
378	bh_lock_sock(sk);
379	/* If too many ICMPs get dropped on busy
380	 * servers this needs to be solved differently.
381	 */
382	if (sock_owned_by_user(sk))
383		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
384
385	if (sk->sk_state == TCP_CLOSE)
386		goto out;
387
388	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
389		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
390		goto out;
391	}
392
393	icsk = inet_csk(sk);
394	tp = tcp_sk(sk);
395	seq = ntohl(th->seq);
396	if (sk->sk_state != TCP_LISTEN &&
397	    !between(seq, tp->snd_una, tp->snd_nxt)) {
398		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
399		goto out;
400	}
401
402	switch (type) {
403	case ICMP_SOURCE_QUENCH:
404		/* Just silently ignore these. */
405		goto out;
406	case ICMP_PARAMETERPROB:
407		err = EPROTO;
408		break;
409	case ICMP_DEST_UNREACH:
410		if (code > NR_ICMP_UNREACH)
411			goto out;
412
413		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
414			if (!sock_owned_by_user(sk))
415				do_pmtu_discovery(sk, iph, info);
416			goto out;
417		}
418
419		err = icmp_err_convert[code].errno;
420		/* check if icmp_skb allows revert of backoff
421		 * (see draft-zimmermann-tcp-lcd) */
422		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423			break;
424		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425		    !icsk->icsk_backoff)
426			break;
427
428		if (sock_owned_by_user(sk))
429			break;
430
431		icsk->icsk_backoff--;
432		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
433					 icsk->icsk_backoff;
434		tcp_bound_rto(sk);
435
436		skb = tcp_write_queue_head(sk);
437		BUG_ON(!skb);
438
439		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
440				tcp_time_stamp - TCP_SKB_CB(skb)->when);
441
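		/* "remaining" is what is left of the un-backed-off RTO,
		 * measured from when the head of the write queue was last
		 * sent; zero means the reverted timer would already have
		 * expired.
		 */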
442		if (remaining) {
443			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
444						  remaining, TCP_RTO_MAX);
445		} else {
			/* The RTO revert clocked out the retransmission.
			 * Will retransmit now. */
448			tcp_retransmit_timer(sk);
449		}
450
451		break;
452	case ICMP_TIME_EXCEEDED:
453		err = EHOSTUNREACH;
454		break;
455	default:
456		goto out;
457	}
458
459	switch (sk->sk_state) {
460		struct request_sock *req, **prev;
461	case TCP_LISTEN:
462		if (sock_owned_by_user(sk))
463			goto out;
464
465		req = inet_csk_search_req(sk, &prev, th->dest,
466					  iph->daddr, iph->saddr);
467		if (!req)
468			goto out;
469
470		/* ICMPs are not backlogged, hence we cannot get
471		   an established socket here.
472		 */
473		WARN_ON(req->sk);
474
475		if (seq != tcp_rsk(req)->snt_isn) {
476			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
477			goto out;
478		}
479
480		/*
481		 * Still in SYN_RECV, just remove it silently.
482		 * There is no good way to pass the error to the newly
483		 * created socket, and POSIX does not want network
484		 * errors returned from accept().
485		 */
486		inet_csk_reqsk_queue_drop(sk, req, prev);
487		goto out;
488
489	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen?
			       It can, e.g. if SYNs crossed.
			     */
493		if (!sock_owned_by_user(sk)) {
494			sk->sk_err = err;
495
496			sk->sk_error_report(sk);
497
498			tcp_done(sk);
499		} else {
500			sk->sk_err_soft = err;
501		}
502		goto out;
503	}
504
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 *							--ANK (980905)
	 */
520
521	inet = inet_sk(sk);
522	if (!sock_owned_by_user(sk) && inet->recverr) {
523		sk->sk_err = err;
524		sk->sk_error_report(sk);
525	} else	{ /* Only an error on timeout */
526		sk->sk_err_soft = err;
527	}
528
529out:
530	bh_unlock_sock(sk);
531	sock_put(sk);
532}
533
534static void __tcp_v4_send_check(struct sk_buff *skb,
535				__be32 saddr, __be32 daddr)
536{
537	struct tcphdr *th = tcp_hdr(skb);
538
539	if (skb->ip_summed == CHECKSUM_PARTIAL) {
540		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
541		skb->csum_start = skb_transport_header(skb) - skb->head;
542		skb->csum_offset = offsetof(struct tcphdr, check);
543	} else {
544		th->check = tcp_v4_check(skb->len, saddr, daddr,
545					 csum_partial(th,
546						      th->doff << 2,
547						      skb->csum));
548	}
549}
550
551/* This routine computes an IPv4 TCP checksum. */
552void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
553{
554	struct inet_sock *inet = inet_sk(sk);
555
556	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
557}
558EXPORT_SYMBOL(tcp_v4_send_check);
559
560int tcp_v4_gso_send_check(struct sk_buff *skb)
561{
562	const struct iphdr *iph;
563	struct tcphdr *th;
564
565	if (!pskb_may_pull(skb, sizeof(*th)))
566		return -EINVAL;
567
568	iph = ip_hdr(skb);
569	th = tcp_hdr(skb);
570
571	th->check = 0;
572	skb->ip_summed = CHECKSUM_PARTIAL;
573	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
574	return 0;
575}
576
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP. So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
589
590static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
591{
592	struct tcphdr *th = tcp_hdr(skb);
593	struct {
594		struct tcphdr th;
595#ifdef CONFIG_TCP_MD5SIG
596		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597#endif
598	} rep;
599	struct ip_reply_arg arg;
600#ifdef CONFIG_TCP_MD5SIG
601	struct tcp_md5sig_key *key;
602#endif
603	struct net *net;
604
605	/* Never send a reset in response to a reset. */
606	if (th->rst)
607		return;
608
609	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
610		return;
611
612	/* Swap the send and the receive. */
613	memset(&rep, 0, sizeof(rep));
614	rep.th.dest   = th->source;
615	rep.th.source = th->dest;
616	rep.th.doff   = sizeof(struct tcphdr) / 4;
617	rep.th.rst    = 1;
618
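	/* RFC 793 reset generation: if the offending segment carried an ACK,
	 * the RST takes its sequence number from that ACK value; otherwise we
	 * send seq 0 (rep was zeroed above) and acknowledge exactly the span
	 * the segment occupied, with SYN and FIN each counting as one.
	 */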
619	if (th->ack) {
620		rep.th.seq = th->ack_seq;
621	} else {
622		rep.th.ack = 1;
623		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624				       skb->len - (th->doff << 2));
625	}
626
627	memset(&arg, 0, sizeof(arg));
628	arg.iov[0].iov_base = (unsigned char *)&rep;
629	arg.iov[0].iov_len  = sizeof(rep.th);
630
631#ifdef CONFIG_TCP_MD5SIG
632	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
633	if (key) {
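		/* Two NOP bytes pad the MD5 option to a 4-byte boundary,
		 * followed by the option kind and length; the 16-byte digest
		 * itself is written into rep.opt[1..4] by
		 * tcp_v4_md5_hash_hdr() below.
		 */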
634		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
635				   (TCPOPT_NOP << 16) |
636				   (TCPOPT_MD5SIG << 8) |
637				   TCPOLEN_MD5SIG);
		/* Update our reply length and the length the TCP header claims (doff) */
639		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
640		rep.th.doff = arg.iov[0].iov_len / 4;
641
642		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
643				     key, ip_hdr(skb)->saddr,
644				     ip_hdr(skb)->daddr, &rep.th);
645	}
646#endif
647	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
648				      ip_hdr(skb)->saddr, /* XXX */
649				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
650	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
651	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
652
653	net = dev_net(skb_dst(skb)->dev);
654	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
655		      &arg, arg.iov[0].iov_len);
656
657	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
658	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
659}
660
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
664
665static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
666			    u32 win, u32 ts, int oif,
667			    struct tcp_md5sig_key *key,
668			    int reply_flags)
669{
670	struct tcphdr *th = tcp_hdr(skb);
671	struct {
672		struct tcphdr th;
673		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
674#ifdef CONFIG_TCP_MD5SIG
675			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
676#endif
677			];
678	} rep;
679	struct ip_reply_arg arg;
680	struct net *net = dev_net(skb_dst(skb)->dev);
681
682	memset(&rep.th, 0, sizeof(struct tcphdr));
683	memset(&arg, 0, sizeof(arg));
684
685	arg.iov[0].iov_base = (unsigned char *)&rep;
686	arg.iov[0].iov_len  = sizeof(rep.th);
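	/* If the peer used timestamps, echo them: NOP, NOP, kind and length,
	 * then TSval (our current clock) and TSecr (the peer's value "ts").
	 */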
687	if (ts) {
688		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
689				   (TCPOPT_TIMESTAMP << 8) |
690				   TCPOLEN_TIMESTAMP);
691		rep.opt[1] = htonl(tcp_time_stamp);
692		rep.opt[2] = htonl(ts);
693		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
694	}
695
696	/* Swap the send and the receive. */
697	rep.th.dest    = th->source;
698	rep.th.source  = th->dest;
699	rep.th.doff    = arg.iov[0].iov_len / 4;
700	rep.th.seq     = htonl(seq);
701	rep.th.ack_seq = htonl(ack);
702	rep.th.ack     = 1;
703	rep.th.window  = htons(win);
704
705#ifdef CONFIG_TCP_MD5SIG
706	if (key) {
707		int offset = (ts) ? 3 : 0;
708
709		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
710					  (TCPOPT_NOP << 16) |
711					  (TCPOPT_MD5SIG << 8) |
712					  TCPOLEN_MD5SIG);
713		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
714		rep.th.doff = arg.iov[0].iov_len/4;
715
716		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
717				    key, ip_hdr(skb)->saddr,
718				    ip_hdr(skb)->daddr, &rep.th);
719	}
720#endif
721	arg.flags = reply_flags;
722	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
723				      ip_hdr(skb)->saddr, /* XXX */
724				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
725	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
726	if (oif)
727		arg.bound_dev_if = oif;
728
729	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
730		      &arg, arg.iov[0].iov_len);
731
732	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
733}
734
735static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
736{
737	struct inet_timewait_sock *tw = inet_twsk(sk);
738	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
739
740	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
741			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
742			tcptw->tw_ts_recent,
743			tw->tw_bound_dev_if,
744			tcp_twsk_md5_key(tcptw),
745			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
746			);
747
748	inet_twsk_put(tw);
749}
750
751static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
752				  struct request_sock *req)
753{
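	/* Both the SYN we sent and the SYN we received consume one sequence
	 * number each, hence the +1 on snt_isn and rcv_isn below.
	 */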
754	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
755			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
756			req->ts_recent,
757			0,
758			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
759			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
760}
761
762/*
763 *	Send a SYN-ACK after having received a SYN.
764 *	This still operates on a request_sock only, not on a big
765 *	socket.
766 */
767static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768			      struct request_sock *req,
769			      struct request_values *rvp)
770{
771	const struct inet_request_sock *ireq = inet_rsk(req);
772	struct flowi4 fl4;
773	int err = -1;
774	struct sk_buff * skb;
775
776	/* First, grab a route. */
777	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
778		return -1;
779
780	skb = tcp_make_synack(sk, dst, req, rvp);
781
782	if (skb) {
783		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
784
785		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
786					    ireq->rmt_addr,
787					    ireq->opt);
788		err = net_xmit_eval(err);
789	}
790
791	dst_release(dst);
792	return err;
793}
794
795static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
796			      struct request_values *rvp)
797{
798	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
799	return tcp_v4_send_synack(sk, NULL, req, rvp);
800}
801
802/*
803 *	IPv4 request_sock destructor.
804 */
805static void tcp_v4_reqsk_destructor(struct request_sock *req)
806{
807	kfree(inet_rsk(req)->opt);
808}
809
810static void syn_flood_warning(const struct sk_buff *skb)
811{
812	const char *msg;
813
814#ifdef CONFIG_SYN_COOKIES
815	if (sysctl_tcp_syncookies)
816		msg = "Sending cookies";
817	else
818#endif
819		msg = "Dropping request";
820
821	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
822				ntohs(tcp_hdr(skb)->dest), msg);
823}
824
825/*
826 * Save and compile IPv4 options into the request_sock if needed.
827 */
828static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
829						  struct sk_buff *skb)
830{
831	const struct ip_options *opt = &(IPCB(skb)->opt);
832	struct ip_options_rcu *dopt = NULL;
833
834	if (opt && opt->optlen) {
835		int opt_size = sizeof(*dopt) + opt->optlen;
836
837		dopt = kmalloc(opt_size, GFP_ATOMIC);
838		if (dopt) {
839			if (ip_options_echo(&dopt->opt, skb)) {
840				kfree(dopt);
841				dopt = NULL;
842			}
843		}
844	}
845	return dopt;
846}
847
848#ifdef CONFIG_TCP_MD5SIG
849/*
850 * RFC2385 MD5 checksumming requires a mapping of
851 * IP address->MD5 Key.
852 * We need to maintain these in the sk structure.
853 */
854
855/* Find the Key structure for an address.  */
856static struct tcp_md5sig_key *
857			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
858{
859	struct tcp_sock *tp = tcp_sk(sk);
860	int i;
861
862	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
863		return NULL;
864	for (i = 0; i < tp->md5sig_info->entries4; i++) {
865		if (tp->md5sig_info->keys4[i].addr == addr)
866			return &tp->md5sig_info->keys4[i].base;
867	}
868	return NULL;
869}
870
871struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
872					 struct sock *addr_sk)
873{
874	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
875}
876EXPORT_SYMBOL(tcp_v4_md5_lookup);
877
878static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
879						      struct request_sock *req)
880{
881	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
882}
883
884/* This can be called on a newly created socket, from other files */
885int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
886		      u8 *newkey, u8 newkeylen)
887{
888	/* Add Key to the list */
889	struct tcp_md5sig_key *key;
890	struct tcp_sock *tp = tcp_sk(sk);
891	struct tcp4_md5sig_key *keys;
892
893	key = tcp_v4_md5_do_lookup(sk, addr);
894	if (key) {
895		/* Pre-existing entry - just update that one. */
896		kfree(key->key);
897		key->key = newkey;
898		key->keylen = newkeylen;
899	} else {
900		struct tcp_md5sig_info *md5sig;
901
902		if (!tp->md5sig_info) {
903			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
904						  GFP_ATOMIC);
905			if (!tp->md5sig_info) {
906				kfree(newkey);
907				return -ENOMEM;
908			}
909			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
910		}
911		if (tcp_alloc_md5sig_pool(sk) == NULL) {
912			kfree(newkey);
913			return -ENOMEM;
914		}
915		md5sig = tp->md5sig_info;
916
917		if (md5sig->alloced4 == md5sig->entries4) {
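			/* The key array is full: grow it by one slot and copy
			 * the existing entries across before appending the
			 * new key at the tail.
			 */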
918			keys = kmalloc((sizeof(*keys) *
919					(md5sig->entries4 + 1)), GFP_ATOMIC);
920			if (!keys) {
921				kfree(newkey);
922				tcp_free_md5sig_pool();
923				return -ENOMEM;
924			}
925
926			if (md5sig->entries4)
927				memcpy(keys, md5sig->keys4,
928				       sizeof(*keys) * md5sig->entries4);
929
930			/* Free old key list, and reference new one */
931			kfree(md5sig->keys4);
932			md5sig->keys4 = keys;
933			md5sig->alloced4++;
934		}
935		md5sig->entries4++;
936		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
937		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
938		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
939	}
940	return 0;
941}
942EXPORT_SYMBOL(tcp_v4_md5_do_add);
943
944static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
945			       u8 *newkey, u8 newkeylen)
946{
947	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
948				 newkey, newkeylen);
949}
950
951int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
952{
953	struct tcp_sock *tp = tcp_sk(sk);
954	int i;
955
956	for (i = 0; i < tp->md5sig_info->entries4; i++) {
957		if (tp->md5sig_info->keys4[i].addr == addr) {
958			/* Free the key */
959			kfree(tp->md5sig_info->keys4[i].base.key);
960			tp->md5sig_info->entries4--;
961
962			if (tp->md5sig_info->entries4 == 0) {
963				kfree(tp->md5sig_info->keys4);
964				tp->md5sig_info->keys4 = NULL;
965				tp->md5sig_info->alloced4 = 0;
966			} else if (tp->md5sig_info->entries4 != i) {
				/* Shift the remaining entries down over the hole */
968				memmove(&tp->md5sig_info->keys4[i],
969					&tp->md5sig_info->keys4[i+1],
970					(tp->md5sig_info->entries4 - i) *
971					 sizeof(struct tcp4_md5sig_key));
972			}
973			tcp_free_md5sig_pool();
974			return 0;
975		}
976	}
977	return -ENOENT;
978}
979EXPORT_SYMBOL(tcp_v4_md5_do_del);
980
981static void tcp_v4_clear_md5_list(struct sock *sk)
982{
983	struct tcp_sock *tp = tcp_sk(sk);
984
	/* Free each key, then the set of keys,
986	 * the crypto element, and then decrement our
987	 * hold on the last resort crypto.
988	 */
989	if (tp->md5sig_info->entries4) {
990		int i;
991		for (i = 0; i < tp->md5sig_info->entries4; i++)
992			kfree(tp->md5sig_info->keys4[i].base.key);
993		tp->md5sig_info->entries4 = 0;
994		tcp_free_md5sig_pool();
995	}
996	if (tp->md5sig_info->keys4) {
997		kfree(tp->md5sig_info->keys4);
998		tp->md5sig_info->keys4 = NULL;
999		tp->md5sig_info->alloced4  = 0;
1000	}
1001}
1002
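/* This is the kernel side of setsockopt(IPPROTO_TCP, TCP_MD5SIG).  A rough
 * userspace sketch (error handling omitted; "fd" and "peer_ip" are
 * placeholders, not anything defined here):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 4 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = peer_ip;
 *	memcpy(md5.tcpm_key, "test", 4);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes any existing key for that address.
 */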
1003static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1004				 int optlen)
1005{
1006	struct tcp_md5sig cmd;
1007	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1008	u8 *newkey;
1009
1010	if (optlen < sizeof(cmd))
1011		return -EINVAL;
1012
1013	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1014		return -EFAULT;
1015
1016	if (sin->sin_family != AF_INET)
1017		return -EINVAL;
1018
1019	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1020		if (!tcp_sk(sk)->md5sig_info)
1021			return -ENOENT;
1022		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1023	}
1024
1025	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1026		return -EINVAL;
1027
1028	if (!tcp_sk(sk)->md5sig_info) {
1029		struct tcp_sock *tp = tcp_sk(sk);
1030		struct tcp_md5sig_info *p;
1031
1032		p = kzalloc(sizeof(*p), sk->sk_allocation);
1033		if (!p)
1034			return -EINVAL;
1035
1036		tp->md5sig_info = p;
1037		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1038	}
1039
1040	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1041	if (!newkey)
1042		return -ENOMEM;
1043	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1044				 newkey, cmd.tcpm_keylen);
1045}
1046
1047static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1048					__be32 daddr, __be32 saddr, int nbytes)
1049{
1050	struct tcp4_pseudohdr *bp;
1051	struct scatterlist sg;
1052
1053	bp = &hp->md5_blk.ip4;
1054
1055	/*
1056	 * 1. the TCP pseudo-header (in the order: source IP address,
1057	 * destination IP address, zero-padded protocol number, and
1058	 * segment length)
1059	 */
1060	bp->saddr = saddr;
1061	bp->daddr = daddr;
1062	bp->pad = 0;
1063	bp->protocol = IPPROTO_TCP;
1064	bp->len = cpu_to_be16(nbytes);
1065
1066	sg_init_one(&sg, bp, sizeof(*bp));
1067	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1068}
1069
1070static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1071			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1072{
1073	struct tcp_md5sig_pool *hp;
1074	struct hash_desc *desc;
1075
1076	hp = tcp_get_md5sig_pool();
1077	if (!hp)
1078		goto clear_hash_noput;
1079	desc = &hp->md5_desc;
1080
1081	if (crypto_hash_init(desc))
1082		goto clear_hash;
1083	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1084		goto clear_hash;
1085	if (tcp_md5_hash_header(hp, th))
1086		goto clear_hash;
1087	if (tcp_md5_hash_key(hp, key))
1088		goto clear_hash;
1089	if (crypto_hash_final(desc, md5_hash))
1090		goto clear_hash;
1091
1092	tcp_put_md5sig_pool();
1093	return 0;
1094
1095clear_hash:
1096	tcp_put_md5sig_pool();
1097clear_hash_noput:
1098	memset(md5_hash, 0, 16);
1099	return 1;
1100}
1101
1102int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1103			struct sock *sk, struct request_sock *req,
1104			struct sk_buff *skb)
1105{
1106	struct tcp_md5sig_pool *hp;
1107	struct hash_desc *desc;
1108	struct tcphdr *th = tcp_hdr(skb);
1109	__be32 saddr, daddr;
1110
1111	if (sk) {
1112		saddr = inet_sk(sk)->inet_saddr;
1113		daddr = inet_sk(sk)->inet_daddr;
1114	} else if (req) {
1115		saddr = inet_rsk(req)->loc_addr;
1116		daddr = inet_rsk(req)->rmt_addr;
1117	} else {
1118		const struct iphdr *iph = ip_hdr(skb);
1119		saddr = iph->saddr;
1120		daddr = iph->daddr;
1121	}
1122
1123	hp = tcp_get_md5sig_pool();
1124	if (!hp)
1125		goto clear_hash_noput;
1126	desc = &hp->md5_desc;
1127
1128	if (crypto_hash_init(desc))
1129		goto clear_hash;
1130
1131	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1132		goto clear_hash;
1133	if (tcp_md5_hash_header(hp, th))
1134		goto clear_hash;
1135	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1136		goto clear_hash;
1137	if (tcp_md5_hash_key(hp, key))
1138		goto clear_hash;
1139	if (crypto_hash_final(desc, md5_hash))
1140		goto clear_hash;
1141
1142	tcp_put_md5sig_pool();
1143	return 0;
1144
1145clear_hash:
1146	tcp_put_md5sig_pool();
1147clear_hash_noput:
1148	memset(md5_hash, 0, 16);
1149	return 1;
1150}
1151EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1152
1153static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1154{
1155	/*
1156	 * This gets called for each TCP segment that arrives
1157	 * so we want to be efficient.
1158	 * We have 3 drop cases:
1159	 * o No MD5 hash and one expected.
1160	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
1162	 */
1163	__u8 *hash_location = NULL;
1164	struct tcp_md5sig_key *hash_expected;
1165	const struct iphdr *iph = ip_hdr(skb);
1166	struct tcphdr *th = tcp_hdr(skb);
1167	int genhash;
1168	unsigned char newhash[16];
1169
1170	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1171	hash_location = tcp_parse_md5sig_option(th);
1172
1173	/* We've parsed the options - do we have a hash? */
1174	if (!hash_expected && !hash_location)
1175		return 0;
1176
1177	if (hash_expected && !hash_location) {
1178		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1179		return 1;
1180	}
1181
1182	if (!hash_expected && hash_location) {
1183		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1184		return 1;
1185	}
1186
1187	/* Okay, so this is hash_expected and hash_location -
	 * so we need to compute and compare the MD5 hash.
1189	 */
1190	genhash = tcp_v4_md5_hash_skb(newhash,
1191				      hash_expected,
1192				      NULL, NULL, skb);
1193
1194	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1195		if (net_ratelimit()) {
1196			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1197			       &iph->saddr, ntohs(th->source),
1198			       &iph->daddr, ntohs(th->dest),
1199			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1200		}
1201		return 1;
1202	}
1203	return 0;
1204}
1205
1206#endif
1207
1208struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1209	.family		=	PF_INET,
1210	.obj_size	=	sizeof(struct tcp_request_sock),
1211	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1212	.send_ack	=	tcp_v4_reqsk_send_ack,
1213	.destructor	=	tcp_v4_reqsk_destructor,
1214	.send_reset	=	tcp_v4_send_reset,
1215	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1216};
1217
1218#ifdef CONFIG_TCP_MD5SIG
1219static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1220	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1221	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1222};
1223#endif
1224
1225int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1226{
1227	struct tcp_extend_values tmp_ext;
1228	struct tcp_options_received tmp_opt;
1229	u8 *hash_location;
1230	struct request_sock *req;
1231	struct inet_request_sock *ireq;
1232	struct tcp_sock *tp = tcp_sk(sk);
1233	struct dst_entry *dst = NULL;
1234	__be32 saddr = ip_hdr(skb)->saddr;
1235	__be32 daddr = ip_hdr(skb)->daddr;
1236	__u32 isn = TCP_SKB_CB(skb)->when;
1237#ifdef CONFIG_SYN_COOKIES
1238	int want_cookie = 0;
1239#else
1240#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1241#endif
1242
	/* Never answer SYNs sent to broadcast or multicast */
1244	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245		goto drop;
1246
	/* TW buckets are converted to open requests without
	 * limitations: they conserve resources and the peer is
	 * evidently a real one.
	 */
1251	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1252		if (net_ratelimit())
1253			syn_flood_warning(skb);
1254#ifdef CONFIG_SYN_COOKIES
1255		if (sysctl_tcp_syncookies) {
1256			want_cookie = 1;
1257		} else
1258#endif
1259		goto drop;
1260	}
1261
	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeouts.
	 */
1267	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1268		goto drop;
1269
1270	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1271	if (!req)
1272		goto drop;
1273
1274#ifdef CONFIG_TCP_MD5SIG
1275	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1276#endif
1277
1278	tcp_clear_options(&tmp_opt);
1279	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1280	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1281	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1282
1283	if (tmp_opt.cookie_plus > 0 &&
1284	    tmp_opt.saw_tstamp &&
1285	    !tp->rx_opt.cookie_out_never &&
1286	    (sysctl_tcp_cookie_size > 0 ||
1287	     (tp->cookie_values != NULL &&
1288	      tp->cookie_values->cookie_desired > 0))) {
1289		u8 *c;
1290		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1291		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1292
1293		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1294			goto drop_and_release;
1295
1296		/* Secret recipe starts with IP addresses */
1297		*mess++ ^= (__force u32)daddr;
1298		*mess++ ^= (__force u32)saddr;
1299
1300		/* plus variable length Initiator Cookie */
1301		c = (u8 *)mess;
1302		while (l-- > 0)
1303			*c++ ^= *hash_location++;
1304
1305#ifdef CONFIG_SYN_COOKIES
1306		want_cookie = 0;	/* not our kind of cookie */
1307#endif
1308		tmp_ext.cookie_out_never = 0; /* false */
1309		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1310	} else if (!tp->rx_opt.cookie_in_always) {
1311		/* redundant indications, but ensure initialization. */
1312		tmp_ext.cookie_out_never = 1; /* true */
1313		tmp_ext.cookie_plus = 0;
1314	} else {
1315		goto drop_and_release;
1316	}
1317	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1318
1319	if (want_cookie && !tmp_opt.saw_tstamp)
1320		tcp_clear_options(&tmp_opt);
1321
1322	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1323	tcp_openreq_init(req, &tmp_opt, skb);
1324
1325	ireq = inet_rsk(req);
1326	ireq->loc_addr = daddr;
1327	ireq->rmt_addr = saddr;
1328	ireq->no_srccheck = inet_sk(sk)->transparent;
1329	ireq->opt = tcp_v4_save_options(sk, skb);
1330
1331	if (security_inet_conn_request(sk, skb, req))
1332		goto drop_and_free;
1333
1334	if (!want_cookie || tmp_opt.tstamp_ok)
1335		TCP_ECN_create_request(req, tcp_hdr(skb));
1336
1337	if (want_cookie) {
1338		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1339		req->cookie_ts = tmp_opt.tstamp_ok;
1340	} else if (!isn) {
1341		struct inet_peer *peer = NULL;
1342		struct flowi4 fl4;
1343
		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table when entering
		 * TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit a live
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
1353		if (tmp_opt.saw_tstamp &&
1354		    tcp_death_row.sysctl_tw_recycle &&
1355		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1356		    fl4.daddr == saddr &&
1357		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1358			inet_peer_refcheck(peer);
1359			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1360			    (s32)(peer->tcp_ts - req->ts_recent) >
1361							TCP_PAWS_WINDOW) {
1362				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1363				goto drop_and_release;
1364			}
1365		}
1366		/* Kill the following clause, if you dislike this way. */
1367		else if (!sysctl_tcp_syncookies &&
1368			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1369			  (sysctl_max_syn_backlog >> 2)) &&
1370			 (!peer || !peer->tcp_ts_stamp) &&
1371			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is reserved for destinations proven
			 * to be alive.
			 * It means that we keep communicating with
			 * destinations that were already known
			 * at the moment the synflood started.
			 */
1379			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1380				       &saddr, ntohs(tcp_hdr(skb)->source));
1381			goto drop_and_release;
1382		}
1383
1384		isn = tcp_v4_init_sequence(skb);
1385	}
1386	tcp_rsk(req)->snt_isn = isn;
1387
1388	if (tcp_v4_send_synack(sk, dst, req,
1389			       (struct request_values *)&tmp_ext) ||
1390	    want_cookie)
1391		goto drop_and_free;
1392
1393	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1394	return 0;
1395
1396drop_and_release:
1397	dst_release(dst);
1398drop_and_free:
1399	reqsk_free(req);
1400drop:
1401	return 0;
1402}
1403EXPORT_SYMBOL(tcp_v4_conn_request);
1404
1405
1406/*
 * The three way handshake has completed - we got a valid ACK -
1408 * now create the new socket.
1409 */
1410struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1411				  struct request_sock *req,
1412				  struct dst_entry *dst)
1413{
1414	struct inet_request_sock *ireq;
1415	struct inet_sock *newinet;
1416	struct tcp_sock *newtp;
1417	struct sock *newsk;
1418#ifdef CONFIG_TCP_MD5SIG
1419	struct tcp_md5sig_key *key;
1420#endif
1421	struct ip_options_rcu *inet_opt;
1422
1423	if (sk_acceptq_is_full(sk))
1424		goto exit_overflow;
1425
1426	newsk = tcp_create_openreq_child(sk, req, skb);
1427	if (!newsk)
1428		goto exit_nonewsk;
1429
1430	newsk->sk_gso_type = SKB_GSO_TCPV4;
1431
1432	newtp		      = tcp_sk(newsk);
1433	newinet		      = inet_sk(newsk);
1434	ireq		      = inet_rsk(req);
1435	newinet->inet_daddr   = ireq->rmt_addr;
1436	newinet->inet_rcv_saddr = ireq->loc_addr;
1437	newinet->inet_saddr	      = ireq->loc_addr;
1438	inet_opt	      = ireq->opt;
1439	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1440	ireq->opt	      = NULL;
1441	newinet->mc_index     = inet_iif(skb);
1442	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1443	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444	if (inet_opt)
1445		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1446	newinet->inet_id = newtp->write_seq ^ jiffies;
1447
1448	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1449		goto put_and_exit;
1450
1451	sk_setup_caps(newsk, dst);
1452
1453	tcp_mtup_init(newsk);
1454	tcp_sync_mss(newsk, dst_mtu(dst));
1455	newtp->advmss = dst_metric_advmss(dst);
1456	if (tcp_sk(sk)->rx_opt.user_mss &&
1457	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1458		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1459
1460	tcp_initialize_rcv_mss(newsk);
1461
1462#ifdef CONFIG_TCP_MD5SIG
1463	/* Copy over the MD5 key from the original socket */
1464	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1465	if (key != NULL) {
1466		/*
1467		 * We're using one, so create a matching key
1468		 * on the newsk structure. If we fail to get
1469		 * memory, then we end up not copying the key
1470		 * across. Shucks.
1471		 */
1472		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1473		if (newkey != NULL)
1474			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1475					  newkey, key->keylen);
1476		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1477	}
1478#endif
1479
1480	if (__inet_inherit_port(sk, newsk) < 0)
1481		goto put_and_exit;
1482	__inet_hash_nolisten(newsk, NULL);
1483
1484	return newsk;
1485
1486exit_overflow:
1487	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488exit_nonewsk:
1489	dst_release(dst);
1490exit:
1491	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1492	return NULL;
1493put_and_exit:
1494	sock_put(newsk);
1495	goto exit;
1496}
1497EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498
1499static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1500{
1501	struct tcphdr *th = tcp_hdr(skb);
1502	const struct iphdr *iph = ip_hdr(skb);
1503	struct sock *nsk;
1504	struct request_sock **prev;
1505	/* Find possible connection requests. */
1506	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1507						       iph->saddr, iph->daddr);
1508	if (req)
1509		return tcp_check_req(sk, skb, req, prev);
1510
1511	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1512			th->source, iph->daddr, th->dest, inet_iif(skb));
1513
1514	if (nsk) {
1515		if (nsk->sk_state != TCP_TIME_WAIT) {
1516			bh_lock_sock(nsk);
1517			return nsk;
1518		}
1519		inet_twsk_put(inet_twsk(nsk));
1520		return NULL;
1521	}
1522
1523#ifdef CONFIG_SYN_COOKIES
1524	if (!th->syn)
1525		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1526#endif
1527	return sk;
1528}
1529
1530static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1531{
1532	const struct iphdr *iph = ip_hdr(skb);
1533
1534	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1535		if (!tcp_v4_check(skb->len, iph->saddr,
1536				  iph->daddr, skb->csum)) {
1537			skb->ip_summed = CHECKSUM_UNNECESSARY;
1538			return 0;
1539		}
1540	}
1541
1542	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1543				       skb->len, IPPROTO_TCP, 0);
1544
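	/* For short packets it is cheaper to verify the checksum immediately;
	 * longer packets keep the pseudo-header sum primed in skb->csum and
	 * are verified later, e.g. when the data is copied to user space.
	 */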
1545	if (skb->len <= 76) {
1546		return __skb_checksum_complete(skb);
1547	}
1548	return 0;
1549}
1550
1551
/* The socket must have its spinlock held when we get
1553 * here.
1554 *
1555 * We have a potential double-lock case here, so even when
1556 * doing backlog processing we use the BH locking scheme.
1557 * This is because we cannot sleep with the original spinlock
1558 * held.
1559 */
1560int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1561{
1562	struct sock *rsk;
1563#ifdef CONFIG_TCP_MD5SIG
1564	/*
1565	 * We really want to reject the packet as early as possible
1566	 * if:
	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1568	 *  o There is an MD5 option and we're not expecting one
1569	 */
1570	if (tcp_v4_inbound_md5_hash(sk, skb))
1571		goto discard;
1572#endif
1573
1574	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575		sock_rps_save_rxhash(sk, skb->rxhash);
1576		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1577			rsk = sk;
1578			goto reset;
1579		}
1580		return 0;
1581	}
1582
1583	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1584		goto csum_err;
1585
1586	if (sk->sk_state == TCP_LISTEN) {
1587		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1588		if (!nsk)
1589			goto discard;
1590
1591		if (nsk != sk) {
1592			if (tcp_child_process(sk, nsk, skb)) {
1593				rsk = nsk;
1594				goto reset;
1595			}
1596			return 0;
1597		}
1598	} else
1599		sock_rps_save_rxhash(sk, skb->rxhash);
1600
1601	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1602		rsk = sk;
1603		goto reset;
1604	}
1605	return 0;
1606
1607reset:
1608	tcp_v4_send_reset(rsk, skb);
1609discard:
1610	kfree_skb(skb);
1611	/* Be careful here. If this function gets more complicated and
1612	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1613	 * might be destroyed here. This current version compiles correctly,
1614	 * but you have been warned.
1615	 */
1616	return 0;
1617
1618csum_err:
1619	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1620	goto discard;
1621}
1622EXPORT_SYMBOL(tcp_v4_do_rcv);
1623
1624/*
1625 *	From tcp_input.c
1626 */
1627
1628int tcp_v4_rcv(struct sk_buff *skb)
1629{
1630	const struct iphdr *iph;
1631	struct tcphdr *th;
1632	struct sock *sk;
1633	int ret;
1634	struct net *net = dev_net(skb->dev);
1635
1636	if (skb->pkt_type != PACKET_HOST)
1637		goto discard_it;
1638
1639	/* Count it even if it's bad */
1640	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1641
1642	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1643		goto discard_it;
1644
1645	th = tcp_hdr(skb);
1646
1647	if (th->doff < sizeof(struct tcphdr) / 4)
1648		goto bad_packet;
1649	if (!pskb_may_pull(skb, th->doff * 4))
1650		goto discard_it;
1651
1652	/* An explanation is required here, I think.
1653	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
1655	 * So, we defer the checks. */
1656	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1657		goto bad_packet;
1658
1659	th = tcp_hdr(skb);
1660	iph = ip_hdr(skb);
1661	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1662	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1663				    skb->len - th->doff * 4);
1664	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1665	TCP_SKB_CB(skb)->when	 = 0;
1666	TCP_SKB_CB(skb)->flags	 = iph->tos;
1667	TCP_SKB_CB(skb)->sacked	 = 0;
1668
1669	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1670	if (!sk)
1671		goto no_tcp_socket;
1672
1673process:
1674	if (sk->sk_state == TCP_TIME_WAIT)
1675		goto do_time_wait;
1676
1677	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1678		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1679		goto discard_and_relse;
1680	}
1681
1682	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1683		goto discard_and_relse;
1684	nf_reset(skb);
1685
1686	if (sk_filter(sk, skb))
1687		goto discard_and_relse;
1688
1689	skb->dev = NULL;
1690
1691	bh_lock_sock_nested(sk);
1692	ret = 0;
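	/* If no user context owns the socket, let the segment take the DMA
	 * early-copy path (when enabled) or the prequeue so it can be handled
	 * cheaply from the receiving process; otherwise queue it on the
	 * backlog to be processed when the owner releases the socket.
	 */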
1693	if (!sock_owned_by_user(sk)) {
1694#ifdef CONFIG_NET_DMA
1695		struct tcp_sock *tp = tcp_sk(sk);
1696		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1697			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1698		if (tp->ucopy.dma_chan)
1699			ret = tcp_v4_do_rcv(sk, skb);
1700		else
1701#endif
1702		{
1703			if (!tcp_prequeue(sk, skb))
1704				ret = tcp_v4_do_rcv(sk, skb);
1705		}
1706	} else if (unlikely(sk_add_backlog(sk, skb))) {
1707		bh_unlock_sock(sk);
1708		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1709		goto discard_and_relse;
1710	}
1711	bh_unlock_sock(sk);
1712
1713	sock_put(sk);
1714
1715	return ret;
1716
1717no_tcp_socket:
1718	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1719		goto discard_it;
1720
1721	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1722bad_packet:
1723		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1724	} else {
1725		tcp_v4_send_reset(NULL, skb);
1726	}
1727
1728discard_it:
1729	/* Discard frame. */
1730	kfree_skb(skb);
1731	return 0;
1732
1733discard_and_relse:
1734	sock_put(sk);
1735	goto discard_it;
1736
1737do_time_wait:
1738	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1739		inet_twsk_put(inet_twsk(sk));
1740		goto discard_it;
1741	}
1742
1743	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1744		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1745		inet_twsk_put(inet_twsk(sk));
1746		goto discard_it;
1747	}
1748	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1749	case TCP_TW_SYN: {
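		/* A new SYN hit a TIME-WAIT socket and the timewait code says
		 * it may open a new connection: find a matching listener and
		 * restart processing as if the SYN had arrived for it.
		 */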
1750		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1751							&tcp_hashinfo,
1752							iph->daddr, th->dest,
1753							inet_iif(skb));
1754		if (sk2) {
1755			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1756			inet_twsk_put(inet_twsk(sk));
1757			sk = sk2;
1758			goto process;
1759		}
1760		/* Fall through to ACK */
1761	}
1762	case TCP_TW_ACK:
1763		tcp_v4_timewait_ack(sk, skb);
1764		break;
1765	case TCP_TW_RST:
1766		goto no_tcp_socket;
1767	case TCP_TW_SUCCESS:;
1768	}
1769	goto discard_it;
1770}
1771
1772struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1773{
1774	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1775	struct inet_sock *inet = inet_sk(sk);
1776	struct inet_peer *peer;
1777
1778	if (!rt ||
1779	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1780		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1781		*release_it = true;
1782	} else {
1783		if (!rt->peer)
1784			rt_bind_peer(rt, inet->inet_daddr, 1);
1785		peer = rt->peer;
1786		*release_it = false;
1787	}
1788
1789	return peer;
1790}
1791EXPORT_SYMBOL(tcp_v4_get_peer);
1792
1793void *tcp_v4_tw_get_peer(struct sock *sk)
1794{
1795	struct inet_timewait_sock *tw = inet_twsk(sk);
1796
1797	return inet_getpeer_v4(tw->tw_daddr, 1);
1798}
1799EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1800
1801static struct timewait_sock_ops tcp_timewait_sock_ops = {
1802	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1803	.twsk_unique	= tcp_twsk_unique,
1804	.twsk_destructor= tcp_twsk_destructor,
1805	.twsk_getpeer	= tcp_v4_tw_get_peer,
1806};
1807
1808const struct inet_connection_sock_af_ops ipv4_specific = {
1809	.queue_xmit	   = ip_queue_xmit,
1810	.send_check	   = tcp_v4_send_check,
1811	.rebuild_header	   = inet_sk_rebuild_header,
1812	.conn_request	   = tcp_v4_conn_request,
1813	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1814	.get_peer	   = tcp_v4_get_peer,
1815	.net_header_len	   = sizeof(struct iphdr),
1816	.setsockopt	   = ip_setsockopt,
1817	.getsockopt	   = ip_getsockopt,
1818	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1819	.sockaddr_len	   = sizeof(struct sockaddr_in),
1820	.bind_conflict	   = inet_csk_bind_conflict,
1821#ifdef CONFIG_COMPAT
1822	.compat_setsockopt = compat_ip_setsockopt,
1823	.compat_getsockopt = compat_ip_getsockopt,
1824#endif
1825};
1826EXPORT_SYMBOL(ipv4_specific);
1827
1828#ifdef CONFIG_TCP_MD5SIG
1829static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1830	.md5_lookup		= tcp_v4_md5_lookup,
1831	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1832	.md5_add		= tcp_v4_md5_add_func,
1833	.md5_parse		= tcp_v4_parse_md5_keys,
1834};
1835#endif
1836
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
1839 */
1840static int tcp_v4_init_sock(struct sock *sk)
1841{
1842	struct inet_connection_sock *icsk = inet_csk(sk);
1843	struct tcp_sock *tp = tcp_sk(sk);
1844
1845	skb_queue_head_init(&tp->out_of_order_queue);
1846	tcp_init_xmit_timers(sk);
1847	tcp_prequeue_init(tp);
1848
1849	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1850	tp->mdev = TCP_TIMEOUT_INIT;
1851
1852	/* So many TCP implementations out there (incorrectly) count the
1853	 * initial SYN frame in their delayed-ACK and congestion control
1854	 * algorithms that we must have the following bandaid to talk
1855	 * efficiently to them.  -DaveM
1856	 */
1857	tp->snd_cwnd = 2;
1858
1859	/* See draft-stevens-tcpca-spec-01 for discussion of the
1860	 * initialization of these values.
1861	 */
1862	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1863	tp->snd_cwnd_clamp = ~0;
1864	tp->mss_cache = TCP_MSS_DEFAULT;
1865
1866	tp->reordering = sysctl_tcp_reordering;
1867	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1868
1869	sk->sk_state = TCP_CLOSE;
1870
1871	sk->sk_write_space = sk_stream_write_space;
1872	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1873
1874	icsk->icsk_af_ops = &ipv4_specific;
1875	icsk->icsk_sync_mss = tcp_sync_mss;
1876#ifdef CONFIG_TCP_MD5SIG
1877	tp->af_specific = &tcp_sock_ipv4_specific;
1878#endif
1879
1880	/* TCP Cookie Transactions */
1881	if (sysctl_tcp_cookie_size > 0) {
1882		/* Default, cookies without s_data_payload. */
1883		tp->cookie_values =
1884			kzalloc(sizeof(*tp->cookie_values),
1885				sk->sk_allocation);
1886		if (tp->cookie_values != NULL)
1887			kref_init(&tp->cookie_values->kref);
1888	}
1889	/* Presumed zeroed, in order of appearance:
1890	 *	cookie_in_always, cookie_out_never,
1891	 *	s_data_constant, s_data_in, s_data_out
1892	 */
1893	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1894	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1895
1896	local_bh_disable();
1897	percpu_counter_inc(&tcp_sockets_allocated);
1898	local_bh_enable();
1899
1900	return 0;
1901}
1902
1903void tcp_v4_destroy_sock(struct sock *sk)
1904{
1905	struct tcp_sock *tp = tcp_sk(sk);
1906
1907	tcp_clear_xmit_timers(sk);
1908
1909	tcp_cleanup_congestion_control(sk);
1910
	/* Clean up the write buffer. */
1912	tcp_write_queue_purge(sk);
1913
1914	/* Cleans up our, hopefully empty, out_of_order_queue. */
1915	__skb_queue_purge(&tp->out_of_order_queue);
1916
1917#ifdef CONFIG_TCP_MD5SIG
1918	/* Clean up the MD5 key list, if any */
1919	if (tp->md5sig_info) {
1920		tcp_v4_clear_md5_list(sk);
1921		kfree(tp->md5sig_info);
1922		tp->md5sig_info = NULL;
1923	}
1924#endif
1925
1926#ifdef CONFIG_NET_DMA
1927	/* Cleans up our sk_async_wait_queue */
1928	__skb_queue_purge(&sk->sk_async_wait_queue);
1929#endif
1930
1931	/* Clean up the prequeue; it should already be empty. */
1932	__skb_queue_purge(&tp->ucopy.prequeue);
1933
1934	/* Clean up a referenced TCP bind bucket. */
1935	if (inet_csk(sk)->icsk_bind_hash)
1936		inet_put_port(sk);
1937
1938	/*
1939	 * If a cached sendmsg page exists, toss it.
1940	 */
1941	if (sk->sk_sndmsg_page) {
1942		__free_page(sk->sk_sndmsg_page);
1943		sk->sk_sndmsg_page = NULL;
1944	}
1945
1946	/* TCP Cookie Transactions */
1947	if (tp->cookie_values != NULL) {
1948		kref_put(&tp->cookie_values->kref,
1949			 tcp_cookie_values_release);
1950		tp->cookie_values = NULL;
1951	}
1952
1953	percpu_counter_dec(&tcp_sockets_allocated);
1954}
1955EXPORT_SYMBOL(tcp_v4_destroy_sock);
1956
1957#ifdef CONFIG_PROC_FS
1958/* Proc filesystem TCP sock list dumping. */
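/*
 * The iterators below walk the listening hash first (descending into each
 * listener's SYN queue, TCP_SEQ_STATE_OPENREQ, under that listener's
 * syn_wait_lock), then the established hash, whose buckets also carry
 * TIME_WAIT sockets on their twchain.  st->bucket, st->offset and
 * st->last_pos record where a partial read stopped so the next read can
 * resume there.
 */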
1959
1960static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1961{
1962	return hlist_nulls_empty(head) ? NULL :
1963		list_entry(head->first, struct inet_timewait_sock, tw_node);
1964}
1965
1966static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1967{
1968	return !is_a_nulls(tw->tw_node.next) ?
1969		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1970}
1971
1972/*
1973 * Get the next listener socket following cur.  If cur is NULL, get the first
1974 * socket starting from the bucket given in st->bucket; when st->bucket is
1975 * zero the very first socket in the hash table is returned.
1976 */
1977static void *listening_get_next(struct seq_file *seq, void *cur)
1978{
1979	struct inet_connection_sock *icsk;
1980	struct hlist_nulls_node *node;
1981	struct sock *sk = cur;
1982	struct inet_listen_hashbucket *ilb;
1983	struct tcp_iter_state *st = seq->private;
1984	struct net *net = seq_file_net(seq);
1985
1986	if (!sk) {
1987		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1988		spin_lock_bh(&ilb->lock);
1989		sk = sk_nulls_head(&ilb->head);
1990		st->offset = 0;
1991		goto get_sk;
1992	}
1993	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1994	++st->num;
1995	++st->offset;
1996
1997	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1998		struct request_sock *req = cur;
1999
2000		icsk = inet_csk(st->syn_wait_sk);
2001		req = req->dl_next;
2002		while (1) {
2003			while (req) {
2004				if (req->rsk_ops->family == st->family) {
2005					cur = req;
2006					goto out;
2007				}
2008				req = req->dl_next;
2009			}
2010			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2011				break;
2012get_req:
2013			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2014		}
2015		sk	  = sk_nulls_next(st->syn_wait_sk);
2016		st->state = TCP_SEQ_STATE_LISTENING;
2017		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2018	} else {
2019		icsk = inet_csk(sk);
2020		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2022			goto start_req;
2023		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024		sk = sk_nulls_next(sk);
2025	}
2026get_sk:
2027	sk_nulls_for_each_from(sk, node) {
2028		if (!net_eq(sock_net(sk), net))
2029			continue;
2030		if (sk->sk_family == st->family) {
2031			cur = sk;
2032			goto out;
2033		}
2034		icsk = inet_csk(sk);
2035		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2037start_req:
2038			st->uid		= sock_i_uid(sk);
2039			st->syn_wait_sk = sk;
2040			st->state	= TCP_SEQ_STATE_OPENREQ;
2041			st->sbucket	= 0;
2042			goto get_req;
2043		}
2044		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2045	}
2046	spin_unlock_bh(&ilb->lock);
2047	st->offset = 0;
2048	if (++st->bucket < INET_LHTABLE_SIZE) {
2049		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2050		spin_lock_bh(&ilb->lock);
2051		sk = sk_nulls_head(&ilb->head);
2052		goto get_sk;
2053	}
2054	cur = NULL;
2055out:
2056	return cur;
2057}
2058
2059static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2060{
2061	struct tcp_iter_state *st = seq->private;
2062	void *rc;
2063
2064	st->bucket = 0;
2065	st->offset = 0;
2066	rc = listening_get_next(seq, NULL);
2067
2068	while (rc && *pos) {
2069		rc = listening_get_next(seq, rc);
2070		--*pos;
2071	}
2072	return rc;
2073}
2074
2075static inline int empty_bucket(struct tcp_iter_state *st)
2076{
2077	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2078		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2079}
2080
2081/*
2082 * Get first established socket starting from bucket given in st->bucket.
2083 * If st->bucket is zero, the very first socket in the hash is returned.
2084 */
2085static void *established_get_first(struct seq_file *seq)
2086{
2087	struct tcp_iter_state *st = seq->private;
2088	struct net *net = seq_file_net(seq);
2089	void *rc = NULL;
2090
2091	st->offset = 0;
2092	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2093		struct sock *sk;
2094		struct hlist_nulls_node *node;
2095		struct inet_timewait_sock *tw;
2096		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2097
2098		/* Lockless fast path for the common case of empty buckets */
2099		if (empty_bucket(st))
2100			continue;
2101
2102		spin_lock_bh(lock);
2103		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2104			if (sk->sk_family != st->family ||
2105			    !net_eq(sock_net(sk), net)) {
2106				continue;
2107			}
2108			rc = sk;
2109			goto out;
2110		}
2111		st->state = TCP_SEQ_STATE_TIME_WAIT;
2112		inet_twsk_for_each(tw, node,
2113				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2114			if (tw->tw_family != st->family ||
2115			    !net_eq(twsk_net(tw), net)) {
2116				continue;
2117			}
2118			rc = tw;
2119			goto out;
2120		}
2121		spin_unlock_bh(lock);
2122		st->state = TCP_SEQ_STATE_ESTABLISHED;
2123	}
2124out:
2125	return rc;
2126}
2127
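/*
 * Advance within the current ehash bucket: first along its established
 * chain, then onto its TIME_WAIT chain, and finally on to the next
 * non-empty bucket, dropping one bucket lock before taking the next.
 */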
2128static void *established_get_next(struct seq_file *seq, void *cur)
2129{
2130	struct sock *sk = cur;
2131	struct inet_timewait_sock *tw;
2132	struct hlist_nulls_node *node;
2133	struct tcp_iter_state *st = seq->private;
2134	struct net *net = seq_file_net(seq);
2135
2136	++st->num;
2137	++st->offset;
2138
2139	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2140		tw = cur;
2141		tw = tw_next(tw);
2142get_tw:
2143		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2144			tw = tw_next(tw);
2145		}
2146		if (tw) {
2147			cur = tw;
2148			goto out;
2149		}
2150		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2151		st->state = TCP_SEQ_STATE_ESTABLISHED;
2152
2153		/* Look for the next non-empty bucket */
2154		st->offset = 0;
2155		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2156				empty_bucket(st))
2157			;
2158		if (st->bucket > tcp_hashinfo.ehash_mask)
2159			return NULL;
2160
2161		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2162		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2163	} else
2164		sk = sk_nulls_next(sk);
2165
2166	sk_nulls_for_each_from(sk, node) {
2167		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2168			goto found;
2169	}
2170
2171	st->state = TCP_SEQ_STATE_TIME_WAIT;
2172	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2173	goto get_tw;
2174found:
2175	cur = sk;
2176out:
2177	return cur;
2178}
2179
2180static void *established_get_idx(struct seq_file *seq, loff_t pos)
2181{
2182	struct tcp_iter_state *st = seq->private;
2183	void *rc;
2184
2185	st->bucket = 0;
2186	rc = established_get_first(seq);
2187
2188	while (rc && pos) {
2189		rc = established_get_next(seq, rc);
2190		--pos;
2191	}
2192	return rc;
2193}
2194
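/* Position the iterator on the pos'th matching socket, listeners first. */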
2195static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2196{
2197	void *rc;
2198	struct tcp_iter_state *st = seq->private;
2199
2200	st->state = TCP_SEQ_STATE_LISTENING;
2201	rc	  = listening_get_idx(seq, &pos);
2202
2203	if (!rc) {
2204		st->state = TCP_SEQ_STATE_ESTABLISHED;
2205		rc	  = established_get_idx(seq, pos);
2206	}
2207
2208	return rc;
2209}
2210
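/*
 * Fast-forward to the bucket and in-bucket offset recorded by the
 * previous read so that sequential reads of a large socket table do not
 * rescan every bucket from the beginning.
 */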
2211static void *tcp_seek_last_pos(struct seq_file *seq)
2212{
2213	struct tcp_iter_state *st = seq->private;
2214	int offset = st->offset;
2215	int orig_num = st->num;
2216	void *rc = NULL;
2217
2218	switch (st->state) {
2219	case TCP_SEQ_STATE_OPENREQ:
2220	case TCP_SEQ_STATE_LISTENING:
2221		if (st->bucket >= INET_LHTABLE_SIZE)
2222			break;
2223		st->state = TCP_SEQ_STATE_LISTENING;
2224		rc = listening_get_next(seq, NULL);
2225		while (offset-- && rc)
2226			rc = listening_get_next(seq, rc);
2227		if (rc)
2228			break;
2229		st->bucket = 0;
2230		/* Fallthrough */
2231	case TCP_SEQ_STATE_ESTABLISHED:
2232	case TCP_SEQ_STATE_TIME_WAIT:
2233		st->state = TCP_SEQ_STATE_ESTABLISHED;
2234		if (st->bucket > tcp_hashinfo.ehash_mask)
2235			break;
2236		rc = established_get_first(seq);
2237		while (offset-- && rc)
2238			rc = established_get_next(seq, rc);
2239	}
2240
2241	st->num = orig_num;
2242
2243	return rc;
2244}
2245
2246static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2247{
2248	struct tcp_iter_state *st = seq->private;
2249	void *rc;
2250
2251	if (*pos && *pos == st->last_pos) {
2252		rc = tcp_seek_last_pos(seq);
2253		if (rc)
2254			goto out;
2255	}
2256
2257	st->state = TCP_SEQ_STATE_LISTENING;
2258	st->num = 0;
2259	st->bucket = 0;
2260	st->offset = 0;
2261	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2262
2263out:
2264	st->last_pos = *pos;
2265	return rc;
2266}
2267
2268static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2269{
2270	struct tcp_iter_state *st = seq->private;
2271	void *rc = NULL;
2272
2273	if (v == SEQ_START_TOKEN) {
2274		rc = tcp_get_idx(seq, 0);
2275		goto out;
2276	}
2277
2278	switch (st->state) {
2279	case TCP_SEQ_STATE_OPENREQ:
2280	case TCP_SEQ_STATE_LISTENING:
2281		rc = listening_get_next(seq, v);
2282		if (!rc) {
2283			st->state = TCP_SEQ_STATE_ESTABLISHED;
2284			st->bucket = 0;
2285			st->offset = 0;
2286			rc	  = established_get_first(seq);
2287		}
2288		break;
2289	case TCP_SEQ_STATE_ESTABLISHED:
2290	case TCP_SEQ_STATE_TIME_WAIT:
2291		rc = established_get_next(seq, v);
2292		break;
2293	}
2294out:
2295	++*pos;
2296	st->last_pos = *pos;
2297	return rc;
2298}
2299
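/*
 * Drop whatever locks ->start()/->next() left held for the current
 * position.  Note the deliberate fall-through below: in OPENREQ state
 * both the listener's syn_wait_lock and the listening-hash bucket lock
 * are held, and both must be released.
 */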
2300static void tcp_seq_stop(struct seq_file *seq, void *v)
2301{
2302	struct tcp_iter_state *st = seq->private;
2303
2304	switch (st->state) {
2305	case TCP_SEQ_STATE_OPENREQ:
2306		if (v) {
2307			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2308			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2309		}
2310	case TCP_SEQ_STATE_LISTENING:
2311		if (v != SEQ_START_TOKEN)
2312			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2313		break;
2314	case TCP_SEQ_STATE_TIME_WAIT:
2315	case TCP_SEQ_STATE_ESTABLISHED:
2316		if (v)
2317			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2318		break;
2319	}
2320}
2321
2322static int tcp_seq_open(struct inode *inode, struct file *file)
2323{
2324	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2325	struct tcp_iter_state *s;
2326	int err;
2327
2328	err = seq_open_net(inode, file, &afinfo->seq_ops,
2329			  sizeof(struct tcp_iter_state));
2330	if (err < 0)
2331		return err;
2332
2333	s = ((struct seq_file *)file->private_data)->private;
2334	s->family		= afinfo->family;
2335	s->last_pos 		= 0;
2336	return 0;
2337}
2338
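/*
 * Register a per-family entry under /proc/net.  The caller's
 * tcp_seq_afinfo only needs to provide .name, .family, the fops owner
 * and the ->show() method (see tcp4_seq_afinfo below); the common
 * open/read/llseek/release and start/next/stop hooks are filled in here.
 */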
2339int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2340{
2341	int rc = 0;
2342	struct proc_dir_entry *p;
2343
2344	afinfo->seq_fops.open		= tcp_seq_open;
2345	afinfo->seq_fops.read		= seq_read;
2346	afinfo->seq_fops.llseek		= seq_lseek;
2347	afinfo->seq_fops.release	= seq_release_net;
2348
2349	afinfo->seq_ops.start		= tcp_seq_start;
2350	afinfo->seq_ops.next		= tcp_seq_next;
2351	afinfo->seq_ops.stop		= tcp_seq_stop;
2352
2353	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2354			     &afinfo->seq_fops, afinfo);
2355	if (!p)
2356		rc = -ENOMEM;
2357	return rc;
2358}
2359EXPORT_SYMBOL(tcp_proc_register);
2360
2361void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2362{
2363	proc_net_remove(net, afinfo->name);
2364}
2365EXPORT_SYMBOL(tcp_proc_unregister);
2366
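/*
 * Formatters for the three kinds of entries shown in /proc/net/tcp.
 * Each emits one line matching the header printed by tcp4_seq_show():
 * slot, local and remote address:port in hex, state, tx_queue:rx_queue,
 * timer type:expiry, retransmit count, uid, timeout, inode, then the
 * reference count, the socket/request pointer and, for established
 * sockets, a few extra fields (rto, ato, snd_cwnd, ssthresh).
 */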
2367static void get_openreq4(struct sock *sk, struct request_sock *req,
2368			 struct seq_file *f, int i, int uid, int *len)
2369{
2370	const struct inet_request_sock *ireq = inet_rsk(req);
2371	int ttd = req->expires - jiffies;
2372
2373	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2375		i,
2376		ireq->loc_addr,
2377		ntohs(inet_sk(sk)->inet_sport),
2378		ireq->rmt_addr,
2379		ntohs(ireq->rmt_port),
2380		TCP_SYN_RECV,
2381		0, 0, /* could print option size, but that is af dependent. */
2382		1,    /* timers active (only the expire timer) */
2383		jiffies_to_clock_t(ttd),
2384		req->retrans,
2385		uid,
2386		0,  /* non standard timer */
2387		0, /* open_requests have no inode */
2388		atomic_read(&sk->sk_refcnt),
2389		req,
2390		len);
2391}
2392
2393static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2394{
2395	int timer_active;
2396	unsigned long timer_expires;
2397	struct tcp_sock *tp = tcp_sk(sk);
2398	const struct inet_connection_sock *icsk = inet_csk(sk);
2399	struct inet_sock *inet = inet_sk(sk);
2400	__be32 dest = inet->inet_daddr;
2401	__be32 src = inet->inet_rcv_saddr;
2402	__u16 destp = ntohs(inet->inet_dport);
2403	__u16 srcp = ntohs(inet->inet_sport);
2404	int rx_queue;
2405
2406	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2407		timer_active	= 1;
2408		timer_expires	= icsk->icsk_timeout;
2409	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2410		timer_active	= 4;
2411		timer_expires	= icsk->icsk_timeout;
2412	} else if (timer_pending(&sk->sk_timer)) {
2413		timer_active	= 2;
2414		timer_expires	= sk->sk_timer.expires;
2415	} else {
2416		timer_active	= 0;
2417		timer_expires = jiffies;
2418	}
2419
2420	if (sk->sk_state == TCP_LISTEN)
2421		rx_queue = sk->sk_ack_backlog;
2422	else
2423		/*
2424		 * Because we don't lock the socket, we might find a transient negative value.
2425		 */
2426		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2427
2428	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2429			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2430		i, src, srcp, dest, destp, sk->sk_state,
2431		tp->write_seq - tp->snd_una,
2432		rx_queue,
2433		timer_active,
2434		jiffies_to_clock_t(timer_expires - jiffies),
2435		icsk->icsk_retransmits,
2436		sock_i_uid(sk),
2437		icsk->icsk_probes_out,
2438		sock_i_ino(sk),
2439		atomic_read(&sk->sk_refcnt), sk,
2440		jiffies_to_clock_t(icsk->icsk_rto),
2441		jiffies_to_clock_t(icsk->icsk_ack.ato),
2442		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2443		tp->snd_cwnd,
2444		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2445		len);
2446}
2447
2448static void get_timewait4_sock(struct inet_timewait_sock *tw,
2449			       struct seq_file *f, int i, int *len)
2450{
2451	__be32 dest, src;
2452	__u16 destp, srcp;
2453	int ttd = tw->tw_ttd - jiffies;
2454
2455	if (ttd < 0)
2456		ttd = 0;
2457
2458	dest  = tw->tw_daddr;
2459	src   = tw->tw_rcv_saddr;
2460	destp = ntohs(tw->tw_dport);
2461	srcp  = ntohs(tw->tw_sport);
2462
2463	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2464		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2465		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2466		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2467		atomic_read(&tw->tw_refcnt), tw, len);
2468}
2469
2470#define TMPSZ 150
2471
2472static int tcp4_seq_show(struct seq_file *seq, void *v)
2473{
2474	struct tcp_iter_state *st;
2475	int len;
2476
2477	if (v == SEQ_START_TOKEN) {
2478		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2479			   "  sl  local_address rem_address   st tx_queue "
2480			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2481			   "inode");
2482		goto out;
2483	}
2484	st = seq->private;
2485
2486	switch (st->state) {
2487	case TCP_SEQ_STATE_LISTENING:
2488	case TCP_SEQ_STATE_ESTABLISHED:
2489		get_tcp4_sock(v, seq, st->num, &len);
2490		break;
2491	case TCP_SEQ_STATE_OPENREQ:
2492		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2493		break;
2494	case TCP_SEQ_STATE_TIME_WAIT:
2495		get_timewait4_sock(v, seq, st->num, &len);
2496		break;
2497	}
2498	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2499out:
2500	return 0;
2501}
2502
2503static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2504	.name		= "tcp",
2505	.family		= AF_INET,
2506	.seq_fops	= {
2507		.owner		= THIS_MODULE,
2508	},
2509	.seq_ops	= {
2510		.show		= tcp4_seq_show,
2511	},
2512};
2513
2514static int __net_init tcp4_proc_init_net(struct net *net)
2515{
2516	return tcp_proc_register(net, &tcp4_seq_afinfo);
2517}
2518
2519static void __net_exit tcp4_proc_exit_net(struct net *net)
2520{
2521	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2522}
2523
2524static struct pernet_operations tcp4_net_ops = {
2525	.init = tcp4_proc_init_net,
2526	.exit = tcp4_proc_exit_net,
2527};
2528
2529int __init tcp4_proc_init(void)
2530{
2531	return register_pernet_subsys(&tcp4_net_ops);
2532}
2533
2534void tcp4_proc_exit(void)
2535{
2536	unregister_pernet_subsys(&tcp4_net_ops);
2537}
2538#endif /* CONFIG_PROC_FS */
2539
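/*
 * GRO receive for IPv4 TCP: when the device supplies a full packet
 * checksum (CHECKSUM_COMPLETE), verify it against the pseudo-header and
 * mark the skb CHECKSUM_UNNECESSARY; packets whose checksum cannot be
 * verified here are flagged for flush and not aggregated.  The protocol
 * work itself is done by the generic tcp_gro_receive().
 */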
2540struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2541{
2542	const struct iphdr *iph = skb_gro_network_header(skb);
2543
2544	switch (skb->ip_summed) {
2545	case CHECKSUM_COMPLETE:
2546		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2547				  skb->csum)) {
2548			skb->ip_summed = CHECKSUM_UNNECESSARY;
2549			break;
2550		}
2551
2552		/* fall through */
2553	case CHECKSUM_NONE:
2554		NAPI_GRO_CB(skb)->flush = 1;
2555		return NULL;
2556	}
2557
2558	return tcp_gro_receive(head, skb);
2559}
2560
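/*
 * Finish a merged GRO packet: seed th->check with the complemented IPv4
 * pseudo-header checksum and mark the skb as SKB_GSO_TCPV4 so it can be
 * resegmented later if necessary.
 */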
2561int tcp4_gro_complete(struct sk_buff *skb)
2562{
2563	const struct iphdr *iph = ip_hdr(skb);
2564	struct tcphdr *th = tcp_hdr(skb);
2565
2566	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2567				  iph->saddr, iph->daddr, 0);
2568	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2569
2570	return tcp_gro_complete(skb);
2571}
2572
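/*
 * The IPv4 TCP protocol descriptor.  Most methods referenced here live
 * in the address-family independent code (tcp.c); the IPv4-specific
 * ones are defined above in this file.
 */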
2573struct proto tcp_prot = {
2574	.name			= "TCP",
2575	.owner			= THIS_MODULE,
2576	.close			= tcp_close,
2577	.connect		= tcp_v4_connect,
2578	.disconnect		= tcp_disconnect,
2579	.accept			= inet_csk_accept,
2580	.ioctl			= tcp_ioctl,
2581	.init			= tcp_v4_init_sock,
2582	.destroy		= tcp_v4_destroy_sock,
2583	.shutdown		= tcp_shutdown,
2584	.setsockopt		= tcp_setsockopt,
2585	.getsockopt		= tcp_getsockopt,
2586	.recvmsg		= tcp_recvmsg,
2587	.sendmsg		= tcp_sendmsg,
2588	.sendpage		= tcp_sendpage,
2589	.backlog_rcv		= tcp_v4_do_rcv,
2590	.hash			= inet_hash,
2591	.unhash			= inet_unhash,
2592	.get_port		= inet_csk_get_port,
2593	.enter_memory_pressure	= tcp_enter_memory_pressure,
2594	.sockets_allocated	= &tcp_sockets_allocated,
2595	.orphan_count		= &tcp_orphan_count,
2596	.memory_allocated	= &tcp_memory_allocated,
2597	.memory_pressure	= &tcp_memory_pressure,
2598	.sysctl_mem		= sysctl_tcp_mem,
2599	.sysctl_wmem		= sysctl_tcp_wmem,
2600	.sysctl_rmem		= sysctl_tcp_rmem,
2601	.max_header		= MAX_TCP_HEADER,
2602	.obj_size		= sizeof(struct tcp_sock),
2603	.slab_flags		= SLAB_DESTROY_BY_RCU,
2604	.twsk_prot		= &tcp_timewait_sock_ops,
2605	.rsk_prot		= &tcp_request_sock_ops,
2606	.h.hashinfo		= &tcp_hashinfo,
2607	.no_autobind		= true,
2608#ifdef CONFIG_COMPAT
2609	.compat_setsockopt	= compat_tcp_setsockopt,
2610	.compat_getsockopt	= compat_tcp_getsockopt,
2611#endif
2612};
2613EXPORT_SYMBOL(tcp_prot);
2614
2615
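/*
 * Per-namespace setup and teardown.  Each namespace gets a kernel
 * control socket (net->ipv4.tcp_sock), used when sending resets and
 * ACKs that are not associated with any local socket (see
 * tcp_v4_send_reset() earlier in this file), and the batched exit path
 * purges TIME_WAIT entries belonging to the dying namespaces.
 */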
2616static int __net_init tcp_sk_init(struct net *net)
2617{
2618	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2619				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2620}
2621
2622static void __net_exit tcp_sk_exit(struct net *net)
2623{
2624	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2625}
2626
2627static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2628{
2629	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2630}
2631
2632static struct pernet_operations __net_initdata tcp_sk_ops = {
2633	.init	   = tcp_sk_init,
2634	.exit	   = tcp_sk_exit,
2635	.exit_batch = tcp_sk_exit_batch,
2636};
2637
2638void __init tcp_v4_init(void)
2639{
2640	inet_hashinfo_init(&tcp_hashinfo);
2641	if (register_pernet_subsys(&tcp_sk_ops))
2642		panic("Failed to create the TCP control socket.\n");
2643}
2644