tcp_ipv4.c revision 9327f7053e3993c125944fdb137a0618319ef2a0
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63
64#include <net/net_namespace.h>
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
84int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly;
86
87
88#ifdef CONFIG_TCP_MD5SIG
89static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90						   __be32 addr);
91static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92			       __be32 daddr, __be32 saddr, struct tcphdr *th);
93#else
94static inline
95struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96{
97	return NULL;
98}
99#endif
100
101struct inet_hashinfo tcp_hashinfo;
102
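/* Pick the initial sequence number for a passively opened connection from
 * the 4-tuple of the incoming segment, using the same secure per-connection
 * ISN generator that tcp_v4_connect() uses for active opens.
 */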
103static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104{
105	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106					  ip_hdr(skb)->saddr,
107					  tcp_hdr(skb)->dest,
108					  tcp_hdr(skb)->source);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
113	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114	struct tcp_sock *tp = tcp_sk(sk);
115
116	/* With PAWS, it is safe from the viewpoint
117	   of data integrity. Even without PAWS it is safe provided sequence
118	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
119
120	   Actually, the idea is close to VJ's: the timestamp cache is
121	   held not per host but per port pair, and the TW bucket is used as
122	   the state holder.
123
124	   If the TW bucket has already been destroyed we fall back to VJ's
125	   scheme and use the initial timestamp retrieved from the peer table.
126	 */
127	if (tcptw->tw_ts_recent_stamp &&
128	    (twp == NULL || (sysctl_tcp_tw_reuse &&
129			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
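		/* Start the new connection's sequence space well past anything
		 * the old incarnation could still have in flight: snd_nxt plus
		 * the largest unscaled window (65535) plus a little slack, so
		 * old duplicates cannot land inside the new window.
		 */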
130		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131		if (tp->write_seq == 0)
132			tp->write_seq = 1;
133		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135		sock_hold(sktw);
136		return 1;
137	}
138
139	return 0;
140}
141
142EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143
144/* This will initiate an outgoing connection. */
145int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146{
147	struct inet_sock *inet = inet_sk(sk);
148	struct tcp_sock *tp = tcp_sk(sk);
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct rtable *rt;
151	__be32 daddr, nexthop;
152	int tmp;
153	int err;
154
155	if (addr_len < sizeof(struct sockaddr_in))
156		return -EINVAL;
157
158	if (usin->sin_family != AF_INET)
159		return -EAFNOSUPPORT;
160
161	nexthop = daddr = usin->sin_addr.s_addr;
162	if (inet->opt && inet->opt->srr) {
163		if (!daddr)
164			return -EINVAL;
165		nexthop = inet->opt->faddr;
166	}
167
168	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170			       IPPROTO_TCP,
171			       inet->inet_sport, usin->sin_port, sk, 1);
172	if (tmp < 0) {
173		if (tmp == -ENETUNREACH)
174			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175		return tmp;
176	}
177
178	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179		ip_rt_put(rt);
180		return -ENETUNREACH;
181	}
182
183	if (!inet->opt || !inet->opt->srr)
184		daddr = rt->rt_dst;
185
186	if (!inet->inet_saddr)
187		inet->inet_saddr = rt->rt_src;
188	inet->inet_rcv_saddr = inet->inet_saddr;
189
190	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191		/* Reset inherited state */
192		tp->rx_opt.ts_recent	   = 0;
193		tp->rx_opt.ts_recent_stamp = 0;
194		tp->write_seq		   = 0;
195	}
196
197	if (tcp_death_row.sysctl_tw_recycle &&
198	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199		struct inet_peer *peer = rt_get_peer(rt);
200		/*
201		 * VJ's idea. We save last timestamp seen from
202		 * the destination in peer table, when entering state
203		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
204		 * when trying new connection.
205		 */
206		if (peer != NULL &&
207		    (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209			tp->rx_opt.ts_recent = peer->tcp_ts;
210		}
211	}
212
213	inet->inet_dport = usin->sin_port;
214	inet->inet_daddr = daddr;
215
216	inet_csk(sk)->icsk_ext_hdr_len = 0;
217	if (inet->opt)
218		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221
222	/* Socket identity is still unknown (sport may be zero).
223	 * However we set state to SYN-SENT and, without releasing the socket
224	 * lock, select a source port, enter ourselves into the hash tables and
225	 * complete initialization after this.
226	 */
227	tcp_set_state(sk, TCP_SYN_SENT);
228	err = inet_hash_connect(&tcp_death_row, sk);
229	if (err)
230		goto failure;
231
232	err = ip_route_newports(&rt, IPPROTO_TCP,
233				inet->inet_sport, inet->inet_dport, sk);
234	if (err)
235		goto failure;
236
237	/* OK, now commit destination to socket.  */
238	sk->sk_gso_type = SKB_GSO_TCPV4;
239	sk_setup_caps(sk, &rt->u.dst);
240
241	if (!tp->write_seq)
242		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243							   inet->inet_daddr,
244							   inet->inet_sport,
245							   usin->sin_port);
246
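	/* Initialize the IP ID counter; each connection starts from a
	 * different value derived from the ISN and jiffies.
	 */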
247	inet->inet_id = tp->write_seq ^ jiffies;
248
249	err = tcp_connect(sk);
250	rt = NULL;
251	if (err)
252		goto failure;
253
254	return 0;
255
256failure:
257	/*
258	 * This unhashes the socket and releases the local port,
259	 * if necessary.
260	 */
261	tcp_set_state(sk, TCP_CLOSE);
262	ip_rt_put(rt);
263	sk->sk_route_caps = 0;
264	inet->inet_dport = 0;
265	return err;
266}
267
268/*
269 * This routine does path mtu discovery as defined in RFC1191.
270 */
271static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272{
273	struct dst_entry *dst;
274	struct inet_sock *inet = inet_sk(sk);
275
276	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277	 * sent out by Linux are always < 576 bytes so they should go through
278	 * unfragmented).
279	 */
280	if (sk->sk_state == TCP_LISTEN)
281		return;
282
283	/* We don't check in the dst entry whether pmtu discovery is forbidden
284	 * on this route. We just assume that no packet-too-big messages
285	 * are sent back when pmtu discovery is not active.
286	 * There is a small race when the user changes this flag in the
287	 * route, but I think that's acceptable.
288	 */
289	if ((dst = __sk_dst_check(sk, 0)) == NULL)
290		return;
291
292	dst->ops->update_pmtu(dst, mtu);
293
294	/* Something is about to go wrong... Remember the soft error
295	 * in case this connection is not able to recover.
296	 */
297	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298		sk->sk_err_soft = EMSGSIZE;
299
300	mtu = dst_mtu(dst);
301
302	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304		tcp_sync_mss(sk, mtu);
305
306		/* Resend the TCP packet because it's
307		 * clear that the old packet has been
308		 * dropped. This is the new "fast" path mtu
309		 * discovery.
310		 */
311		tcp_simple_retransmit(sk);
312	} /* else let the usual retransmit timer handle it */
313}
314
315/*
316 * This routine is called by the ICMP module when it gets some
317 * sort of error condition.  If err < 0 then the socket should
318 * be closed and the error returned to the user.  If err > 0
319 * it's just the icmp type << 8 | icmp code.  After adjustment
320 * header points to the first 8 bytes of the tcp header.  We need
321 * to find the appropriate port.
322 *
323 * The locking strategy used here is very "optimistic". When
324 * someone else accesses the socket the ICMP is just dropped
325 * and for some paths there is no check at all.
326 * A more general error queue to queue errors for later handling
327 * is probably better.
328 *
329 */
330
331void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332{
333	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335	struct inet_connection_sock *icsk;
336	struct tcp_sock *tp;
337	struct inet_sock *inet;
338	const int type = icmp_hdr(icmp_skb)->type;
339	const int code = icmp_hdr(icmp_skb)->code;
340	struct sock *sk;
341	struct sk_buff *skb;
342	__u32 seq;
343	__u32 remaining;
344	int err;
345	struct net *net = dev_net(icmp_skb->dev);
346
347	if (icmp_skb->len < (iph->ihl << 2) + 8) {
348		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349		return;
350	}
351
352	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353			iph->saddr, th->source, inet_iif(icmp_skb));
354	if (!sk) {
355		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356		return;
357	}
358	if (sk->sk_state == TCP_TIME_WAIT) {
359		inet_twsk_put(inet_twsk(sk));
360		return;
361	}
362
363	bh_lock_sock(sk);
364	/* If too many ICMPs get dropped on busy
365	 * servers this needs to be solved differently.
366	 */
367	if (sock_owned_by_user(sk))
368		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370	if (sk->sk_state == TCP_CLOSE)
371		goto out;
372
373	icsk = inet_csk(sk);
374	tp = tcp_sk(sk);
375	seq = ntohl(th->seq);
376	if (sk->sk_state != TCP_LISTEN &&
377	    !between(seq, tp->snd_una, tp->snd_nxt)) {
378		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379		goto out;
380	}
381
382	switch (type) {
383	case ICMP_SOURCE_QUENCH:
384		/* Just silently ignore these. */
385		goto out;
386	case ICMP_PARAMETERPROB:
387		err = EPROTO;
388		break;
389	case ICMP_DEST_UNREACH:
390		if (code > NR_ICMP_UNREACH)
391			goto out;
392
393		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394			if (!sock_owned_by_user(sk))
395				do_pmtu_discovery(sk, iph, info);
396			goto out;
397		}
398
399		err = icmp_err_convert[code].errno;
400		/* check if icmp_skb allows revert of backoff
401		 * (see draft-zimmermann-tcp-lcd) */
402		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
403			break;
404		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
405		    !icsk->icsk_backoff)
406			break;
407
408		icsk->icsk_backoff--;
409		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
410					 icsk->icsk_backoff;
411		tcp_bound_rto(sk);
412
413		skb = tcp_write_queue_head(sk);
414		BUG_ON(!skb);
415
416		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
417				tcp_time_stamp - TCP_SKB_CB(skb)->when);
418
419		if (remaining) {
420			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
421						  remaining, TCP_RTO_MAX);
422		} else if (sock_owned_by_user(sk)) {
423			/* RTO revert clocked out retransmission,
424			 * but socket is locked. Will defer. */
425			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426						  HZ/20, TCP_RTO_MAX);
427		} else {
428			/* RTO revert clocked out retransmission.
429			 * Will retransmit now */
430			tcp_retransmit_timer(sk);
431		}
432
433		break;
434	case ICMP_TIME_EXCEEDED:
435		err = EHOSTUNREACH;
436		break;
437	default:
438		goto out;
439	}
440
441	switch (sk->sk_state) {
442		struct request_sock *req, **prev;
443	case TCP_LISTEN:
444		if (sock_owned_by_user(sk))
445			goto out;
446
447		req = inet_csk_search_req(sk, &prev, th->dest,
448					  iph->daddr, iph->saddr);
449		if (!req)
450			goto out;
451
452		/* ICMPs are not backlogged, hence we cannot get
453		   an established socket here.
454		 */
455		WARN_ON(req->sk);
456
457		if (seq != tcp_rsk(req)->snt_isn) {
458			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
459			goto out;
460		}
461
462		/*
463		 * Still in SYN_RECV, just remove it silently.
464		 * There is no good way to pass the error to the newly
465		 * created socket, and POSIX does not want network
466		 * errors returned from accept().
467		 */
468		inet_csk_reqsk_queue_drop(sk, req, prev);
469		goto out;
470
471	case TCP_SYN_SENT:
472	case TCP_SYN_RECV:  /* Cannot happen normally.
473			       It can, e.g., if SYNs crossed.
474			     */
475		if (!sock_owned_by_user(sk)) {
476			sk->sk_err = err;
477
478			sk->sk_error_report(sk);
479
480			tcp_done(sk);
481		} else {
482			sk->sk_err_soft = err;
483		}
484		goto out;
485	}
486
487	/* If we've already connected we will keep trying
488	 * until we time out, or the user gives up.
489	 *
490	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
491	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
492	 * but it is obsoleted by pmtu discovery).
493	 *
494	 * Note that in the modern internet, where routing is unreliable
495	 * and broken firewalls sit in every dark corner sending random
496	 * errors ordered by their masters, even these two messages finally lose
497	 * their original sense (even Linux sends invalid PORT_UNREACHs).
498	 *
499	 * Now we are in compliance with RFCs.
500	 *							--ANK (980905)
501	 */
502
503	inet = inet_sk(sk);
504	if (!sock_owned_by_user(sk) && inet->recverr) {
505		sk->sk_err = err;
506		sk->sk_error_report(sk);
507	} else	{ /* Only an error on timeout */
508		sk->sk_err_soft = err;
509	}
510
511out:
512	bh_unlock_sock(sk);
513	sock_put(sk);
514}
515
516/* This routine computes an IPv4 TCP checksum. */
517void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
518{
519	struct inet_sock *inet = inet_sk(sk);
520	struct tcphdr *th = tcp_hdr(skb);
521
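	/* With checksum offload (CHECKSUM_PARTIAL) we only fill in the
	 * pseudo-header sum and tell the device where to complete the
	 * checksum; otherwise compute the full TCP checksum here.
	 */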
522	if (skb->ip_summed == CHECKSUM_PARTIAL) {
523		th->check = ~tcp_v4_check(len, inet->inet_saddr,
524					  inet->inet_daddr, 0);
525		skb->csum_start = skb_transport_header(skb) - skb->head;
526		skb->csum_offset = offsetof(struct tcphdr, check);
527	} else {
528		th->check = tcp_v4_check(len, inet->inet_saddr,
529					 inet->inet_daddr,
530					 csum_partial(th,
531						      th->doff << 2,
532						      skb->csum));
533	}
534}
535
536int tcp_v4_gso_send_check(struct sk_buff *skb)
537{
538	const struct iphdr *iph;
539	struct tcphdr *th;
540
541	if (!pskb_may_pull(skb, sizeof(*th)))
542		return -EINVAL;
543
544	iph = ip_hdr(skb);
545	th = tcp_hdr(skb);
546
547	th->check = 0;
548	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
549	skb->csum_start = skb_transport_header(skb) - skb->head;
550	skb->csum_offset = offsetof(struct tcphdr, check);
551	skb->ip_summed = CHECKSUM_PARTIAL;
552	return 0;
553}
554
555/*
556 *	This routine will send an RST to the other tcp.
557 *
558 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
559 *		      for the reset?
560 *	Answer: if a packet caused an RST, it is not for a socket
561 *		existing in our system; if it is matched to a socket,
562 *		it is just a duplicate segment or a bug in the other side's TCP.
563 *		So we build the reply based only on parameters
564 *		that arrived with the segment.
565 *	Exception: precedence violation. We do not implement it in any case.
566 */
567
568static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
569{
570	struct tcphdr *th = tcp_hdr(skb);
571	struct {
572		struct tcphdr th;
573#ifdef CONFIG_TCP_MD5SIG
574		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
575#endif
576	} rep;
577	struct ip_reply_arg arg;
578#ifdef CONFIG_TCP_MD5SIG
579	struct tcp_md5sig_key *key;
580#endif
581	struct net *net;
582
583	/* Never send a reset in response to a reset. */
584	if (th->rst)
585		return;
586
587	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
588		return;
589
590	/* Swap the send and the receive. */
591	memset(&rep, 0, sizeof(rep));
592	rep.th.dest   = th->source;
593	rep.th.source = th->dest;
594	rep.th.doff   = sizeof(struct tcphdr) / 4;
595	rep.th.rst    = 1;
596
597	if (th->ack) {
598		rep.th.seq = th->ack_seq;
599	} else {
600		rep.th.ack = 1;
601		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
602				       skb->len - (th->doff << 2));
603	}
604
605	memset(&arg, 0, sizeof(arg));
606	arg.iov[0].iov_base = (unsigned char *)&rep;
607	arg.iov[0].iov_len  = sizeof(rep.th);
608
609#ifdef CONFIG_TCP_MD5SIG
610	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
611	if (key) {
612		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
613				   (TCPOPT_NOP << 16) |
614				   (TCPOPT_MD5SIG << 8) |
615				   TCPOLEN_MD5SIG);
616		/* Update length and the length the header thinks exists */
617		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
618		rep.th.doff = arg.iov[0].iov_len / 4;
619
620		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
621				     key, ip_hdr(skb)->saddr,
622				     ip_hdr(skb)->daddr, &rep.th);
623	}
624#endif
625	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
626				      ip_hdr(skb)->saddr, /* XXX */
627				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
628	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
629	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
630
631	net = dev_net(skb_dst(skb)->dev);
632	ip_send_reply(net->ipv4.tcp_sock, skb,
633		      &arg, arg.iov[0].iov_len);
634
635	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
636	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
637}
638
639/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
640   outside socket context, is ugly, certainly. What can I do?
641 */
642
643static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
644			    u32 win, u32 ts, int oif,
645			    struct tcp_md5sig_key *key,
646			    int reply_flags)
647{
648	struct tcphdr *th = tcp_hdr(skb);
649	struct {
650		struct tcphdr th;
651		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
652#ifdef CONFIG_TCP_MD5SIG
653			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
654#endif
655			];
656	} rep;
657	struct ip_reply_arg arg;
658	struct net *net = dev_net(skb_dst(skb)->dev);
659
660	memset(&rep.th, 0, sizeof(struct tcphdr));
661	memset(&arg, 0, sizeof(arg));
662
663	arg.iov[0].iov_base = (unsigned char *)&rep;
664	arg.iov[0].iov_len  = sizeof(rep.th);
665	if (ts) {
666		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
667				   (TCPOPT_TIMESTAMP << 8) |
668				   TCPOLEN_TIMESTAMP);
669		rep.opt[1] = htonl(tcp_time_stamp);
670		rep.opt[2] = htonl(ts);
671		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
672	}
673
674	/* Swap the send and the receive. */
675	rep.th.dest    = th->source;
676	rep.th.source  = th->dest;
677	rep.th.doff    = arg.iov[0].iov_len / 4;
678	rep.th.seq     = htonl(seq);
679	rep.th.ack_seq = htonl(ack);
680	rep.th.ack     = 1;
681	rep.th.window  = htons(win);
682
683#ifdef CONFIG_TCP_MD5SIG
684	if (key) {
685		int offset = (ts) ? 3 : 0;
686
687		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
688					  (TCPOPT_NOP << 16) |
689					  (TCPOPT_MD5SIG << 8) |
690					  TCPOLEN_MD5SIG);
691		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
692		rep.th.doff = arg.iov[0].iov_len/4;
693
694		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
695				    key, ip_hdr(skb)->saddr,
696				    ip_hdr(skb)->daddr, &rep.th);
697	}
698#endif
699	arg.flags = reply_flags;
700	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
701				      ip_hdr(skb)->saddr, /* XXX */
702				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
703	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
704	if (oif)
705		arg.bound_dev_if = oif;
706
707	ip_send_reply(net->ipv4.tcp_sock, skb,
708		      &arg, arg.iov[0].iov_len);
709
710	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
711}
712
713static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
714{
715	struct inet_timewait_sock *tw = inet_twsk(sk);
716	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
717
718	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
719			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
720			tcptw->tw_ts_recent,
721			tw->tw_bound_dev_if,
722			tcp_twsk_md5_key(tcptw),
723			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
724			);
725
726	inet_twsk_put(tw);
727}
728
729static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
730				  struct request_sock *req)
731{
732	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
733			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734			req->ts_recent,
735			0,
736			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
737			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
738}
739
740/*
741 *	Send a SYN-ACK after having received a SYN.
742 *	This still operates on a request_sock only, not on a big
743 *	socket.
744 */
745static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
746				struct request_sock *req,
747				struct request_values *rvp)
748{
749	const struct inet_request_sock *ireq = inet_rsk(req);
750	int err = -1;
751	struct sk_buff * skb;
752
753	/* First, grab a route. */
754	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
755		return -1;
756
757	skb = tcp_make_synack(sk, dst, req, rvp);
758
759	if (skb) {
760		struct tcphdr *th = tcp_hdr(skb);
761
762		th->check = tcp_v4_check(skb->len,
763					 ireq->loc_addr,
764					 ireq->rmt_addr,
765					 csum_partial(th, skb->len,
766						      skb->csum));
767
768		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
769					    ireq->rmt_addr,
770					    ireq->opt);
771		err = net_xmit_eval(err);
772	}
773
774	dst_release(dst);
775	return err;
776}
777
778static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
779			      struct request_values *rvp)
780{
781	return __tcp_v4_send_synack(sk, NULL, req, rvp);
782}
783
784/*
785 *	IPv4 request_sock destructor.
786 */
787static void tcp_v4_reqsk_destructor(struct request_sock *req)
788{
789	kfree(inet_rsk(req)->opt);
790}
791
792#ifdef CONFIG_SYN_COOKIES
793static void syn_flood_warning(struct sk_buff *skb)
794{
795	static unsigned long warntime;
796
797	if (time_after(jiffies, (warntime + HZ * 60))) {
798		warntime = jiffies;
799		printk(KERN_INFO
800		       "possible SYN flooding on port %d. Sending cookies.\n",
801		       ntohs(tcp_hdr(skb)->dest));
802	}
803}
804#endif
805
806/*
807 * Save and compile IPv4 options into the request_sock if needed.
808 */
809static struct ip_options *tcp_v4_save_options(struct sock *sk,
810					      struct sk_buff *skb)
811{
812	struct ip_options *opt = &(IPCB(skb)->opt);
813	struct ip_options *dopt = NULL;
814
815	if (opt && opt->optlen) {
816		int opt_size = optlength(opt);
817		dopt = kmalloc(opt_size, GFP_ATOMIC);
818		if (dopt) {
819			if (ip_options_echo(dopt, skb)) {
820				kfree(dopt);
821				dopt = NULL;
822			}
823		}
824	}
825	return dopt;
826}
827
828#ifdef CONFIG_TCP_MD5SIG
829/*
830 * RFC2385 MD5 checksumming requires a mapping of
831 * IP address->MD5 Key.
832 * We need to maintain these in the sk structure.
833 */
834
835/* Find the Key structure for an address.  */
836static struct tcp_md5sig_key *
837			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
838{
839	struct tcp_sock *tp = tcp_sk(sk);
840	int i;
841
842	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
843		return NULL;
844	for (i = 0; i < tp->md5sig_info->entries4; i++) {
845		if (tp->md5sig_info->keys4[i].addr == addr)
846			return &tp->md5sig_info->keys4[i].base;
847	}
848	return NULL;
849}
850
851struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
852					 struct sock *addr_sk)
853{
854	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
855}
856
857EXPORT_SYMBOL(tcp_v4_md5_lookup);
858
859static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
860						      struct request_sock *req)
861{
862	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
863}
864
865/* This can be called on a newly created socket, from other files */
866int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
867		      u8 *newkey, u8 newkeylen)
868{
869	/* Add Key to the list */
870	struct tcp_md5sig_key *key;
871	struct tcp_sock *tp = tcp_sk(sk);
872	struct tcp4_md5sig_key *keys;
873
874	key = tcp_v4_md5_do_lookup(sk, addr);
875	if (key) {
876		/* Pre-existing entry - just update that one. */
877		kfree(key->key);
878		key->key = newkey;
879		key->keylen = newkeylen;
880	} else {
881		struct tcp_md5sig_info *md5sig;
882
883		if (!tp->md5sig_info) {
884			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
885						  GFP_ATOMIC);
886			if (!tp->md5sig_info) {
887				kfree(newkey);
888				return -ENOMEM;
889			}
890			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
891		}
892		if (tcp_alloc_md5sig_pool(sk) == NULL) {
893			kfree(newkey);
894			return -ENOMEM;
895		}
896		md5sig = tp->md5sig_info;
897
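		/* The key array is full: grow it by one slot and copy the
		 * existing entries over (the per-socket key list is small).
		 */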
898		if (md5sig->alloced4 == md5sig->entries4) {
899			keys = kmalloc((sizeof(*keys) *
900					(md5sig->entries4 + 1)), GFP_ATOMIC);
901			if (!keys) {
902				kfree(newkey);
903				tcp_free_md5sig_pool();
904				return -ENOMEM;
905			}
906
907			if (md5sig->entries4)
908				memcpy(keys, md5sig->keys4,
909				       sizeof(*keys) * md5sig->entries4);
910
911			/* Free old key list, and reference new one */
912			kfree(md5sig->keys4);
913			md5sig->keys4 = keys;
914			md5sig->alloced4++;
915		}
916		md5sig->entries4++;
917		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
918		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
919		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
920	}
921	return 0;
922}
923
924EXPORT_SYMBOL(tcp_v4_md5_do_add);
925
926static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
927			       u8 *newkey, u8 newkeylen)
928{
929	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
930				 newkey, newkeylen);
931}
932
933int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
934{
935	struct tcp_sock *tp = tcp_sk(sk);
936	int i;
937
938	for (i = 0; i < tp->md5sig_info->entries4; i++) {
939		if (tp->md5sig_info->keys4[i].addr == addr) {
940			/* Free the key */
941			kfree(tp->md5sig_info->keys4[i].base.key);
942			tp->md5sig_info->entries4--;
943
944			if (tp->md5sig_info->entries4 == 0) {
945				kfree(tp->md5sig_info->keys4);
946				tp->md5sig_info->keys4 = NULL;
947				tp->md5sig_info->alloced4 = 0;
948			} else if (tp->md5sig_info->entries4 != i) {
949				/* Need to do some manipulation */
950				memmove(&tp->md5sig_info->keys4[i],
951					&tp->md5sig_info->keys4[i+1],
952					(tp->md5sig_info->entries4 - i) *
953					 sizeof(struct tcp4_md5sig_key));
954			}
955			tcp_free_md5sig_pool();
956			return 0;
957		}
958	}
959	return -ENOENT;
960}
961
962EXPORT_SYMBOL(tcp_v4_md5_do_del);
963
964static void tcp_v4_clear_md5_list(struct sock *sk)
965{
966	struct tcp_sock *tp = tcp_sk(sk);
967
968	/* Free each key, then the key array itself,
969	 * the crypto element, and then decrement our
970	 * hold on the last resort crypto.
971	 */
972	if (tp->md5sig_info->entries4) {
973		int i;
974		for (i = 0; i < tp->md5sig_info->entries4; i++)
975			kfree(tp->md5sig_info->keys4[i].base.key);
976		tp->md5sig_info->entries4 = 0;
977		tcp_free_md5sig_pool();
978	}
979	if (tp->md5sig_info->keys4) {
980		kfree(tp->md5sig_info->keys4);
981		tp->md5sig_info->keys4 = NULL;
982		tp->md5sig_info->alloced4  = 0;
983	}
984}
985
986static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
987				 int optlen)
988{
989	struct tcp_md5sig cmd;
990	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
991	u8 *newkey;
992
993	if (optlen < sizeof(cmd))
994		return -EINVAL;
995
996	if (copy_from_user(&cmd, optval, sizeof(cmd)))
997		return -EFAULT;
998
999	if (sin->sin_family != AF_INET)
1000		return -EINVAL;
1001
1002	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1003		if (!tcp_sk(sk)->md5sig_info)
1004			return -ENOENT;
1005		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1006	}
1007
1008	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1009		return -EINVAL;
1010
1011	if (!tcp_sk(sk)->md5sig_info) {
1012		struct tcp_sock *tp = tcp_sk(sk);
1013		struct tcp_md5sig_info *p;
1014
1015		p = kzalloc(sizeof(*p), sk->sk_allocation);
1016		if (!p)
1017			return -EINVAL;
1018
1019		tp->md5sig_info = p;
1020		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1021	}
1022
1023	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1024	if (!newkey)
1025		return -ENOMEM;
1026	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1027				 newkey, cmd.tcpm_keylen);
1028}
1029
1030static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1031					__be32 daddr, __be32 saddr, int nbytes)
1032{
1033	struct tcp4_pseudohdr *bp;
1034	struct scatterlist sg;
1035
1036	bp = &hp->md5_blk.ip4;
1037
1038	/*
1039	 * 1. the TCP pseudo-header (in the order: source IP address,
1040	 * destination IP address, zero-padded protocol number, and
1041	 * segment length)
1042	 */
1043	bp->saddr = saddr;
1044	bp->daddr = daddr;
1045	bp->pad = 0;
1046	bp->protocol = IPPROTO_TCP;
1047	bp->len = cpu_to_be16(nbytes);
1048
1049	sg_init_one(&sg, bp, sizeof(*bp));
1050	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1051}
1052
1053static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1054			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1055{
1056	struct tcp_md5sig_pool *hp;
1057	struct hash_desc *desc;
1058
1059	hp = tcp_get_md5sig_pool();
1060	if (!hp)
1061		goto clear_hash_noput;
1062	desc = &hp->md5_desc;
1063
1064	if (crypto_hash_init(desc))
1065		goto clear_hash;
1066	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1067		goto clear_hash;
1068	if (tcp_md5_hash_header(hp, th))
1069		goto clear_hash;
1070	if (tcp_md5_hash_key(hp, key))
1071		goto clear_hash;
1072	if (crypto_hash_final(desc, md5_hash))
1073		goto clear_hash;
1074
1075	tcp_put_md5sig_pool();
1076	return 0;
1077
1078clear_hash:
1079	tcp_put_md5sig_pool();
1080clear_hash_noput:
1081	memset(md5_hash, 0, 16);
1082	return 1;
1083}
1084
1085int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1086			struct sock *sk, struct request_sock *req,
1087			struct sk_buff *skb)
1088{
1089	struct tcp_md5sig_pool *hp;
1090	struct hash_desc *desc;
1091	struct tcphdr *th = tcp_hdr(skb);
1092	__be32 saddr, daddr;
1093
1094	if (sk) {
1095		saddr = inet_sk(sk)->inet_saddr;
1096		daddr = inet_sk(sk)->inet_daddr;
1097	} else if (req) {
1098		saddr = inet_rsk(req)->loc_addr;
1099		daddr = inet_rsk(req)->rmt_addr;
1100	} else {
1101		const struct iphdr *iph = ip_hdr(skb);
1102		saddr = iph->saddr;
1103		daddr = iph->daddr;
1104	}
1105
1106	hp = tcp_get_md5sig_pool();
1107	if (!hp)
1108		goto clear_hash_noput;
1109	desc = &hp->md5_desc;
1110
1111	if (crypto_hash_init(desc))
1112		goto clear_hash;
1113
1114	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1115		goto clear_hash;
1116	if (tcp_md5_hash_header(hp, th))
1117		goto clear_hash;
1118	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1119		goto clear_hash;
1120	if (tcp_md5_hash_key(hp, key))
1121		goto clear_hash;
1122	if (crypto_hash_final(desc, md5_hash))
1123		goto clear_hash;
1124
1125	tcp_put_md5sig_pool();
1126	return 0;
1127
1128clear_hash:
1129	tcp_put_md5sig_pool();
1130clear_hash_noput:
1131	memset(md5_hash, 0, 16);
1132	return 1;
1133}
1134
1135EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1136
1137static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1138{
1139	/*
1140	 * This gets called for each TCP segment that arrives
1141	 * so we want to be efficient.
1142	 * We have 3 drop cases:
1143	 * o No MD5 hash and one expected.
1144	 * o MD5 hash and we're not expecting one.
1145	 * o MD5 hash and it's wrong.
1146	 */
1147	__u8 *hash_location = NULL;
1148	struct tcp_md5sig_key *hash_expected;
1149	const struct iphdr *iph = ip_hdr(skb);
1150	struct tcphdr *th = tcp_hdr(skb);
1151	int genhash;
1152	unsigned char newhash[16];
1153
1154	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1155	hash_location = tcp_parse_md5sig_option(th);
1156
1157	/* We've parsed the options - do we have a hash? */
1158	if (!hash_expected && !hash_location)
1159		return 0;
1160
1161	if (hash_expected && !hash_location) {
1162		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1163		return 1;
1164	}
1165
1166	if (!hash_expected && hash_location) {
1167		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1168		return 1;
1169	}
1170
1171	/* Okay, we have both hash_expected and hash_location -
1172	 * so we need to calculate the MD5 hash and compare.
1173	 */
1174	genhash = tcp_v4_md5_hash_skb(newhash,
1175				      hash_expected,
1176				      NULL, NULL, skb);
1177
1178	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1179		if (net_ratelimit()) {
1180			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1181			       &iph->saddr, ntohs(th->source),
1182			       &iph->daddr, ntohs(th->dest),
1183			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1184		}
1185		return 1;
1186	}
1187	return 0;
1188}
1189
1190#endif
1191
1192struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1193	.family		=	PF_INET,
1194	.obj_size	=	sizeof(struct tcp_request_sock),
1195	.rtx_syn_ack	=	tcp_v4_send_synack,
1196	.send_ack	=	tcp_v4_reqsk_send_ack,
1197	.destructor	=	tcp_v4_reqsk_destructor,
1198	.send_reset	=	tcp_v4_send_reset,
1199};
1200
1201#ifdef CONFIG_TCP_MD5SIG
1202static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1203	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1204	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1205};
1206#endif
1207
1208static struct timewait_sock_ops tcp_timewait_sock_ops = {
1209	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1210	.twsk_unique	= tcp_twsk_unique,
1211	.twsk_destructor= tcp_twsk_destructor,
1212};
1213
1214int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1215{
1216	struct tcp_extend_values tmp_ext;
1217	struct tcp_options_received tmp_opt;
1218	u8 *hash_location;
1219	struct request_sock *req;
1220	struct inet_request_sock *ireq;
1221	struct tcp_sock *tp = tcp_sk(sk);
1222	struct dst_entry *dst = NULL;
1223	__be32 saddr = ip_hdr(skb)->saddr;
1224	__be32 daddr = ip_hdr(skb)->daddr;
1225	__u32 isn = TCP_SKB_CB(skb)->when;
1226#ifdef CONFIG_SYN_COOKIES
1227	int want_cookie = 0;
1228#else
1229#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1230#endif
1231
1232	/* Never answer SYNs sent to broadcast or multicast addresses. */
1233	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1234		goto drop;
1235
1236	/* TW buckets are converted to open requests without
1237	 * limitation; they conserve resources and the peer is
1238	 * evidently a real one.
1239	 */
1240	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1241#ifdef CONFIG_SYN_COOKIES
1242		if (sysctl_tcp_syncookies) {
1243			want_cookie = 1;
1244		} else
1245#endif
1246		goto drop;
1247	}
1248
1249	/* Accept backlog is full. If we have already queued enough
1250	 * warm entries in the syn queue, drop the request. It is better than
1251	 * clogging the syn queue with openreqs with exponentially increasing
1252	 * timeouts.
1253	 */
1254	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1255		goto drop;
1256
1257	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1258	if (!req)
1259		goto drop;
1260
1261#ifdef CONFIG_TCP_MD5SIG
1262	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1263#endif
1264
1265	ireq = inet_rsk(req);
1266	ireq->loc_addr = daddr;
1267	ireq->rmt_addr = saddr;
1268	ireq->no_srccheck = inet_sk(sk)->transparent;
1269	ireq->opt = tcp_v4_save_options(sk, skb);
1270
1271	dst = inet_csk_route_req(sk, req);
1272	if (!dst)
1273		goto drop_and_free;
1274
1275	tcp_clear_options(&tmp_opt);
1276	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1277	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1278	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst);
1279
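	/* TCP Cookie Transactions: if the peer offered a cookie option along
	 * with a timestamp, fold our address pair and the initiator's cookie
	 * into the cookie bakery used when building the SYN-ACK.
	 */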
1280	if (tmp_opt.cookie_plus > 0 &&
1281	    tmp_opt.saw_tstamp &&
1282	    !tp->rx_opt.cookie_out_never &&
1283	    (sysctl_tcp_cookie_size > 0 ||
1284	     (tp->cookie_values != NULL &&
1285	      tp->cookie_values->cookie_desired > 0))) {
1286		u8 *c;
1287		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1288		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1289
1290		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1291			goto drop_and_release;
1292
1293		/* Secret recipe starts with IP addresses */
1294		*mess++ ^= daddr;
1295		*mess++ ^= saddr;
1296
1297		/* plus variable length Initiator Cookie */
1298		c = (u8 *)mess;
1299		while (l-- > 0)
1300			*c++ ^= *hash_location++;
1301
1302#ifdef CONFIG_SYN_COOKIES
1303		want_cookie = 0;	/* not our kind of cookie */
1304#endif
1305		tmp_ext.cookie_out_never = 0; /* false */
1306		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1307	} else if (!tp->rx_opt.cookie_in_always) {
1308		/* redundant indications, but ensure initialization. */
1309		tmp_ext.cookie_out_never = 1; /* true */
1310		tmp_ext.cookie_plus = 0;
1311	} else {
1312		goto drop_and_release;
1313	}
1314	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1315
1316	if (want_cookie && !tmp_opt.saw_tstamp)
1317		tcp_clear_options(&tmp_opt);
1318
1319	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1320	tcp_openreq_init(req, &tmp_opt, skb);
1321
1322	if (security_inet_conn_request(sk, skb, req))
1323		goto drop_and_release;
1324
1325	if (!want_cookie)
1326		TCP_ECN_create_request(req, tcp_hdr(skb));
1327
1328	if (want_cookie) {
1329#ifdef CONFIG_SYN_COOKIES
1330		syn_flood_warning(skb);
1331		req->cookie_ts = tmp_opt.tstamp_ok;
1332#endif
1333		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1334	} else if (!isn) {
1335		struct inet_peer *peer = NULL;
1336
1337		/* VJ's idea. We save last timestamp seen
1338		 * from the destination in peer table, when entering
1339		 * state TIME-WAIT, and check against it before
1340		 * accepting new connection request.
1341		 *
1342		 * If "isn" is not zero, this request hit alive
1343		 * timewait bucket, so that all the necessary checks
1344		 * are made in the function processing timewait state.
1345		 */
1346		if (tmp_opt.saw_tstamp &&
1347		    tcp_death_row.sysctl_tw_recycle &&
1348		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1349		    peer->v4daddr == saddr) {
1350			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1351			    (s32)(peer->tcp_ts - req->ts_recent) >
1352							TCP_PAWS_WINDOW) {
1353				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1354				goto drop_and_release;
1355			}
1356		}
1357		/* Kill the following clause, if you dislike this way. */
1358		else if (!sysctl_tcp_syncookies &&
1359			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1360			  (sysctl_max_syn_backlog >> 2)) &&
1361			 (!peer || !peer->tcp_ts_stamp) &&
1362			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1363			/* Without syncookies the last quarter of
1364			 * the backlog is filled only with destinations
1365			 * proven to be alive.
1366			 * It means that we keep communicating with
1367			 * destinations already remembered at
1368			 * the moment of the synflood.
1369			 */
1370			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1371				       &saddr, ntohs(tcp_hdr(skb)->source));
1372			goto drop_and_release;
1373		}
1374
1375		isn = tcp_v4_init_sequence(skb);
1376	}
1377	tcp_rsk(req)->snt_isn = isn;
1378
1379	if (__tcp_v4_send_synack(sk, dst, req,
1380				 (struct request_values *)&tmp_ext) ||
1381	    want_cookie)
1382		goto drop_and_free;
1383
1384	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1385	return 0;
1386
1387drop_and_release:
1388	dst_release(dst);
1389drop_and_free:
1390	reqsk_free(req);
1391drop:
1392	return 0;
1393}
1394
1395
1396/*
1397 * The three way handshake has completed - we got a valid synack -
1398 * now create the new socket.
1399 */
1400struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1401				  struct request_sock *req,
1402				  struct dst_entry *dst)
1403{
1404	struct inet_request_sock *ireq;
1405	struct inet_sock *newinet;
1406	struct tcp_sock *newtp;
1407	struct sock *newsk;
1408#ifdef CONFIG_TCP_MD5SIG
1409	struct tcp_md5sig_key *key;
1410#endif
1411
1412	if (sk_acceptq_is_full(sk))
1413		goto exit_overflow;
1414
1415	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1416		goto exit;
1417
1418	newsk = tcp_create_openreq_child(sk, req, skb);
1419	if (!newsk)
1420		goto exit;
1421
1422	newsk->sk_gso_type = SKB_GSO_TCPV4;
1423	sk_setup_caps(newsk, dst);
1424
1425	newtp		      = tcp_sk(newsk);
1426	newinet		      = inet_sk(newsk);
1427	ireq		      = inet_rsk(req);
1428	newinet->inet_daddr   = ireq->rmt_addr;
1429	newinet->inet_rcv_saddr = ireq->loc_addr;
1430	newinet->inet_saddr	      = ireq->loc_addr;
1431	newinet->opt	      = ireq->opt;
1432	ireq->opt	      = NULL;
1433	newinet->mc_index     = inet_iif(skb);
1434	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1435	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436	if (newinet->opt)
1437		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1438	newinet->inet_id = newtp->write_seq ^ jiffies;
1439
1440	tcp_mtup_init(newsk);
1441	tcp_sync_mss(newsk, dst_mtu(dst));
1442	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1443	if (tcp_sk(sk)->rx_opt.user_mss &&
1444	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1445		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1446
1447	tcp_initialize_rcv_mss(newsk);
1448
1449#ifdef CONFIG_TCP_MD5SIG
1450	/* Copy over the MD5 key from the original socket */
1451	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1452	if (key != NULL) {
1453		/*
1454		 * We're using one, so create a matching key
1455		 * on the newsk structure. If we fail to get
1456		 * memory, then we end up not copying the key
1457		 * across. Shucks.
1458		 */
1459		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1460		if (newkey != NULL)
1461			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1462					  newkey, key->keylen);
1463		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1464	}
1465#endif
1466
1467	__inet_hash_nolisten(newsk, NULL);
1468	__inet_inherit_port(sk, newsk);
1469
1470	return newsk;
1471
1472exit_overflow:
1473	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1474exit:
1475	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1476	dst_release(dst);
1477	return NULL;
1478}
1479
1480static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1481{
1482	struct tcphdr *th = tcp_hdr(skb);
1483	const struct iphdr *iph = ip_hdr(skb);
1484	struct sock *nsk;
1485	struct request_sock **prev;
1486	/* Find possible connection requests. */
1487	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1488						       iph->saddr, iph->daddr);
1489	if (req)
1490		return tcp_check_req(sk, skb, req, prev);
1491
1492	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1493			th->source, iph->daddr, th->dest, inet_iif(skb));
1494
1495	if (nsk) {
1496		if (nsk->sk_state != TCP_TIME_WAIT) {
1497			bh_lock_sock(nsk);
1498			return nsk;
1499		}
1500		inet_twsk_put(inet_twsk(nsk));
1501		return NULL;
1502	}
1503
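	/* No pending request and no established socket: a bare ACK may still
	 * complete a connection if it carries a valid SYN cookie.
	 */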
1504#ifdef CONFIG_SYN_COOKIES
1505	if (!th->rst && !th->syn && th->ack)
1506		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1507#endif
1508	return sk;
1509}
1510
1511static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1512{
1513	const struct iphdr *iph = ip_hdr(skb);
1514
1515	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1516		if (!tcp_v4_check(skb->len, iph->saddr,
1517				  iph->daddr, skb->csum)) {
1518			skb->ip_summed = CHECKSUM_UNNECESSARY;
1519			return 0;
1520		}
1521	}
1522
1523	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1524				       skb->len, IPPROTO_TCP, 0);
1525
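	/* For short segments it is cheaper to verify the checksum right away;
	 * longer ones keep the pseudo-header sum and are verified later, when
	 * the data is actually copied or consumed.
	 */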
1526	if (skb->len <= 76) {
1527		return __skb_checksum_complete(skb);
1528	}
1529	return 0;
1530}
1531
1532
1533/* The socket must have its spinlock held when we get
1534 * here.
1535 *
1536 * We have a potential double-lock case here, so even when
1537 * doing backlog processing we use the BH locking scheme.
1538 * This is because we cannot sleep with the original spinlock
1539 * held.
1540 */
1541int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1542{
1543	struct sock *rsk;
1544#ifdef CONFIG_TCP_MD5SIG
1545	/*
1546	 * We really want to reject the packet as early as possible
1547	 * if:
1548	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1549	 *  o There is an MD5 option and we're not expecting one
1550	 */
1551	if (tcp_v4_inbound_md5_hash(sk, skb))
1552		goto discard;
1553#endif
1554
1555	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1556		TCP_CHECK_TIMER(sk);
1557		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1558			rsk = sk;
1559			goto reset;
1560		}
1561		TCP_CHECK_TIMER(sk);
1562		return 0;
1563	}
1564
1565	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1566		goto csum_err;
1567
1568	if (sk->sk_state == TCP_LISTEN) {
1569		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1570		if (!nsk)
1571			goto discard;
1572
1573		if (nsk != sk) {
1574			if (tcp_child_process(sk, nsk, skb)) {
1575				rsk = nsk;
1576				goto reset;
1577			}
1578			return 0;
1579		}
1580	}
1581
1582	TCP_CHECK_TIMER(sk);
1583	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1584		rsk = sk;
1585		goto reset;
1586	}
1587	TCP_CHECK_TIMER(sk);
1588	return 0;
1589
1590reset:
1591	tcp_v4_send_reset(rsk, skb);
1592discard:
1593	kfree_skb(skb);
1594	/* Be careful here. If this function gets more complicated and
1595	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1596	 * might be destroyed here. This current version compiles correctly,
1597	 * but you have been warned.
1598	 */
1599	return 0;
1600
1601csum_err:
1602	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1603	goto discard;
1604}
1605
1606/*
1607 *	From tcp_input.c
1608 */
1609
1610int tcp_v4_rcv(struct sk_buff *skb)
1611{
1612	const struct iphdr *iph;
1613	struct tcphdr *th;
1614	struct sock *sk;
1615	int ret;
1616	struct net *net = dev_net(skb->dev);
1617
1618	if (skb->pkt_type != PACKET_HOST)
1619		goto discard_it;
1620
1621	/* Count it even if it's bad */
1622	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1623
1624	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1625		goto discard_it;
1626
1627	th = tcp_hdr(skb);
1628
1629	if (th->doff < sizeof(struct tcphdr) / 4)
1630		goto bad_packet;
1631	if (!pskb_may_pull(skb, th->doff * 4))
1632		goto discard_it;
1633
1634	/* An explanation is required here, I think.
1635	 * Packet length and doff are validated by header prediction,
1636	 * provided the case of th->doff == 0 is eliminated.
1637	 * So, we defer the checks. */
1638	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1639		goto bad_packet;
1640
1641	th = tcp_hdr(skb);
1642	iph = ip_hdr(skb);
1643	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1644	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1645				    skb->len - th->doff * 4);
1646	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1647	TCP_SKB_CB(skb)->when	 = 0;
1648	TCP_SKB_CB(skb)->flags	 = iph->tos;
1649	TCP_SKB_CB(skb)->sacked	 = 0;
1650
1651	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1652	if (!sk)
1653		goto no_tcp_socket;
1654
1655process:
1656	if (sk->sk_state == TCP_TIME_WAIT)
1657		goto do_time_wait;
1658
1659	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1660		goto discard_and_relse;
1661	nf_reset(skb);
1662
1663	if (sk_filter(sk, skb))
1664		goto discard_and_relse;
1665
1666	skb->dev = NULL;
1667
1668	bh_lock_sock_nested(sk);
1669	ret = 0;
1670	if (!sock_owned_by_user(sk)) {
1671#ifdef CONFIG_NET_DMA
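		/* If a DMA engine channel can be grabbed for the user-space
		 * copy, receive directly; otherwise try the prequeue first.
		 */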
1672		struct tcp_sock *tp = tcp_sk(sk);
1673		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1674			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1675		if (tp->ucopy.dma_chan)
1676			ret = tcp_v4_do_rcv(sk, skb);
1677		else
1678#endif
1679		{
1680			if (!tcp_prequeue(sk, skb))
1681				ret = tcp_v4_do_rcv(sk, skb);
1682		}
1683	} else
1684		sk_add_backlog(sk, skb);
1685	bh_unlock_sock(sk);
1686
1687	sock_put(sk);
1688
1689	return ret;
1690
1691no_tcp_socket:
1692	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1693		goto discard_it;
1694
1695	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1696bad_packet:
1697		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1698	} else {
1699		tcp_v4_send_reset(NULL, skb);
1700	}
1701
1702discard_it:
1703	/* Discard frame. */
1704	kfree_skb(skb);
1705	return 0;
1706
1707discard_and_relse:
1708	sock_put(sk);
1709	goto discard_it;
1710
1711do_time_wait:
1712	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1713		inet_twsk_put(inet_twsk(sk));
1714		goto discard_it;
1715	}
1716
1717	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1718		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1719		inet_twsk_put(inet_twsk(sk));
1720		goto discard_it;
1721	}
1722	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1723	case TCP_TW_SYN: {
1724		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1725							&tcp_hashinfo,
1726							iph->daddr, th->dest,
1727							inet_iif(skb));
1728		if (sk2) {
1729			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1730			inet_twsk_put(inet_twsk(sk));
1731			sk = sk2;
1732			goto process;
1733		}
1734		/* Fall through to ACK */
1735	}
1736	case TCP_TW_ACK:
1737		tcp_v4_timewait_ack(sk, skb);
1738		break;
1739	case TCP_TW_RST:
1740		goto no_tcp_socket;
1741	case TCP_TW_SUCCESS:;
1742	}
1743	goto discard_it;
1744}
1745
1746/* VJ's idea. Save the last timestamp seen from this destination
1747 * and hold it at least for the normal timewait interval, to use for duplicate
1748 * segment detection in subsequent connections before they enter the
1749 * synchronized state.
1750 */
1751
1752int tcp_v4_remember_stamp(struct sock *sk)
1753{
1754	struct inet_sock *inet = inet_sk(sk);
1755	struct tcp_sock *tp = tcp_sk(sk);
1756	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1757	struct inet_peer *peer = NULL;
1758	int release_it = 0;
1759
1760	if (!rt || rt->rt_dst != inet->inet_daddr) {
1761		peer = inet_getpeer(inet->inet_daddr, 1);
1762		release_it = 1;
1763	} else {
1764		if (!rt->peer)
1765			rt_bind_peer(rt, 1);
1766		peer = rt->peer;
1767	}
1768
1769	if (peer) {
1770		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1771		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1772		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1773			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1774			peer->tcp_ts = tp->rx_opt.ts_recent;
1775		}
1776		if (release_it)
1777			inet_putpeer(peer);
1778		return 1;
1779	}
1780
1781	return 0;
1782}
1783
1784int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1785{
1786	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1787
1788	if (peer) {
1789		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1790
1791		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1792		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1793		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1794			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1795			peer->tcp_ts	   = tcptw->tw_ts_recent;
1796		}
1797		inet_putpeer(peer);
1798		return 1;
1799	}
1800
1801	return 0;
1802}
1803
1804const struct inet_connection_sock_af_ops ipv4_specific = {
1805	.queue_xmit	   = ip_queue_xmit,
1806	.send_check	   = tcp_v4_send_check,
1807	.rebuild_header	   = inet_sk_rebuild_header,
1808	.conn_request	   = tcp_v4_conn_request,
1809	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1810	.remember_stamp	   = tcp_v4_remember_stamp,
1811	.net_header_len	   = sizeof(struct iphdr),
1812	.setsockopt	   = ip_setsockopt,
1813	.getsockopt	   = ip_getsockopt,
1814	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1815	.sockaddr_len	   = sizeof(struct sockaddr_in),
1816	.bind_conflict	   = inet_csk_bind_conflict,
1817#ifdef CONFIG_COMPAT
1818	.compat_setsockopt = compat_ip_setsockopt,
1819	.compat_getsockopt = compat_ip_getsockopt,
1820#endif
1821};
1822
1823#ifdef CONFIG_TCP_MD5SIG
1824static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1825	.md5_lookup		= tcp_v4_md5_lookup,
1826	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1827	.md5_add		= tcp_v4_md5_add_func,
1828	.md5_parse		= tcp_v4_parse_md5_keys,
1829};
1830#endif
1831
1832/* NOTE: A lot of things are set to zero explicitly by the call to
1833 *       sk_alloc(), so they need not be done here.
1834 */
1835static int tcp_v4_init_sock(struct sock *sk)
1836{
1837	struct inet_connection_sock *icsk = inet_csk(sk);
1838	struct tcp_sock *tp = tcp_sk(sk);
1839
1840	skb_queue_head_init(&tp->out_of_order_queue);
1841	tcp_init_xmit_timers(sk);
1842	tcp_prequeue_init(tp);
1843
1844	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1845	tp->mdev = TCP_TIMEOUT_INIT;
1846
1847	/* So many TCP implementations out there (incorrectly) count the
1848	 * initial SYN frame in their delayed-ACK and congestion control
1849	 * algorithms that we must have the following bandaid to talk
1850	 * efficiently to them.  -DaveM
1851	 */
1852	tp->snd_cwnd = 2;
1853
1854	/* See draft-stevens-tcpca-spec-01 for discussion of the
1855	 * initialization of these values.
1856	 */
1857	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1858	tp->snd_cwnd_clamp = ~0;
1859	tp->mss_cache = TCP_MSS_DEFAULT;
1860
1861	tp->reordering = sysctl_tcp_reordering;
1862	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1863
1864	sk->sk_state = TCP_CLOSE;
1865
1866	sk->sk_write_space = sk_stream_write_space;
1867	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1868
1869	icsk->icsk_af_ops = &ipv4_specific;
1870	icsk->icsk_sync_mss = tcp_sync_mss;
1871#ifdef CONFIG_TCP_MD5SIG
1872	tp->af_specific = &tcp_sock_ipv4_specific;
1873#endif
1874
1875	/* TCP Cookie Transactions */
1876	if (sysctl_tcp_cookie_size > 0) {
1877		/* Default, cookies without s_data_payload. */
1878		tp->cookie_values =
1879			kzalloc(sizeof(*tp->cookie_values),
1880				sk->sk_allocation);
1881		if (tp->cookie_values != NULL)
1882			kref_init(&tp->cookie_values->kref);
1883	}
1884	/* Presumed zeroed, in order of appearance:
1885	 *	cookie_in_always, cookie_out_never,
1886	 *	s_data_constant, s_data_in, s_data_out
1887	 */
1888	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1889	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1890
1891	local_bh_disable();
1892	percpu_counter_inc(&tcp_sockets_allocated);
1893	local_bh_enable();
1894
1895	return 0;
1896}
1897
1898void tcp_v4_destroy_sock(struct sock *sk)
1899{
1900	struct tcp_sock *tp = tcp_sk(sk);
1901
1902	tcp_clear_xmit_timers(sk);
1903
1904	tcp_cleanup_congestion_control(sk);
1905
1906	/* Clean up the write buffer. */
1907	tcp_write_queue_purge(sk);
1908
1909	/* Cleans up our, hopefully empty, out_of_order_queue. */
1910	__skb_queue_purge(&tp->out_of_order_queue);
1911
1912#ifdef CONFIG_TCP_MD5SIG
1913	/* Clean up the MD5 key list, if any */
1914	if (tp->md5sig_info) {
1915		tcp_v4_clear_md5_list(sk);
1916		kfree(tp->md5sig_info);
1917		tp->md5sig_info = NULL;
1918	}
1919#endif
1920
1921#ifdef CONFIG_NET_DMA
1922	/* Cleans up our sk_async_wait_queue */
1923	__skb_queue_purge(&sk->sk_async_wait_queue);
1924#endif
1925
1926	/* Clean prequeue, it must be empty really */
1927	__skb_queue_purge(&tp->ucopy.prequeue);
1928
1929	/* Clean up a referenced TCP bind bucket. */
1930	if (inet_csk(sk)->icsk_bind_hash)
1931		inet_put_port(sk);
1932
1933	/*
1934	 * If a cached sendmsg page exists, free it.
1935	 */
1936	if (sk->sk_sndmsg_page) {
1937		__free_page(sk->sk_sndmsg_page);
1938		sk->sk_sndmsg_page = NULL;
1939	}
1940
1941	/* TCP Cookie Transactions */
1942	if (tp->cookie_values != NULL) {
1943		kref_put(&tp->cookie_values->kref,
1944			 tcp_cookie_values_release);
1945		tp->cookie_values = NULL;
1946	}
1947
1948	percpu_counter_dec(&tcp_sockets_allocated);
1949}
1950
1951EXPORT_SYMBOL(tcp_v4_destroy_sock);
1952
1953#ifdef CONFIG_PROC_FS
1954/* Proc filesystem TCP sock list dumping. */
1955
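/* Helpers for walking the nulls-terminated TIME_WAIT chain of an ehash
 * bucket: tw_head() returns NULL for an empty chain and tw_next() stops
 * at the nulls marker.
 */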
1956static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1957{
1958	return hlist_nulls_empty(head) ? NULL :
1959		list_entry(head->first, struct inet_timewait_sock, tw_node);
1960}
1961
1962static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1963{
1964	return !is_a_nulls(tw->tw_node.next) ?
1965		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1966}
1967
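/* Advance the /proc iterator over the listening hash.  Each bucket is
 * scanned under its lock; for every matching listener we also walk its
 * SYN table of open requests, tracked via TCP_SEQ_STATE_OPENREQ and
 * st->syn_wait_sk.
 */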
1968static void *listening_get_next(struct seq_file *seq, void *cur)
1969{
1970	struct inet_connection_sock *icsk;
1971	struct hlist_nulls_node *node;
1972	struct sock *sk = cur;
1973	struct inet_listen_hashbucket *ilb;
1974	struct tcp_iter_state *st = seq->private;
1975	struct net *net = seq_file_net(seq);
1976
1977	if (!sk) {
1978		st->bucket = 0;
1979		ilb = &tcp_hashinfo.listening_hash[0];
1980		spin_lock_bh(&ilb->lock);
1981		sk = sk_nulls_head(&ilb->head);
1982		goto get_sk;
1983	}
1984	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1985	++st->num;
1986
1987	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1988		struct request_sock *req = cur;
1989
1990		icsk = inet_csk(st->syn_wait_sk);
1991		req = req->dl_next;
1992		while (1) {
1993			while (req) {
1994				if (req->rsk_ops->family == st->family) {
1995					cur = req;
1996					goto out;
1997				}
1998				req = req->dl_next;
1999			}
2000			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2001				break;
2002get_req:
2003			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2004		}
2005		sk	  = sk_next(st->syn_wait_sk);
2006		st->state = TCP_SEQ_STATE_LISTENING;
2007		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008	} else {
2009		icsk = inet_csk(sk);
2010		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2012			goto start_req;
2013		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2014		sk = sk_next(sk);
2015	}
2016get_sk:
2017	sk_nulls_for_each_from(sk, node) {
2018		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2019			cur = sk;
2020			goto out;
2021		}
2022		icsk = inet_csk(sk);
2023		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2025start_req:
2026			st->uid		= sock_i_uid(sk);
2027			st->syn_wait_sk = sk;
2028			st->state	= TCP_SEQ_STATE_OPENREQ;
2029			st->sbucket	= 0;
2030			goto get_req;
2031		}
2032		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033	}
2034	spin_unlock_bh(&ilb->lock);
2035	if (++st->bucket < INET_LHTABLE_SIZE) {
2036		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2037		spin_lock_bh(&ilb->lock);
2038		sk = sk_nulls_head(&ilb->head);
2039		goto get_sk;
2040	}
2041	cur = NULL;
2042out:
2043	return cur;
2044}
2045
2046static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2047{
2048	void *rc = listening_get_next(seq, NULL);
2049
2050	while (rc && *pos) {
2051		rc = listening_get_next(seq, rc);
2052		--*pos;
2053	}
2054	return rc;
2055}
2056
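/* True when both the established chain and the TIME_WAIT chain of the
 * current ehash bucket are empty; checked locklessly as a fast path.
 */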
2057static inline int empty_bucket(struct tcp_iter_state *st)
2058{
2059	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2060		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2061}
2062
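/* Find the first socket of interest in the established hash: scan each
 * bucket's established chain, then its TIME_WAIT chain.  The bucket lock
 * is left held when an entry is returned; it is dropped when iteration
 * moves on or in tcp_seq_stop().
 */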
2063static void *established_get_first(struct seq_file *seq)
2064{
2065	struct tcp_iter_state *st = seq->private;
2066	struct net *net = seq_file_net(seq);
2067	void *rc = NULL;
2068
2069	for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2070		struct sock *sk;
2071		struct hlist_nulls_node *node;
2072		struct inet_timewait_sock *tw;
2073		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2074
2075		/* Lockless fast path for the common case of empty buckets */
2076		if (empty_bucket(st))
2077			continue;
2078
2079		spin_lock_bh(lock);
2080		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2081			if (sk->sk_family != st->family ||
2082			    !net_eq(sock_net(sk), net)) {
2083				continue;
2084			}
2085			rc = sk;
2086			goto out;
2087		}
2088		st->state = TCP_SEQ_STATE_TIME_WAIT;
2089		inet_twsk_for_each(tw, node,
2090				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2091			if (tw->tw_family != st->family ||
2092			    !net_eq(twsk_net(tw), net)) {
2093				continue;
2094			}
2095			rc = tw;
2096			goto out;
2097		}
2098		spin_unlock_bh(lock);
2099		st->state = TCP_SEQ_STATE_ESTABLISHED;
2100	}
2101out:
2102	return rc;
2103}
2104
2105static void *established_get_next(struct seq_file *seq, void *cur)
2106{
2107	struct sock *sk = cur;
2108	struct inet_timewait_sock *tw;
2109	struct hlist_nulls_node *node;
2110	struct tcp_iter_state *st = seq->private;
2111	struct net *net = seq_file_net(seq);
2112
2113	++st->num;
2114
2115	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2116		tw = cur;
2117		tw = tw_next(tw);
2118get_tw:
2119		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2120			tw = tw_next(tw);
2121		}
2122		if (tw) {
2123			cur = tw;
2124			goto out;
2125		}
2126		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2127		st->state = TCP_SEQ_STATE_ESTABLISHED;
2128
2129		/* Look for the next non-empty bucket */
2130		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2131				empty_bucket(st))
2132			;
2133		if (st->bucket > tcp_hashinfo.ehash_mask)
2134			return NULL;
2135
2136		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2137		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2138	} else
2139		sk = sk_nulls_next(sk);
2140
2141	sk_nulls_for_each_from(sk, node) {
2142		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2143			goto found;
2144	}
2145
2146	st->state = TCP_SEQ_STATE_TIME_WAIT;
2147	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2148	goto get_tw;
2149found:
2150	cur = sk;
2151out:
2152	return cur;
2153}
2154
2155static void *established_get_idx(struct seq_file *seq, loff_t pos)
2156{
2157	void *rc = established_get_first(seq);
2158
2159	while (rc && pos) {
2160		rc = established_get_next(seq, rc);
2161		--pos;
2162	}
2163	return rc;
2164}
2165
2166static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2167{
2168	void *rc;
2169	struct tcp_iter_state *st = seq->private;
2170
2171	st->state = TCP_SEQ_STATE_LISTENING;
2172	rc	  = listening_get_idx(seq, &pos);
2173
2174	if (!rc) {
2175		st->state = TCP_SEQ_STATE_ESTABLISHED;
2176		rc	  = established_get_idx(seq, pos);
2177	}
2178
2179	return rc;
2180}
2181
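/* seq_file iteration: start() maps *pos to a socket (or SEQ_START_TOKEN
 * for the header line), next() advances through the listening and then
 * the established tables, and stop() drops whichever lock is still held.
 */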
2182static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2183{
2184	struct tcp_iter_state *st = seq->private;
2185	st->state = TCP_SEQ_STATE_LISTENING;
2186	st->num = 0;
2187	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2188}
2189
2190static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2191{
2192	void *rc = NULL;
2193	struct tcp_iter_state *st;
2194
2195	if (v == SEQ_START_TOKEN) {
2196		rc = tcp_get_idx(seq, 0);
2197		goto out;
2198	}
2199	st = seq->private;
2200
2201	switch (st->state) {
2202	case TCP_SEQ_STATE_OPENREQ:
2203	case TCP_SEQ_STATE_LISTENING:
2204		rc = listening_get_next(seq, v);
2205		if (!rc) {
2206			st->state = TCP_SEQ_STATE_ESTABLISHED;
2207			rc	  = established_get_first(seq);
2208		}
2209		break;
2210	case TCP_SEQ_STATE_ESTABLISHED:
2211	case TCP_SEQ_STATE_TIME_WAIT:
2212		rc = established_get_next(seq, v);
2213		break;
2214	}
2215out:
2216	++*pos;
2217	return rc;
2218}
2219
2220static void tcp_seq_stop(struct seq_file *seq, void *v)
2221{
2222	struct tcp_iter_state *st = seq->private;
2223
2224	switch (st->state) {
2225	case TCP_SEQ_STATE_OPENREQ:
2226		if (v) {
2227			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2228			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2229		}
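		/* Fall through: the listening bucket lock is still held. */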
2230	case TCP_SEQ_STATE_LISTENING:
2231		if (v != SEQ_START_TOKEN)
2232			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2233		break;
2234	case TCP_SEQ_STATE_TIME_WAIT:
2235	case TCP_SEQ_STATE_ESTABLISHED:
2236		if (v)
2237			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2238		break;
2239	}
2240}
2241
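/* Open handler shared by every registered "tcp" proc entry: attach a
 * per-net seq_file whose private data is a tcp_iter_state filtered on
 * the caller's address family.
 */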
2242static int tcp_seq_open(struct inode *inode, struct file *file)
2243{
2244	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2245	struct tcp_iter_state *s;
2246	int err;
2247
2248	err = seq_open_net(inode, file, &afinfo->seq_ops,
2249			  sizeof(struct tcp_iter_state));
2250	if (err < 0)
2251		return err;
2252
2253	s = ((struct seq_file *)file->private_data)->private;
2254	s->family		= afinfo->family;
2255	return 0;
2256}
2257
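/* Register one /proc/net dump file for an address family.  A minimal
 * sketch of a caller, assuming a hypothetical "tcpfoo" entry
 * (tcp4_seq_afinfo below is the real in-tree IPv4 user):
 *
 *	static struct tcp_seq_afinfo foo_afinfo = {
 *		.name		= "tcpfoo",
 *		.family		= AF_INET,
 *		.seq_fops	= { .owner = THIS_MODULE },
 *		.seq_ops	= { .show = tcp4_seq_show },
 *	};
 *
 *	err = tcp_proc_register(net, &foo_afinfo);
 */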
2258int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2259{
2260	int rc = 0;
2261	struct proc_dir_entry *p;
2262
2263	afinfo->seq_fops.open		= tcp_seq_open;
2264	afinfo->seq_fops.read		= seq_read;
2265	afinfo->seq_fops.llseek		= seq_lseek;
2266	afinfo->seq_fops.release	= seq_release_net;
2267
2268	afinfo->seq_ops.start		= tcp_seq_start;
2269	afinfo->seq_ops.next		= tcp_seq_next;
2270	afinfo->seq_ops.stop		= tcp_seq_stop;
2271
2272	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2273			     &afinfo->seq_fops, afinfo);
2274	if (!p)
2275		rc = -ENOMEM;
2276	return rc;
2277}
2278
2279void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2280{
2281	proc_net_remove(net, afinfo->name);
2282}
2283
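/* Format one /proc/net/tcp row for an open request (SYN_RECV): the queue
 * sizes and inode are reported as 0, and the only active timer shown is
 * the request's expire timer.
 */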
2284static void get_openreq4(struct sock *sk, struct request_sock *req,
2285			 struct seq_file *f, int i, int uid, int *len)
2286{
2287	const struct inet_request_sock *ireq = inet_rsk(req);
2288	int ttd = req->expires - jiffies;
2289
2290	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2291		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2292		i,
2293		ireq->loc_addr,
2294		ntohs(inet_sk(sk)->inet_sport),
2295		ireq->rmt_addr,
2296		ntohs(ireq->rmt_port),
2297		TCP_SYN_RECV,
2298		0, 0, /* could print option size, but that is AF-dependent. */
2299		1,    /* timers active (only the expire timer) */
2300		jiffies_to_clock_t(ttd),
2301		req->retrans,
2302		uid,
2303		0,  /* non standard timer */
2304		0, /* open_requests have no inode */
2305		atomic_read(&sk->sk_refcnt),
2306		req,
2307		len);
2308}
2309
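/* Format one /proc/net/tcp row for a full socket.  The timer field uses
 * the codes chosen below: 1 retransmit, 4 zero-window probe, 2 sk_timer
 * (keepalive), 0 none; column names match the header emitted in
 * tcp4_seq_show().
 */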
2310static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2311{
2312	int timer_active;
2313	unsigned long timer_expires;
2314	struct tcp_sock *tp = tcp_sk(sk);
2315	const struct inet_connection_sock *icsk = inet_csk(sk);
2316	struct inet_sock *inet = inet_sk(sk);
2317	__be32 dest = inet->inet_daddr;
2318	__be32 src = inet->inet_rcv_saddr;
2319	__u16 destp = ntohs(inet->inet_dport);
2320	__u16 srcp = ntohs(inet->inet_sport);
2321	int rx_queue;
2322
2323	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2324		timer_active	= 1;
2325		timer_expires	= icsk->icsk_timeout;
2326	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2327		timer_active	= 4;
2328		timer_expires	= icsk->icsk_timeout;
2329	} else if (timer_pending(&sk->sk_timer)) {
2330		timer_active	= 2;
2331		timer_expires	= sk->sk_timer.expires;
2332	} else {
2333		timer_active	= 0;
2334		timer_expires = jiffies;
2335	}
2336
2337	if (sk->sk_state == TCP_LISTEN)
2338		rx_queue = sk->sk_ack_backlog;
2339	else
2340		/*
2341		 * Because we don't lock the socket, we might find a transient negative value.
2342		 */
2343		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2344
2345	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2346			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2347		i, src, srcp, dest, destp, sk->sk_state,
2348		tp->write_seq - tp->snd_una,
2349		rx_queue,
2350		timer_active,
2351		jiffies_to_clock_t(timer_expires - jiffies),
2352		icsk->icsk_retransmits,
2353		sock_i_uid(sk),
2354		icsk->icsk_probes_out,
2355		sock_i_ino(sk),
2356		atomic_read(&sk->sk_refcnt), sk,
2357		jiffies_to_clock_t(icsk->icsk_rto),
2358		jiffies_to_clock_t(icsk->icsk_ack.ato),
2359		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2360		tp->snd_cwnd,
2361		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2362		len);
2363}
2364
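/* Format one /proc/net/tcp row for a TIME_WAIT socket: only the remaining
 * lifetime is meaningful, reported with timer code 3.
 */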
2365static void get_timewait4_sock(struct inet_timewait_sock *tw,
2366			       struct seq_file *f, int i, int *len)
2367{
2368	__be32 dest, src;
2369	__u16 destp, srcp;
2370	int ttd = tw->tw_ttd - jiffies;
2371
2372	if (ttd < 0)
2373		ttd = 0;
2374
2375	dest  = tw->tw_daddr;
2376	src   = tw->tw_rcv_saddr;
2377	destp = ntohs(tw->tw_dport);
2378	srcp  = ntohs(tw->tw_sport);
2379
2380	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2381		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2382		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2383		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2384		atomic_read(&tw->tw_refcnt), tw, len);
2385}
2386
2387#define TMPSZ 150
2388
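/* Every row, header included, is padded with spaces to a fixed width of
 * TMPSZ - 1 characters.
 */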
2389static int tcp4_seq_show(struct seq_file *seq, void *v)
2390{
2391	struct tcp_iter_state *st;
2392	int len;
2393
2394	if (v == SEQ_START_TOKEN) {
2395		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2396			   "  sl  local_address rem_address   st tx_queue "
2397			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2398			   "inode");
2399		goto out;
2400	}
2401	st = seq->private;
2402
2403	switch (st->state) {
2404	case TCP_SEQ_STATE_LISTENING:
2405	case TCP_SEQ_STATE_ESTABLISHED:
2406		get_tcp4_sock(v, seq, st->num, &len);
2407		break;
2408	case TCP_SEQ_STATE_OPENREQ:
2409		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2410		break;
2411	case TCP_SEQ_STATE_TIME_WAIT:
2412		get_timewait4_sock(v, seq, st->num, &len);
2413		break;
2414	}
2415	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2416out:
2417	return 0;
2418}
2419
2420static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2421	.name		= "tcp",
2422	.family		= AF_INET,
2423	.seq_fops	= {
2424		.owner		= THIS_MODULE,
2425	},
2426	.seq_ops	= {
2427		.show		= tcp4_seq_show,
2428	},
2429};
2430
2431static int tcp4_proc_init_net(struct net *net)
2432{
2433	return tcp_proc_register(net, &tcp4_seq_afinfo);
2434}
2435
2436static void tcp4_proc_exit_net(struct net *net)
2437{
2438	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2439}
2440
2441static struct pernet_operations tcp4_net_ops = {
2442	.init = tcp4_proc_init_net,
2443	.exit = tcp4_proc_exit_net,
2444};
2445
2446int __init tcp4_proc_init(void)
2447{
2448	return register_pernet_subsys(&tcp4_net_ops);
2449}
2450
2451void tcp4_proc_exit(void)
2452{
2453	unregister_pernet_subsys(&tcp4_net_ops);
2454}
2455#endif /* CONFIG_PROC_FS */
2456
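/* GRO receive hook for IPv4 TCP: verify the pseudo-header checksum when
 * the device supplied CHECKSUM_COMPLETE; if it cannot be verified, flag
 * the skb so it is flushed rather than aggregated, otherwise hand off to
 * the generic tcp_gro_receive().
 */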
2457struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2458{
2459	struct iphdr *iph = skb_gro_network_header(skb);
2460
2461	switch (skb->ip_summed) {
2462	case CHECKSUM_COMPLETE:
2463		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2464				  skb->csum)) {
2465			skb->ip_summed = CHECKSUM_UNNECESSARY;
2466			break;
2467		}
2468
2469		/* fall through */
2470	case CHECKSUM_NONE:
2471		NAPI_GRO_CB(skb)->flush = 1;
2472		return NULL;
2473	}
2474
2475	return tcp_gro_receive(head, skb);
2476}
2477EXPORT_SYMBOL(tcp4_gro_receive);
2478
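/* GRO completion for IPv4 TCP: seed th->check with the pseudo-header
 * checksum for the merged packet and mark it as TCPv4 GSO before the
 * generic tcp_gro_complete() finishes the job.
 */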
2479int tcp4_gro_complete(struct sk_buff *skb)
2480{
2481	struct iphdr *iph = ip_hdr(skb);
2482	struct tcphdr *th = tcp_hdr(skb);
2483
2484	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2485				  iph->saddr, iph->daddr, 0);
2486	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2487
2488	return tcp_gro_complete(skb);
2489}
2490EXPORT_SYMBOL(tcp4_gro_complete);
2491
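/* The IPv4 instantiation of struct proto: this is where the address-family
 * independent TCP core is wired to the IPv4-specific handlers defined in
 * this file (connect, backlog receive, init/destroy) and to the shared
 * hash tables and sysctl limits.
 */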
2492struct proto tcp_prot = {
2493	.name			= "TCP",
2494	.owner			= THIS_MODULE,
2495	.close			= tcp_close,
2496	.connect		= tcp_v4_connect,
2497	.disconnect		= tcp_disconnect,
2498	.accept			= inet_csk_accept,
2499	.ioctl			= tcp_ioctl,
2500	.init			= tcp_v4_init_sock,
2501	.destroy		= tcp_v4_destroy_sock,
2502	.shutdown		= tcp_shutdown,
2503	.setsockopt		= tcp_setsockopt,
2504	.getsockopt		= tcp_getsockopt,
2505	.recvmsg		= tcp_recvmsg,
2506	.backlog_rcv		= tcp_v4_do_rcv,
2507	.hash			= inet_hash,
2508	.unhash			= inet_unhash,
2509	.get_port		= inet_csk_get_port,
2510	.enter_memory_pressure	= tcp_enter_memory_pressure,
2511	.sockets_allocated	= &tcp_sockets_allocated,
2512	.orphan_count		= &tcp_orphan_count,
2513	.memory_allocated	= &tcp_memory_allocated,
2514	.memory_pressure	= &tcp_memory_pressure,
2515	.sysctl_mem		= sysctl_tcp_mem,
2516	.sysctl_wmem		= sysctl_tcp_wmem,
2517	.sysctl_rmem		= sysctl_tcp_rmem,
2518	.max_header		= MAX_TCP_HEADER,
2519	.obj_size		= sizeof(struct tcp_sock),
2520	.slab_flags		= SLAB_DESTROY_BY_RCU,
2521	.twsk_prot		= &tcp_timewait_sock_ops,
2522	.rsk_prot		= &tcp_request_sock_ops,
2523	.h.hashinfo		= &tcp_hashinfo,
2524#ifdef CONFIG_COMPAT
2525	.compat_setsockopt	= compat_tcp_setsockopt,
2526	.compat_getsockopt	= compat_tcp_getsockopt,
2527#endif
2528};
2529
2530
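/* Per-network-namespace setup: create the namespace's TCP control socket
 * on init, destroy it on exit, and purge stale AF_INET TIME_WAIT sockets
 * in the batched exit hook when namespaces are torn down.
 */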
2531static int __net_init tcp_sk_init(struct net *net)
2532{
2533	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2534				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2535}
2536
2537static void __net_exit tcp_sk_exit(struct net *net)
2538{
2539	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2540}
2541
2542static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2543{
2544	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2545}
2546
2547static struct pernet_operations __net_initdata tcp_sk_ops = {
2548       .init	   = tcp_sk_init,
2549       .exit	   = tcp_sk_exit,
2550       .exit_batch = tcp_sk_exit_batch,
2551};
2552
2553void __init tcp_v4_init(void)
2554{
2555	inet_hashinfo_init(&tcp_hashinfo);
2556	if (register_pernet_subsys(&tcp_sk_ops))
2557		panic("Failed to create the TCP control socket.\n");
2558}
2559
2560EXPORT_SYMBOL(ipv4_specific);
2561EXPORT_SYMBOL(tcp_hashinfo);
2562EXPORT_SYMBOL(tcp_prot);
2563EXPORT_SYMBOL(tcp_v4_conn_request);
2564EXPORT_SYMBOL(tcp_v4_connect);
2565EXPORT_SYMBOL(tcp_v4_do_rcv);
2566EXPORT_SYMBOL(tcp_v4_remember_stamp);
2567EXPORT_SYMBOL(tcp_v4_send_check);
2568EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2569
2570#ifdef CONFIG_PROC_FS
2571EXPORT_SYMBOL(tcp_proc_register);
2572EXPORT_SYMBOL(tcp_proc_unregister);
2573#endif
2574EXPORT_SYMBOL(sysctl_tcp_low_latency);
2575
2576