tcp_ipv4.c revision fb286bb2990a107009dbf25f6ffebeb7df77f9be
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
47 *					in a coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/xfrm.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79
80int sysctl_tcp_tw_reuse;
81int sysctl_tcp_low_latency;
82
83/* Check TCP sequence numbers in ICMP packets. */
84#define ICMP_MIN_LENGTH 8
85
86/* Socket used for sending RSTs */
87static struct socket *tcp_socket;
88
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90		       struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93	.lhash_lock	= RW_LOCK_UNLOCKED,
94	.lhash_users	= ATOMIC_INIT(0),
95	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96};
97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{
100	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
101}
102
103static void tcp_v4_hash(struct sock *sk)
104{
105	inet_hash(&tcp_hashinfo, sk);
106}
107
108void tcp_unhash(struct sock *sk)
109{
110	inet_unhash(&tcp_hashinfo, sk);
111}
112
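/* Pick the initial sequence number for an incoming connection from the
 * addresses and ports of the received segment (note the swapped order:
 * our side is the segment's destination).
 */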
113static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
114{
115	return secure_tcp_sequence_number(skb->nh.iph->daddr,
116					  skb->nh.iph->saddr,
117					  skb->h.th->dest,
118					  skb->h.th->source);
119}
120
121/* called with local bh disabled */
122static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123				      struct inet_timewait_sock **twp)
124{
125	struct inet_sock *inet = inet_sk(sk);
126	u32 daddr = inet->rcv_saddr;
127	u32 saddr = inet->daddr;
128	int dif = sk->sk_bound_dev_if;
129	INET_ADDR_COOKIE(acookie, saddr, daddr)
130	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133	struct sock *sk2;
134	const struct hlist_node *node;
135	struct inet_timewait_sock *tw;
136
137	prefetch(head->chain.first);
138	write_lock(&head->lock);
139
140	/* Check TIME-WAIT sockets first. */
141	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142		tw = inet_twsk(sk2);
143
144		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146			struct tcp_sock *tp = tcp_sk(sk);
147
148			/* With PAWS, it is safe from the viewpoint
149			   of data integrity. Even without PAWS it
150			   is safe provided sequence spaces do not
151			   overlap, i.e. at data rates <= 80 Mbit/sec.
152
153			   Actually, the idea is close to VJ's one,
154			   only the timestamp cache is held not per host
155			   but per port pair, and the TW bucket is used
156			   as the state holder.
157
158			   If the TW bucket has already been destroyed we
159			   fall back to VJ's scheme and use the initial
160			   timestamp retrieved from the peer table.
161			 */
162			if (tcptw->tw_ts_recent_stamp &&
163			    (!twp || (sysctl_tcp_tw_reuse &&
164				      xtime.tv_sec -
165				      tcptw->tw_ts_recent_stamp > 1))) {
166				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167				if (tp->write_seq == 0)
168					tp->write_seq = 1;
169				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
170				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171				sock_hold(sk2);
172				goto unique;
173			} else
174				goto not_unique;
175		}
176	}
177	tw = NULL;
178
179	/* And established part... */
180	sk_for_each(sk2, node, &head->chain) {
181		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
182			goto not_unique;
183	}
184
185unique:
186	/* Must record num and sport now. Otherwise we will see
187	 * the socket in the hash table with a funny identity. */
188	inet->num = lport;
189	inet->sport = htons(lport);
190	sk->sk_hash = hash;
191	BUG_TRAP(sk_unhashed(sk));
192	__sk_add_node(sk, &head->chain);
193	sock_prot_inc_use(sk->sk_prot);
194	write_unlock(&head->lock);
195
196	if (twp) {
197		*twp = tw;
198		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199	} else if (tw) {
200		/* Silly. Should hash-dance instead... */
201		inet_twsk_deschedule(tw, &tcp_death_row);
202		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203
204		inet_twsk_put(tw);
205	}
206
207	return 0;
208
209not_unique:
210	write_unlock(&head->lock);
211	return -EADDRNOTAVAIL;
212}
213
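/* Per-destination offset into the ephemeral port range, derived from the
 * connection's addresses and destination port so that different peers walk
 * the range in different orders.
 */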
214static inline u32 connect_port_offset(const struct sock *sk)
215{
216	const struct inet_sock *inet = inet_sk(sk);
217
218	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
219					 inet->dport);
220}
221
222/*
223 * Bind a port for a connect operation and hash it.
224 */
225static inline int tcp_v4_hash_connect(struct sock *sk)
226{
227	const unsigned short snum = inet_sk(sk)->num;
228 	struct inet_bind_hashbucket *head;
229 	struct inet_bind_bucket *tb;
230	int ret;
231
232 	if (!snum) {
233 		int low = sysctl_local_port_range[0];
234 		int high = sysctl_local_port_range[1];
235		int range = high - low;
236 		int i;
237		int port;
238		static u32 hint;
239		u32 offset = hint + connect_port_offset(sk);
240		struct hlist_node *node;
241 		struct inet_timewait_sock *tw = NULL;
242
243 		local_bh_disable();
244		for (i = 1; i <= range; i++) {
245			port = low + (i + offset) % range;
246 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247 			spin_lock(&head->lock);
248
249 			/* Does not bother with rcv_saddr checks,
250 			 * because the established check is already
251 			 * unique enough.
252 			 */
253			inet_bind_bucket_for_each(tb, node, &head->chain) {
254 				if (tb->port == port) {
255 					BUG_TRAP(!hlist_empty(&tb->owners));
256 					if (tb->fastreuse >= 0)
257 						goto next_port;
258 					if (!__tcp_v4_check_established(sk,
259									port,
260									&tw))
261 						goto ok;
262 					goto next_port;
263 				}
264 			}
265
266 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267 			if (!tb) {
268 				spin_unlock(&head->lock);
269 				break;
270 			}
271 			tb->fastreuse = -1;
272 			goto ok;
273
274 		next_port:
275 			spin_unlock(&head->lock);
276 		}
277 		local_bh_enable();
278
279 		return -EADDRNOTAVAIL;
280
281ok:
282		hint += i;
283
284 		/* Head lock still held and bh's disabled */
285 		inet_bind_hash(sk, tb, port);
286		if (sk_unhashed(sk)) {
287 			inet_sk(sk)->sport = htons(port);
288 			__inet_hash(&tcp_hashinfo, sk, 0);
289 		}
290 		spin_unlock(&head->lock);
291
292 		if (tw) {
293 			inet_twsk_deschedule(tw, &tcp_death_row);
294 			inet_twsk_put(tw);
295 		}
296
297		ret = 0;
298		goto out;
299 	}
300
301 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302 	tb  = inet_csk(sk)->icsk_bind_hash;
303	spin_lock_bh(&head->lock);
304	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305		__inet_hash(&tcp_hashinfo, sk, 0);
306		spin_unlock_bh(&head->lock);
307		return 0;
308	} else {
309		spin_unlock(&head->lock);
310		/* No definite answer... Walk to established hash table */
311		ret = __tcp_v4_check_established(sk, snum, NULL);
312out:
313		local_bh_enable();
314		return ret;
315	}
316}
317
318/* This will initiate an outgoing connection. */
319int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
320{
321	struct inet_sock *inet = inet_sk(sk);
322	struct tcp_sock *tp = tcp_sk(sk);
323	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
324	struct rtable *rt;
325	u32 daddr, nexthop;
326	int tmp;
327	int err;
328
329	if (addr_len < sizeof(struct sockaddr_in))
330		return -EINVAL;
331
332	if (usin->sin_family != AF_INET)
333		return -EAFNOSUPPORT;
334
335	nexthop = daddr = usin->sin_addr.s_addr;
336	if (inet->opt && inet->opt->srr) {
337		if (!daddr)
338			return -EINVAL;
339		nexthop = inet->opt->faddr;
340	}
341
342	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
343			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
344			       IPPROTO_TCP,
345			       inet->sport, usin->sin_port, sk);
346	if (tmp < 0)
347		return tmp;
348
349	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
350		ip_rt_put(rt);
351		return -ENETUNREACH;
352	}
353
354	if (!inet->opt || !inet->opt->srr)
355		daddr = rt->rt_dst;
356
357	if (!inet->saddr)
358		inet->saddr = rt->rt_src;
359	inet->rcv_saddr = inet->saddr;
360
361	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
362		/* Reset inherited state */
363		tp->rx_opt.ts_recent	   = 0;
364		tp->rx_opt.ts_recent_stamp = 0;
365		tp->write_seq		   = 0;
366	}
367
368	if (tcp_death_row.sysctl_tw_recycle &&
369	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
370		struct inet_peer *peer = rt_get_peer(rt);
371
372		/* VJ's idea. We save last timestamp seen from
373		 * the destination in peer table, when entering state TIME-WAIT
374		 * and initialize rx_opt.ts_recent from it, when trying new connection.
375		 */
376
377		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
378			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
379			tp->rx_opt.ts_recent = peer->tcp_ts;
380		}
381	}
382
383	inet->dport = usin->sin_port;
384	inet->daddr = daddr;
385
386	tp->ext_header_len = 0;
387	if (inet->opt)
388		tp->ext_header_len = inet->opt->optlen;
389
390	tp->rx_opt.mss_clamp = 536;
391
392	/* Socket identity is still unknown (sport may be zero).
393	 * However we set the state to SYN-SENT and, without releasing the
394	 * socket lock, select a source port, enter ourselves into the hash
395	 * tables and complete the initialization afterwards.
396	 */
397	tcp_set_state(sk, TCP_SYN_SENT);
398	err = tcp_v4_hash_connect(sk);
399	if (err)
400		goto failure;
401
402	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
403	if (err)
404		goto failure;
405
406	/* OK, now commit destination to socket.  */
407	sk_setup_caps(sk, &rt->u.dst);
408
409	if (!tp->write_seq)
410		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
411							   inet->daddr,
412							   inet->sport,
413							   usin->sin_port);
414
415	inet->id = tp->write_seq ^ jiffies;
416
417	err = tcp_connect(sk);
418	rt = NULL;
419	if (err)
420		goto failure;
421
422	return 0;
423
424failure:
425	/* This unhashes the socket and releases the local port, if necessary. */
426	tcp_set_state(sk, TCP_CLOSE);
427	ip_rt_put(rt);
428	sk->sk_route_caps = 0;
429	inet->dport = 0;
430	return err;
431}
432
433/*
434 * This routine does path mtu discovery as defined in RFC1191.
435 */
436static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
437				     u32 mtu)
438{
439	struct dst_entry *dst;
440	struct inet_sock *inet = inet_sk(sk);
441	struct tcp_sock *tp = tcp_sk(sk);
442
443	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
444	 * sent out by Linux are always < 576 bytes, so they should go through
445	 * unfragmented).
446	 */
447	if (sk->sk_state == TCP_LISTEN)
448		return;
449
450	/* We don't check in the dst entry if pmtu discovery is forbidden
451	 * on this route. We just assume that no packet-too-big packets
452	 * are sent back when pmtu discovery is not active.
453	 * There is a small race when the user changes this flag in the
454	 * route, but I think that's acceptable.
455	 */
456	if ((dst = __sk_dst_check(sk, 0)) == NULL)
457		return;
458
459	dst->ops->update_pmtu(dst, mtu);
460
461	/* Something is about to go wrong... Remember the soft error
462	 * in case this connection is not able to recover.
463	 */
464	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
465		sk->sk_err_soft = EMSGSIZE;
466
467	mtu = dst_mtu(dst);
468
469	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470	    tp->pmtu_cookie > mtu) {
471		tcp_sync_mss(sk, mtu);
472
473		/* Resend the TCP packet because it's
474		 * clear that the old packet has been
475		 * dropped. This is the new "fast" path mtu
476		 * discovery.
477		 */
478		tcp_simple_retransmit(sk);
479	} /* else let the usual retransmit timer handle it */
480}
481
482/*
483 * This routine is called by the ICMP module when it gets some
484 * sort of error condition.  If err < 0 then the socket should
485 * be closed and the error returned to the user.  If err > 0
486 * it's just the icmp type << 8 | icmp code.  After adjustment
487 * header points to the first 8 bytes of the tcp header.  We need
488 * to find the appropriate port.
489 *
490 * The locking strategy used here is very "optimistic". When
491 * someone else accesses the socket the ICMP is just dropped
492 * and for some paths there is no check at all.
493 * A more general error queue to queue errors for later handling
494 * is probably better.
495 *
496 */
497
498void tcp_v4_err(struct sk_buff *skb, u32 info)
499{
500	struct iphdr *iph = (struct iphdr *)skb->data;
501	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502	struct tcp_sock *tp;
503	struct inet_sock *inet;
504	int type = skb->h.icmph->type;
505	int code = skb->h.icmph->code;
506	struct sock *sk;
507	__u32 seq;
508	int err;
509
510	if (skb->len < (iph->ihl << 2) + 8) {
511		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
512		return;
513	}
514
515	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
516			 th->source, inet_iif(skb));
517	if (!sk) {
518		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
519		return;
520	}
521	if (sk->sk_state == TCP_TIME_WAIT) {
522		inet_twsk_put((struct inet_timewait_sock *)sk);
523		return;
524	}
525
526	bh_lock_sock(sk);
527	/* If too many ICMPs get dropped on busy
528	 * servers this needs to be solved differently.
529	 */
530	if (sock_owned_by_user(sk))
531		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
532
533	if (sk->sk_state == TCP_CLOSE)
534		goto out;
535
536	tp = tcp_sk(sk);
537	seq = ntohl(th->seq);
538	if (sk->sk_state != TCP_LISTEN &&
539	    !between(seq, tp->snd_una, tp->snd_nxt)) {
540		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
541		goto out;
542	}
543
544	switch (type) {
545	case ICMP_SOURCE_QUENCH:
546		/* Just silently ignore these. */
547		goto out;
548	case ICMP_PARAMETERPROB:
549		err = EPROTO;
550		break;
551	case ICMP_DEST_UNREACH:
552		if (code > NR_ICMP_UNREACH)
553			goto out;
554
555		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556			if (!sock_owned_by_user(sk))
557				do_pmtu_discovery(sk, iph, info);
558			goto out;
559		}
560
561		err = icmp_err_convert[code].errno;
562		break;
563	case ICMP_TIME_EXCEEDED:
564		err = EHOSTUNREACH;
565		break;
566	default:
567		goto out;
568	}
569
570	switch (sk->sk_state) {
571		struct request_sock *req, **prev;
572	case TCP_LISTEN:
573		if (sock_owned_by_user(sk))
574			goto out;
575
576		req = inet_csk_search_req(sk, &prev, th->dest,
577					  iph->daddr, iph->saddr);
578		if (!req)
579			goto out;
580
581		/* ICMPs are not backlogged, hence we cannot get
582		   an established socket here.
583		 */
584		BUG_TRAP(!req->sk);
585
586		if (seq != tcp_rsk(req)->snt_isn) {
587			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
588			goto out;
589		}
590
591		/*
592		 * Still in SYN_RECV, just remove it silently.
593		 * There is no good way to pass the error to the newly
594		 * created socket, and POSIX does not want network
595		 * errors returned from accept().
596		 */
597		inet_csk_reqsk_queue_drop(sk, req, prev);
598		goto out;
599
600	case TCP_SYN_SENT:
601	case TCP_SYN_RECV:  /* Cannot normally happen.
602			       It can, e.g., if SYNs crossed.
603			     */
604		if (!sock_owned_by_user(sk)) {
605			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
606			sk->sk_err = err;
607
608			sk->sk_error_report(sk);
609
610			tcp_done(sk);
611		} else {
612			sk->sk_err_soft = err;
613		}
614		goto out;
615	}
616
617	/* If we've already connected we will keep trying
618	 * until we time out, or the user gives up.
619	 *
620	 * rfc1122 4.2.3.9 allows us to consider as hard errors
621	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
622	 * but it is obsoleted by pmtu discovery).
623	 *
624	 * Note that in the modern internet, where routing is unreliable
625	 * and broken firewalls sit in every dark corner, sending random
626	 * errors ordered by their masters, even these two messages finally lose
627	 * their original sense (even Linux sends invalid PORT_UNREACHs).
628	 *
629	 * Now we are in compliance with RFCs.
630	 *							--ANK (980905)
631	 */
632
633	inet = inet_sk(sk);
634	if (!sock_owned_by_user(sk) && inet->recverr) {
635		sk->sk_err = err;
636		sk->sk_error_report(sk);
637	} else	{ /* Only an error on timeout */
638		sk->sk_err_soft = err;
639	}
640
641out:
642	bh_unlock_sock(sk);
643	sock_put(sk);
644}
645
646/* This routine computes an IPv4 TCP checksum. */
647void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
648		       struct sk_buff *skb)
649{
650	struct inet_sock *inet = inet_sk(sk);
651
652	if (skb->ip_summed == CHECKSUM_HW) {
653		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
654		skb->csum = offsetof(struct tcphdr, check);
655	} else {
656		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
657					 csum_partial((char *)th,
658						      th->doff << 2,
659						      skb->csum));
660	}
661}
662
663/*
664 *	This routine will send an RST to the other tcp.
665 *
666 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
667 *		      for the reset?
668 *	Answer: if a packet caused the RST, it is not for a socket
669 *		existing in our system; if it is matched to a socket,
670 *		it is just a duplicate segment or a bug in the other side's TCP.
671 *		So we build the reply based only on parameters
672 *		that arrived with the segment.
673 *	Exception: precedence violation. We do not implement it in any case.
674 */
675
676static void tcp_v4_send_reset(struct sk_buff *skb)
677{
678	struct tcphdr *th = skb->h.th;
679	struct tcphdr rth;
680	struct ip_reply_arg arg;
681
682	/* Never send a reset in response to a reset. */
683	if (th->rst)
684		return;
685
686	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
687		return;
688
689	/* Swap the send and the receive. */
690	memset(&rth, 0, sizeof(struct tcphdr));
691	rth.dest   = th->source;
692	rth.source = th->dest;
693	rth.doff   = sizeof(struct tcphdr) / 4;
694	rth.rst    = 1;
695
696	if (th->ack) {
697		rth.seq = th->ack_seq;
698	} else {
699		rth.ack = 1;
700		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701				    skb->len - (th->doff << 2));
702	}
703
704	memset(&arg, 0, sizeof arg);
705	arg.iov[0].iov_base = (unsigned char *)&rth;
706	arg.iov[0].iov_len  = sizeof rth;
707	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
708				      skb->nh.iph->saddr, /*XXX*/
709				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
710	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
711
712	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
713
714	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
715	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
716}
717
718/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
719   outside socket context, is certainly ugly. What can I do?
720 */
721
722static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
723			    u32 win, u32 ts)
724{
725	struct tcphdr *th = skb->h.th;
726	struct {
727		struct tcphdr th;
728		u32 tsopt[3];
729	} rep;
730	struct ip_reply_arg arg;
731
732	memset(&rep.th, 0, sizeof(struct tcphdr));
733	memset(&arg, 0, sizeof arg);
734
735	arg.iov[0].iov_base = (unsigned char *)&rep;
736	arg.iov[0].iov_len  = sizeof(rep.th);
737	if (ts) {
738		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739				     (TCPOPT_TIMESTAMP << 8) |
740				     TCPOLEN_TIMESTAMP);
741		rep.tsopt[1] = htonl(tcp_time_stamp);
742		rep.tsopt[2] = htonl(ts);
743		arg.iov[0].iov_len = sizeof(rep);
744	}
745
746	/* Swap the send and the receive. */
747	rep.th.dest    = th->source;
748	rep.th.source  = th->dest;
749	rep.th.doff    = arg.iov[0].iov_len / 4;
750	rep.th.seq     = htonl(seq);
751	rep.th.ack_seq = htonl(ack);
752	rep.th.ack     = 1;
753	rep.th.window  = htons(win);
754
755	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
756				      skb->nh.iph->saddr, /*XXX*/
757				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
758	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
759
760	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
761
762	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
763}
764
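/* Send an ACK on behalf of a TIME-WAIT socket, echoing its saved sequence
 * state and timestamp, then drop the timewait reference.
 */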
765static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
766{
767	struct inet_timewait_sock *tw = inet_twsk(sk);
768	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
769
770	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
771			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
772
773	inet_twsk_put(tw);
774}
775
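/* ACK a segment on behalf of a pending connection request (SYN-RECV). */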
776static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
777{
778	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
779			req->ts_recent);
780}
781
782/*
783 *	Send a SYN-ACK after having received an ACK.
784 *	This still operates on a request_sock only, not on a big
785 *	socket.
786 */
787static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
788			      struct dst_entry *dst)
789{
790	const struct inet_request_sock *ireq = inet_rsk(req);
791	int err = -1;
792	struct sk_buff * skb;
793
794	/* First, grab a route. */
795	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
796		goto out;
797
798	skb = tcp_make_synack(sk, dst, req);
799
800	if (skb) {
801		struct tcphdr *th = skb->h.th;
802
803		th->check = tcp_v4_check(th, skb->len,
804					 ireq->loc_addr,
805					 ireq->rmt_addr,
806					 csum_partial((char *)th, skb->len,
807						      skb->csum));
808
809		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
810					    ireq->rmt_addr,
811					    ireq->opt);
812		if (err == NET_XMIT_CN)
813			err = 0;
814	}
815
816out:
817	dst_release(dst);
818	return err;
819}
820
821/*
822 *	IPv4 request_sock destructor.
823 */
824static void tcp_v4_reqsk_destructor(struct request_sock *req)
825{
826	kfree(inet_rsk(req)->opt);
827}
828
829static inline void syn_flood_warning(struct sk_buff *skb)
830{
831	static unsigned long warntime;
832
833	if (time_after(jiffies, (warntime + HZ * 60))) {
834		warntime = jiffies;
835		printk(KERN_INFO
836		       "possible SYN flooding on port %d. Sending cookies.\n",
837		       ntohs(skb->h.th->dest));
838	}
839}
840
841/*
842 * Save and compile IPv4 options into the request_sock if needed.
843 */
844static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
845						     struct sk_buff *skb)
846{
847	struct ip_options *opt = &(IPCB(skb)->opt);
848	struct ip_options *dopt = NULL;
849
850	if (opt && opt->optlen) {
851		int opt_size = optlength(opt);
852		dopt = kmalloc(opt_size, GFP_ATOMIC);
853		if (dopt) {
854			if (ip_options_echo(dopt, skb)) {
855				kfree(dopt);
856				dopt = NULL;
857			}
858		}
859	}
860	return dopt;
861}
862
863struct request_sock_ops tcp_request_sock_ops = {
864	.family		=	PF_INET,
865	.obj_size	=	sizeof(struct tcp_request_sock),
866	.rtx_syn_ack	=	tcp_v4_send_synack,
867	.send_ack	=	tcp_v4_reqsk_send_ack,
868	.destructor	=	tcp_v4_reqsk_destructor,
869	.send_reset	=	tcp_v4_send_reset,
870};
871
872int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
873{
874	struct inet_request_sock *ireq;
875	struct tcp_options_received tmp_opt;
876	struct request_sock *req;
877	__u32 saddr = skb->nh.iph->saddr;
878	__u32 daddr = skb->nh.iph->daddr;
879	__u32 isn = TCP_SKB_CB(skb)->when;
880	struct dst_entry *dst = NULL;
881#ifdef CONFIG_SYN_COOKIES
882	int want_cookie = 0;
883#else
884#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
885#endif
886
887	/* Never answer SYNs sent to broadcast or multicast */
888	if (((struct rtable *)skb->dst)->rt_flags &
889	    (RTCF_BROADCAST | RTCF_MULTICAST))
890		goto drop;
891
892	/* TW buckets are converted to open requests without
893	 * limitation; they conserve resources and the peer is
894	 * evidently a real one.
895	 */
896	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
897#ifdef CONFIG_SYN_COOKIES
898		if (sysctl_tcp_syncookies) {
899			want_cookie = 1;
900		} else
901#endif
902		goto drop;
903	}
904
905	/* Accept backlog is full. If we have already queued enough
906	 * warm entries in the syn queue, drop the request. It is better than
907	 * clogging the syn queue with openreqs with exponentially increasing
908	 * timeouts.
909	 */
910	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
911		goto drop;
912
913	req = reqsk_alloc(&tcp_request_sock_ops);
914	if (!req)
915		goto drop;
916
917	tcp_clear_options(&tmp_opt);
918	tmp_opt.mss_clamp = 536;
919	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
920
921	tcp_parse_options(skb, &tmp_opt, 0);
922
923	if (want_cookie) {
924		tcp_clear_options(&tmp_opt);
925		tmp_opt.saw_tstamp = 0;
926	}
927
928	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
929		/* Some OSes (unknown ones, but I see them on a web server that
930		 * contains information interesting only for windows
931		 * users) do not send their stamp in the SYN. It is the easy case.
932		 * We simply do not advertise TS support.
933		 */
934		tmp_opt.saw_tstamp = 0;
935		tmp_opt.tstamp_ok  = 0;
936	}
937	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
938
939	tcp_openreq_init(req, &tmp_opt, skb);
940
941	ireq = inet_rsk(req);
942	ireq->loc_addr = daddr;
943	ireq->rmt_addr = saddr;
944	ireq->opt = tcp_v4_save_options(sk, skb);
945	if (!want_cookie)
946		TCP_ECN_create_request(req, skb->h.th);
947
948	if (want_cookie) {
949#ifdef CONFIG_SYN_COOKIES
950		syn_flood_warning(skb);
951#endif
952		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
953	} else if (!isn) {
954		struct inet_peer *peer = NULL;
955
956		/* VJ's idea. We save last timestamp seen
957		 * from the destination in peer table, when entering
958		 * state TIME-WAIT, and check against it before
959		 * accepting new connection request.
960		 *
961		 * If "isn" is not zero, this request hit alive
962		 * timewait bucket, so that all the necessary checks
963		 * are made in the function processing timewait state.
964		 */
965		if (tmp_opt.saw_tstamp &&
966		    tcp_death_row.sysctl_tw_recycle &&
967		    (dst = inet_csk_route_req(sk, req)) != NULL &&
968		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
969		    peer->v4daddr == saddr) {
970			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
971			    (s32)(peer->tcp_ts - req->ts_recent) >
972							TCP_PAWS_WINDOW) {
973				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
974				dst_release(dst);
975				goto drop_and_free;
976			}
977		}
978		/* Kill the following clause, if you dislike this way. */
979		else if (!sysctl_tcp_syncookies &&
980			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
981			  (sysctl_max_syn_backlog >> 2)) &&
982			 (!peer || !peer->tcp_ts_stamp) &&
983			 (!dst || !dst_metric(dst, RTAX_RTT))) {
984			/* Without syncookies the last quarter of the
985			 * backlog is filled with destinations
986			 * proven to be alive.
987			 * It means that we continue to communicate only
988			 * with destinations already remembered
989			 * at the moment of the synflood.
990			 */
991			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
992				       "request from %u.%u.%u.%u/%u\n",
993				       NIPQUAD(saddr),
994				       ntohs(skb->h.th->source));
995			dst_release(dst);
996			goto drop_and_free;
997		}
998
999		isn = tcp_v4_init_sequence(sk, skb);
1000	}
1001	tcp_rsk(req)->snt_isn = isn;
1002
1003	if (tcp_v4_send_synack(sk, req, dst))
1004		goto drop_and_free;
1005
1006	if (want_cookie) {
1007	   	reqsk_free(req);
1008	} else {
1009		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1010	}
1011	return 0;
1012
1013drop_and_free:
1014	reqsk_free(req);
1015drop:
1016	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1017	return 0;
1018}
1019
1020
1021/*
1022 * The three way handshake has completed - we received a valid ACK -
1023 * now create the new socket.
1024 */
1025struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1026				  struct request_sock *req,
1027				  struct dst_entry *dst)
1028{
1029	struct inet_request_sock *ireq;
1030	struct inet_sock *newinet;
1031	struct tcp_sock *newtp;
1032	struct sock *newsk;
1033
1034	if (sk_acceptq_is_full(sk))
1035		goto exit_overflow;
1036
1037	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1038		goto exit;
1039
1040	newsk = tcp_create_openreq_child(sk, req, skb);
1041	if (!newsk)
1042		goto exit;
1043
1044	sk_setup_caps(newsk, dst);
1045
1046	newtp		      = tcp_sk(newsk);
1047	newinet		      = inet_sk(newsk);
1048	ireq		      = inet_rsk(req);
1049	newinet->daddr	      = ireq->rmt_addr;
1050	newinet->rcv_saddr    = ireq->loc_addr;
1051	newinet->saddr	      = ireq->loc_addr;
1052	newinet->opt	      = ireq->opt;
1053	ireq->opt	      = NULL;
1054	newinet->mc_index     = inet_iif(skb);
1055	newinet->mc_ttl	      = skb->nh.iph->ttl;
1056	newtp->ext_header_len = 0;
1057	if (newinet->opt)
1058		newtp->ext_header_len = newinet->opt->optlen;
1059	newinet->id = newtp->write_seq ^ jiffies;
1060
1061	tcp_sync_mss(newsk, dst_mtu(dst));
1062	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1063	tcp_initialize_rcv_mss(newsk);
1064
1065	__inet_hash(&tcp_hashinfo, newsk, 0);
1066	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
1067
1068	return newsk;
1069
1070exit_overflow:
1071	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1072exit:
1073	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1074	dst_release(dst);
1075	return NULL;
1076}
1077
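/* A listening socket received a non-SYN segment: look for a matching
 * connection request or an already established child; fall back to
 * syncookie validation when enabled.
 */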
1078static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1079{
1080	struct tcphdr *th = skb->h.th;
1081	struct iphdr *iph = skb->nh.iph;
1082	struct sock *nsk;
1083	struct request_sock **prev;
1084	/* Find possible connection requests. */
1085	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1086						       iph->saddr, iph->daddr);
1087	if (req)
1088		return tcp_check_req(sk, skb, req, prev);
1089
1090	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1091					th->source, skb->nh.iph->daddr,
1092					ntohs(th->dest), inet_iif(skb));
1093
1094	if (nsk) {
1095		if (nsk->sk_state != TCP_TIME_WAIT) {
1096			bh_lock_sock(nsk);
1097			return nsk;
1098		}
1099		inet_twsk_put((struct inet_timewait_sock *)nsk);
1100		return NULL;
1101	}
1102
1103#ifdef CONFIG_SYN_COOKIES
1104	if (!th->rst && !th->syn && th->ack)
1105		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1106#endif
1107	return sk;
1108}
1109
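/* Validate or set up the checksum of an incoming segment: trust a correct
 * hardware checksum, verify short packets immediately, and leave longer
 * ones to be checksummed when they are copied.
 */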
1110static int tcp_v4_checksum_init(struct sk_buff *skb)
1111{
1112	if (skb->ip_summed == CHECKSUM_HW) {
1113		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1114				  skb->nh.iph->daddr, skb->csum)) {
1115			skb->ip_summed = CHECKSUM_UNNECESSARY;
1116			return 0;
1117		}
1118	}
1119
1120	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1121				       skb->len, IPPROTO_TCP, 0);
1122
1123	if (skb->len <= 76) {
1124		return __skb_checksum_complete(skb);
1125	}
1126	return 0;
1127}
1128
1129
1130/* The socket must have its spinlock held when we get
1131 * here.
1132 *
1133 * We have a potential double-lock case here, so even when
1134 * doing backlog processing we use the BH locking scheme.
1135 * This is because we cannot sleep with the original spinlock
1136 * held.
1137 */
1138int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1139{
1140	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1141		TCP_CHECK_TIMER(sk);
1142		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1143			goto reset;
1144		TCP_CHECK_TIMER(sk);
1145		return 0;
1146	}
1147
1148	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1149		goto csum_err;
1150
1151	if (sk->sk_state == TCP_LISTEN) {
1152		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1153		if (!nsk)
1154			goto discard;
1155
1156		if (nsk != sk) {
1157			if (tcp_child_process(sk, nsk, skb))
1158				goto reset;
1159			return 0;
1160		}
1161	}
1162
1163	TCP_CHECK_TIMER(sk);
1164	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1165		goto reset;
1166	TCP_CHECK_TIMER(sk);
1167	return 0;
1168
1169reset:
1170	tcp_v4_send_reset(skb);
1171discard:
1172	kfree_skb(skb);
1173	/* Be careful here. If this function gets more complicated and
1174	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1175	 * might be destroyed here. This current version compiles correctly,
1176	 * but you have been warned.
1177	 */
1178	return 0;
1179
1180csum_err:
1181	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1182	goto discard;
1183}
1184
1185/*
1186 *	From tcp_input.c
1187 */
1188
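/* Main receive routine for IPv4 TCP segments: validate the header and
 * checksum, look up the owning socket and hand the segment to it (or to
 * the timewait/reset handling paths).
 */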
1189int tcp_v4_rcv(struct sk_buff *skb)
1190{
1191	struct tcphdr *th;
1192	struct sock *sk;
1193	int ret;
1194
1195	if (skb->pkt_type != PACKET_HOST)
1196		goto discard_it;
1197
1198	/* Count it even if it's bad */
1199	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1200
1201	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1202		goto discard_it;
1203
1204	th = skb->h.th;
1205
1206	if (th->doff < sizeof(struct tcphdr) / 4)
1207		goto bad_packet;
1208	if (!pskb_may_pull(skb, th->doff * 4))
1209		goto discard_it;
1210
1211	/* An explanation is required here, I think.
1212	 * Packet length and doff are validated by header prediction,
1213	 * provided the case of th->doff==0 is eliminated.
1214	 * So, we defer the checks. */
1215	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1216	     tcp_v4_checksum_init(skb)))
1217		goto bad_packet;
1218
1219	th = skb->h.th;
1220	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1221	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1222				    skb->len - th->doff * 4);
1223	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1224	TCP_SKB_CB(skb)->when	 = 0;
1225	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1226	TCP_SKB_CB(skb)->sacked	 = 0;
1227
1228	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1229			   skb->nh.iph->daddr, ntohs(th->dest),
1230			   inet_iif(skb));
1231
1232	if (!sk)
1233		goto no_tcp_socket;
1234
1235process:
1236	if (sk->sk_state == TCP_TIME_WAIT)
1237		goto do_time_wait;
1238
1239	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1240		goto discard_and_relse;
1241
1242	if (sk_filter(sk, skb, 0))
1243		goto discard_and_relse;
1244
1245	skb->dev = NULL;
1246
1247	bh_lock_sock(sk);
1248	ret = 0;
1249	if (!sock_owned_by_user(sk)) {
1250		if (!tcp_prequeue(sk, skb))
1251			ret = tcp_v4_do_rcv(sk, skb);
1252	} else
1253		sk_add_backlog(sk, skb);
1254	bh_unlock_sock(sk);
1255
1256	sock_put(sk);
1257
1258	return ret;
1259
1260no_tcp_socket:
1261	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1262		goto discard_it;
1263
1264	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1265bad_packet:
1266		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1267	} else {
1268		tcp_v4_send_reset(skb);
1269	}
1270
1271discard_it:
1272	/* Discard frame. */
1273	kfree_skb(skb);
1274  	return 0;
1275
1276discard_and_relse:
1277	sock_put(sk);
1278	goto discard_it;
1279
1280do_time_wait:
1281	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1282		inet_twsk_put((struct inet_timewait_sock *) sk);
1283		goto discard_it;
1284	}
1285
1286	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1287		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1288		inet_twsk_put((struct inet_timewait_sock *) sk);
1289		goto discard_it;
1290	}
1291	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1292					   skb, th)) {
1293	case TCP_TW_SYN: {
1294		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1295							skb->nh.iph->daddr,
1296							ntohs(th->dest),
1297							inet_iif(skb));
1298		if (sk2) {
1299			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1300					     &tcp_death_row);
1301			inet_twsk_put((struct inet_timewait_sock *)sk);
1302			sk = sk2;
1303			goto process;
1304		}
1305		/* Fall through to ACK */
1306	}
1307	case TCP_TW_ACK:
1308		tcp_v4_timewait_ack(sk, skb);
1309		break;
1310	case TCP_TW_RST:
1311		goto no_tcp_socket;
1312	case TCP_TW_SUCCESS:;
1313	}
1314	goto discard_it;
1315}
1316
1317static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318{
1319	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320	struct inet_sock *inet = inet_sk(sk);
1321
1322	sin->sin_family		= AF_INET;
1323	sin->sin_addr.s_addr	= inet->daddr;
1324	sin->sin_port		= inet->dport;
1325}
1326
1327/* VJ's idea. Save the last timestamp seen from this destination
1328 * and hold it at least for the normal timewait interval, to use for
1329 * duplicate segment detection in subsequent connections before they
1330 * enter the synchronized state.
1331 */
1332
1333int tcp_v4_remember_stamp(struct sock *sk)
1334{
1335	struct inet_sock *inet = inet_sk(sk);
1336	struct tcp_sock *tp = tcp_sk(sk);
1337	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1338	struct inet_peer *peer = NULL;
1339	int release_it = 0;
1340
1341	if (!rt || rt->rt_dst != inet->daddr) {
1342		peer = inet_getpeer(inet->daddr, 1);
1343		release_it = 1;
1344	} else {
1345		if (!rt->peer)
1346			rt_bind_peer(rt, 1);
1347		peer = rt->peer;
1348	}
1349
1350	if (peer) {
1351		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1352		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1353		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1354			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1355			peer->tcp_ts = tp->rx_opt.ts_recent;
1356		}
1357		if (release_it)
1358			inet_putpeer(peer);
1359		return 1;
1360	}
1361
1362	return 0;
1363}
1364
1365int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1366{
1367	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1368
1369	if (peer) {
1370		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1371
1372		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1373		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1374		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1375			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1376			peer->tcp_ts	   = tcptw->tw_ts_recent;
1377		}
1378		inet_putpeer(peer);
1379		return 1;
1380	}
1381
1382	return 0;
1383}
1384
1385struct tcp_func ipv4_specific = {
1386	.queue_xmit	=	ip_queue_xmit,
1387	.send_check	=	tcp_v4_send_check,
1388	.rebuild_header	=	inet_sk_rebuild_header,
1389	.conn_request	=	tcp_v4_conn_request,
1390	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
1391	.remember_stamp	=	tcp_v4_remember_stamp,
1392	.net_header_len	=	sizeof(struct iphdr),
1393	.setsockopt	=	ip_setsockopt,
1394	.getsockopt	=	ip_getsockopt,
1395	.addr2sockaddr	=	v4_addr2sockaddr,
1396	.sockaddr_len	=	sizeof(struct sockaddr_in),
1397};
1398
1399/* NOTE: A lot of things are set to zero explicitly by the call to
1400 *       sk_alloc(), so they need not be done here.
1401 */
1402static int tcp_v4_init_sock(struct sock *sk)
1403{
1404	struct inet_connection_sock *icsk = inet_csk(sk);
1405	struct tcp_sock *tp = tcp_sk(sk);
1406
1407	skb_queue_head_init(&tp->out_of_order_queue);
1408	tcp_init_xmit_timers(sk);
1409	tcp_prequeue_init(tp);
1410
1411	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1412	tp->mdev = TCP_TIMEOUT_INIT;
1413
1414	/* So many TCP implementations out there (incorrectly) count the
1415	 * initial SYN frame in their delayed-ACK and congestion control
1416	 * algorithms that we must have the following bandaid to talk
1417	 * efficiently to them.  -DaveM
1418	 */
1419	tp->snd_cwnd = 2;
1420
1421	/* See draft-stevens-tcpca-spec-01 for discussion of the
1422	 * initialization of these values.
1423	 */
1424	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1425	tp->snd_cwnd_clamp = ~0;
1426	tp->mss_cache = 536;
1427
1428	tp->reordering = sysctl_tcp_reordering;
1429	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1430
1431	sk->sk_state = TCP_CLOSE;
1432
1433	sk->sk_write_space = sk_stream_write_space;
1434	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435
1436	tp->af_specific = &ipv4_specific;
1437
1438	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1440
1441	atomic_inc(&tcp_sockets_allocated);
1442
1443	return 0;
1444}
1445
1446int tcp_v4_destroy_sock(struct sock *sk)
1447{
1448	struct tcp_sock *tp = tcp_sk(sk);
1449
1450	tcp_clear_xmit_timers(sk);
1451
1452	tcp_cleanup_congestion_control(sk);
1453
1454	/* Clean up the write buffer. */
1455  	sk_stream_writequeue_purge(sk);
1456
1457	/* Cleans up our, hopefully empty, out_of_order_queue. */
1458  	__skb_queue_purge(&tp->out_of_order_queue);
1459
1460	/* Clean the prequeue; it really must be empty */
1461	__skb_queue_purge(&tp->ucopy.prequeue);
1462
1463	/* Clean up a referenced TCP bind bucket. */
1464	if (inet_csk(sk)->icsk_bind_hash)
1465		inet_put_port(&tcp_hashinfo, sk);
1466
1467	/*
1468	 * If sendmsg cached page exists, toss it.
1469	 */
1470	if (sk->sk_sndmsg_page) {
1471		__free_page(sk->sk_sndmsg_page);
1472		sk->sk_sndmsg_page = NULL;
1473	}
1474
1475	atomic_dec(&tcp_sockets_allocated);
1476
1477	return 0;
1478}
1479
1480EXPORT_SYMBOL(tcp_v4_destroy_sock);
1481
1482#ifdef CONFIG_PROC_FS
1483/* Proc filesystem TCP sock list dumping. */
1484
1485static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1486{
1487	return hlist_empty(head) ? NULL :
1488		list_entry(head->first, struct inet_timewait_sock, tw_node);
1489}
1490
1491static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1492{
1493	return tw->tw_node.next ?
1494		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1495}
1496
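/* Walk the listening hash (and each listener's SYN queue) to the next
 * socket or open request of the requested family.
 */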
1497static void *listening_get_next(struct seq_file *seq, void *cur)
1498{
1499	struct inet_connection_sock *icsk;
1500	struct hlist_node *node;
1501	struct sock *sk = cur;
1502	struct tcp_iter_state* st = seq->private;
1503
1504	if (!sk) {
1505		st->bucket = 0;
1506		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1507		goto get_sk;
1508	}
1509
1510	++st->num;
1511
1512	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1513		struct request_sock *req = cur;
1514
1515	       	icsk = inet_csk(st->syn_wait_sk);
1516		req = req->dl_next;
1517		while (1) {
1518			while (req) {
1519				if (req->rsk_ops->family == st->family) {
1520					cur = req;
1521					goto out;
1522				}
1523				req = req->dl_next;
1524			}
1525			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1526				break;
1527get_req:
1528			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1529		}
1530		sk	  = sk_next(st->syn_wait_sk);
1531		st->state = TCP_SEQ_STATE_LISTENING;
1532		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1533	} else {
1534	       	icsk = inet_csk(sk);
1535		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1536		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1537			goto start_req;
1538		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1539		sk = sk_next(sk);
1540	}
1541get_sk:
1542	sk_for_each_from(sk, node) {
1543		if (sk->sk_family == st->family) {
1544			cur = sk;
1545			goto out;
1546		}
1547	       	icsk = inet_csk(sk);
1548		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1549		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1550start_req:
1551			st->uid		= sock_i_uid(sk);
1552			st->syn_wait_sk = sk;
1553			st->state	= TCP_SEQ_STATE_OPENREQ;
1554			st->sbucket	= 0;
1555			goto get_req;
1556		}
1557		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1558	}
1559	if (++st->bucket < INET_LHTABLE_SIZE) {
1560		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1561		goto get_sk;
1562	}
1563	cur = NULL;
1564out:
1565	return cur;
1566}
1567
1568static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1569{
1570	void *rc = listening_get_next(seq, NULL);
1571
1572	while (rc && *pos) {
1573		rc = listening_get_next(seq, rc);
1574		--*pos;
1575	}
1576	return rc;
1577}
1578
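/* Find the first established or TIME-WAIT socket of the requested family;
 * the matching bucket's lock is left held for the iterator.
 */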
1579static void *established_get_first(struct seq_file *seq)
1580{
1581	struct tcp_iter_state* st = seq->private;
1582	void *rc = NULL;
1583
1584	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1585		struct sock *sk;
1586		struct hlist_node *node;
1587		struct inet_timewait_sock *tw;
1588
1589		/* We can reschedule _before_ having picked the target: */
1590		cond_resched_softirq();
1591
1592		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1593		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1594			if (sk->sk_family != st->family) {
1595				continue;
1596			}
1597			rc = sk;
1598			goto out;
1599		}
1600		st->state = TCP_SEQ_STATE_TIME_WAIT;
1601		inet_twsk_for_each(tw, node,
1602				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1603			if (tw->tw_family != st->family) {
1604				continue;
1605			}
1606			rc = tw;
1607			goto out;
1608		}
1609		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1610		st->state = TCP_SEQ_STATE_ESTABLISHED;
1611	}
1612out:
1613	return rc;
1614}
1615
1616static void *established_get_next(struct seq_file *seq, void *cur)
1617{
1618	struct sock *sk = cur;
1619	struct inet_timewait_sock *tw;
1620	struct hlist_node *node;
1621	struct tcp_iter_state* st = seq->private;
1622
1623	++st->num;
1624
1625	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1626		tw = cur;
1627		tw = tw_next(tw);
1628get_tw:
1629		while (tw && tw->tw_family != st->family) {
1630			tw = tw_next(tw);
1631		}
1632		if (tw) {
1633			cur = tw;
1634			goto out;
1635		}
1636		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1637		st->state = TCP_SEQ_STATE_ESTABLISHED;
1638
1639		/* We can reschedule between buckets: */
1640		cond_resched_softirq();
1641
1642		if (++st->bucket < tcp_hashinfo.ehash_size) {
1643			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1644			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1645		} else {
1646			cur = NULL;
1647			goto out;
1648		}
1649	} else
1650		sk = sk_next(sk);
1651
1652	sk_for_each_from(sk, node) {
1653		if (sk->sk_family == st->family)
1654			goto found;
1655	}
1656
1657	st->state = TCP_SEQ_STATE_TIME_WAIT;
1658	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1659	goto get_tw;
1660found:
1661	cur = sk;
1662out:
1663	return cur;
1664}
1665
1666static void *established_get_idx(struct seq_file *seq, loff_t pos)
1667{
1668	void *rc = established_get_first(seq);
1669
1670	while (rc && pos) {
1671		rc = established_get_next(seq, rc);
1672		--pos;
1673	}
1674	return rc;
1675}
1676
1677static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1678{
1679	void *rc;
1680	struct tcp_iter_state* st = seq->private;
1681
1682	inet_listen_lock(&tcp_hashinfo);
1683	st->state = TCP_SEQ_STATE_LISTENING;
1684	rc	  = listening_get_idx(seq, &pos);
1685
1686	if (!rc) {
1687		inet_listen_unlock(&tcp_hashinfo);
1688		local_bh_disable();
1689		st->state = TCP_SEQ_STATE_ESTABLISHED;
1690		rc	  = established_get_idx(seq, pos);
1691	}
1692
1693	return rc;
1694}
1695
1696static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1697{
1698	struct tcp_iter_state* st = seq->private;
1699	st->state = TCP_SEQ_STATE_LISTENING;
1700	st->num = 0;
1701	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1702}
1703
1704static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1705{
1706	void *rc = NULL;
1707	struct tcp_iter_state* st;
1708
1709	if (v == SEQ_START_TOKEN) {
1710		rc = tcp_get_idx(seq, 0);
1711		goto out;
1712	}
1713	st = seq->private;
1714
1715	switch (st->state) {
1716	case TCP_SEQ_STATE_OPENREQ:
1717	case TCP_SEQ_STATE_LISTENING:
1718		rc = listening_get_next(seq, v);
1719		if (!rc) {
1720			inet_listen_unlock(&tcp_hashinfo);
1721			local_bh_disable();
1722			st->state = TCP_SEQ_STATE_ESTABLISHED;
1723			rc	  = established_get_first(seq);
1724		}
1725		break;
1726	case TCP_SEQ_STATE_ESTABLISHED:
1727	case TCP_SEQ_STATE_TIME_WAIT:
1728		rc = established_get_next(seq, v);
1729		break;
1730	}
1731out:
1732	++*pos;
1733	return rc;
1734}
1735
1736static void tcp_seq_stop(struct seq_file *seq, void *v)
1737{
1738	struct tcp_iter_state* st = seq->private;
1739
1740	switch (st->state) {
1741	case TCP_SEQ_STATE_OPENREQ:
1742		if (v) {
1743			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1744			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1745		}
1746	case TCP_SEQ_STATE_LISTENING:
1747		if (v != SEQ_START_TOKEN)
1748			inet_listen_unlock(&tcp_hashinfo);
1749		break;
1750	case TCP_SEQ_STATE_TIME_WAIT:
1751	case TCP_SEQ_STATE_ESTABLISHED:
1752		if (v)
1753			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1754		local_bh_enable();
1755		break;
1756	}
1757}
1758
1759static int tcp_seq_open(struct inode *inode, struct file *file)
1760{
1761	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1762	struct seq_file *seq;
1763	struct tcp_iter_state *s;
1764	int rc;
1765
1766	if (unlikely(afinfo == NULL))
1767		return -EINVAL;
1768
1769	s = kmalloc(sizeof(*s), GFP_KERNEL);
1770	if (!s)
1771		return -ENOMEM;
1772	memset(s, 0, sizeof(*s));
1773	s->family		= afinfo->family;
1774	s->seq_ops.start	= tcp_seq_start;
1775	s->seq_ops.next		= tcp_seq_next;
1776	s->seq_ops.show		= afinfo->seq_show;
1777	s->seq_ops.stop		= tcp_seq_stop;
1778
1779	rc = seq_open(file, &s->seq_ops);
1780	if (rc)
1781		goto out_kfree;
1782	seq	     = file->private_data;
1783	seq->private = s;
1784out:
1785	return rc;
1786out_kfree:
1787	kfree(s);
1788	goto out;
1789}
1790
1791int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1792{
1793	int rc = 0;
1794	struct proc_dir_entry *p;
1795
1796	if (!afinfo)
1797		return -EINVAL;
1798	afinfo->seq_fops->owner		= afinfo->owner;
1799	afinfo->seq_fops->open		= tcp_seq_open;
1800	afinfo->seq_fops->read		= seq_read;
1801	afinfo->seq_fops->llseek	= seq_lseek;
1802	afinfo->seq_fops->release	= seq_release_private;
1803
1804	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1805	if (p)
1806		p->data = afinfo;
1807	else
1808		rc = -ENOMEM;
1809	return rc;
1810}
1811
1812void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1813{
1814	if (!afinfo)
1815		return;
1816	proc_net_remove(afinfo->name);
1817	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1818}
1819
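/* Format one open request (SYN-RECV) as a /proc/net/tcp line. */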
1820static void get_openreq4(struct sock *sk, struct request_sock *req,
1821			 char *tmpbuf, int i, int uid)
1822{
1823	const struct inet_request_sock *ireq = inet_rsk(req);
1824	int ttd = req->expires - jiffies;
1825
1826	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1827		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1828		i,
1829		ireq->loc_addr,
1830		ntohs(inet_sk(sk)->sport),
1831		ireq->rmt_addr,
1832		ntohs(ireq->rmt_port),
1833		TCP_SYN_RECV,
1834		0, 0, /* could print option size, but that is af dependent. */
1835		1,    /* timers active (only the expire timer) */
1836		jiffies_to_clock_t(ttd),
1837		req->retrans,
1838		uid,
1839		0,  /* non standard timer */
1840		0, /* open_requests have no inode */
1841		atomic_read(&sk->sk_refcnt),
1842		req);
1843}
1844
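/* Format one full socket as a /proc/net/tcp line. */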
1845static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1846{
1847	int timer_active;
1848	unsigned long timer_expires;
1849	struct tcp_sock *tp = tcp_sk(sp);
1850	const struct inet_connection_sock *icsk = inet_csk(sp);
1851	struct inet_sock *inet = inet_sk(sp);
1852	unsigned int dest = inet->daddr;
1853	unsigned int src = inet->rcv_saddr;
1854	__u16 destp = ntohs(inet->dport);
1855	__u16 srcp = ntohs(inet->sport);
1856
1857	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1858		timer_active	= 1;
1859		timer_expires	= icsk->icsk_timeout;
1860	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1861		timer_active	= 4;
1862		timer_expires	= icsk->icsk_timeout;
1863	} else if (timer_pending(&sp->sk_timer)) {
1864		timer_active	= 2;
1865		timer_expires	= sp->sk_timer.expires;
1866	} else {
1867		timer_active	= 0;
1868		timer_expires = jiffies;
1869	}
1870
1871	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1872			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1873		i, src, srcp, dest, destp, sp->sk_state,
1874		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1875		timer_active,
1876		jiffies_to_clock_t(timer_expires - jiffies),
1877		icsk->icsk_retransmits,
1878		sock_i_uid(sp),
1879		icsk->icsk_probes_out,
1880		sock_i_ino(sp),
1881		atomic_read(&sp->sk_refcnt), sp,
1882		icsk->icsk_rto,
1883		icsk->icsk_ack.ato,
1884		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1885		tp->snd_cwnd,
1886		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1887}
1888
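/* Format one TIME-WAIT socket as a /proc/net/tcp line. */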
1889static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1890{
1891	unsigned int dest, src;
1892	__u16 destp, srcp;
1893	int ttd = tw->tw_ttd - jiffies;
1894
1895	if (ttd < 0)
1896		ttd = 0;
1897
1898	dest  = tw->tw_daddr;
1899	src   = tw->tw_rcv_saddr;
1900	destp = ntohs(tw->tw_dport);
1901	srcp  = ntohs(tw->tw_sport);
1902
1903	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1904		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1905		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1906		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1907		atomic_read(&tw->tw_refcnt), tw);
1908}
1909
1910#define TMPSZ 150
1911
1912static int tcp4_seq_show(struct seq_file *seq, void *v)
1913{
1914	struct tcp_iter_state* st;
1915	char tmpbuf[TMPSZ + 1];
1916
1917	if (v == SEQ_START_TOKEN) {
1918		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1919			   "  sl  local_address rem_address   st tx_queue "
1920			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1921			   "inode");
1922		goto out;
1923	}
1924	st = seq->private;
1925
1926	switch (st->state) {
1927	case TCP_SEQ_STATE_LISTENING:
1928	case TCP_SEQ_STATE_ESTABLISHED:
1929		get_tcp4_sock(v, tmpbuf, st->num);
1930		break;
1931	case TCP_SEQ_STATE_OPENREQ:
1932		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1933		break;
1934	case TCP_SEQ_STATE_TIME_WAIT:
1935		get_timewait4_sock(v, tmpbuf, st->num);
1936		break;
1937	}
1938	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1939out:
1940	return 0;
1941}
1942
1943static struct file_operations tcp4_seq_fops;
1944static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1945	.owner		= THIS_MODULE,
1946	.name		= "tcp",
1947	.family		= AF_INET,
1948	.seq_show	= tcp4_seq_show,
1949	.seq_fops	= &tcp4_seq_fops,
1950};
1951
1952int __init tcp4_proc_init(void)
1953{
1954	return tcp_proc_register(&tcp4_seq_afinfo);
1955}
1956
1957void tcp4_proc_exit(void)
1958{
1959	tcp_proc_unregister(&tcp4_seq_afinfo);
1960}
1961#endif /* CONFIG_PROC_FS */
1962
1963struct proto tcp_prot = {
1964	.name			= "TCP",
1965	.owner			= THIS_MODULE,
1966	.close			= tcp_close,
1967	.connect		= tcp_v4_connect,
1968	.disconnect		= tcp_disconnect,
1969	.accept			= inet_csk_accept,
1970	.ioctl			= tcp_ioctl,
1971	.init			= tcp_v4_init_sock,
1972	.destroy		= tcp_v4_destroy_sock,
1973	.shutdown		= tcp_shutdown,
1974	.setsockopt		= tcp_setsockopt,
1975	.getsockopt		= tcp_getsockopt,
1976	.sendmsg		= tcp_sendmsg,
1977	.recvmsg		= tcp_recvmsg,
1978	.backlog_rcv		= tcp_v4_do_rcv,
1979	.hash			= tcp_v4_hash,
1980	.unhash			= tcp_unhash,
1981	.get_port		= tcp_v4_get_port,
1982	.enter_memory_pressure	= tcp_enter_memory_pressure,
1983	.sockets_allocated	= &tcp_sockets_allocated,
1984	.orphan_count		= &tcp_orphan_count,
1985	.memory_allocated	= &tcp_memory_allocated,
1986	.memory_pressure	= &tcp_memory_pressure,
1987	.sysctl_mem		= sysctl_tcp_mem,
1988	.sysctl_wmem		= sysctl_tcp_wmem,
1989	.sysctl_rmem		= sysctl_tcp_rmem,
1990	.max_header		= MAX_TCP_HEADER,
1991	.obj_size		= sizeof(struct tcp_sock),
1992	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
1993	.rsk_prot		= &tcp_request_sock_ops,
1994};
1995
1996
1997
1998void __init tcp_v4_init(struct net_proto_family *ops)
1999{
2000	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2001	if (err < 0)
2002		panic("Failed to create the TCP control socket.\n");
2003	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2004	inet_sk(tcp_socket->sk)->uc_ttl = -1;
2005
2006	/* Unhash it so that IP input processing does not even
2007	 * see it; we do not wish this socket to see incoming
2008	 * packets.
2009	 */
2010	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2011}
2012
2013EXPORT_SYMBOL(ipv4_specific);
2014EXPORT_SYMBOL(inet_bind_bucket_create);
2015EXPORT_SYMBOL(tcp_hashinfo);
2016EXPORT_SYMBOL(tcp_prot);
2017EXPORT_SYMBOL(tcp_unhash);
2018EXPORT_SYMBOL(tcp_v4_conn_request);
2019EXPORT_SYMBOL(tcp_v4_connect);
2020EXPORT_SYMBOL(tcp_v4_do_rcv);
2021EXPORT_SYMBOL(tcp_v4_remember_stamp);
2022EXPORT_SYMBOL(tcp_v4_send_check);
2023EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2024
2025#ifdef CONFIG_PROC_FS
2026EXPORT_SYMBOL(tcp_proc_register);
2027EXPORT_SYMBOL(tcp_proc_unregister);
2028#endif
2029EXPORT_SYMBOL(sysctl_local_port_range);
2030EXPORT_SYMBOL(sysctl_tcp_low_latency);
2031EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2032
2033