tcp_ipv4.c revision 06ca719faddaf5ea46c6356b12847663c3ed8806
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
47 *					in a coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81int sysctl_tcp_tw_reuse __read_mostly;
82int sysctl_tcp_low_latency __read_mostly;
83
84/* Check TCP sequence numbers in ICMP packets. */
85#define ICMP_MIN_LENGTH 8
86
87/* Socket used for sending RSTs */
88static struct socket *tcp_socket;
89
90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93	.lhash_lock	= __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
94	.lhash_users	= ATOMIC_INIT(0),
95	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96};
97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{
100	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101				 inet_csk_bind_conflict);
102}
103
104static void tcp_v4_hash(struct sock *sk)
105{
106	inet_hash(&tcp_hashinfo, sk);
107}
108
109void tcp_unhash(struct sock *sk)
110{
111	inet_unhash(&tcp_hashinfo, sk);
112}
113
114static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
115{
116	return secure_tcp_sequence_number(skb->nh.iph->daddr,
117					  skb->nh.iph->saddr,
118					  skb->h.th->dest,
119					  skb->h.th->source);
120}
121
122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123{
124	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125	struct tcp_sock *tp = tcp_sk(sk);
126
127	/* With PAWS, it is safe from the viewpoint
128	   of data integrity. Even without PAWS it is safe provided the sequence
129	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
130
131	   Actually, the idea is close to VJ's: the timestamp cache is held
132	   not per host but per port pair, and the TW bucket is used as the
133	   state holder.
134
135	   If the TW bucket has already been destroyed, we fall back to VJ's
136	   scheme and use the initial timestamp retrieved from the peer table.
137	 */
138	if (tcptw->tw_ts_recent_stamp &&
139	    (twp == NULL || (sysctl_tcp_tw_reuse &&
140			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
141		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
142		if (tp->write_seq == 0)
143			tp->write_seq = 1;
144		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
145		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
146		sock_hold(sktw);
147		return 1;
148	}
149
150	return 0;
151}
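/* A worked example of the sequence-number bump above (illustrative
 * values, not taken from this file): if the old TIME-WAIT socket last
 * sent snd_nxt = 1000, the new incarnation starts its write_seq at
 * 1000 + 65535 + 2 = 66537, i.e. beyond the largest window (65535) the
 * peer could still associate with the old connection, plus a small
 * safety margin, so stray duplicates cannot be mistaken for new data.
 */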
152
153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154
155/* This will initiate an outgoing connection. */
156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157{
158	struct inet_sock *inet = inet_sk(sk);
159	struct tcp_sock *tp = tcp_sk(sk);
160	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161	struct rtable *rt;
162	__be32 daddr, nexthop;
163	int tmp;
164	int err;
165
166	if (addr_len < sizeof(struct sockaddr_in))
167		return -EINVAL;
168
169	if (usin->sin_family != AF_INET)
170		return -EAFNOSUPPORT;
171
172	nexthop = daddr = usin->sin_addr.s_addr;
173	if (inet->opt && inet->opt->srr) {
174		if (!daddr)
175			return -EINVAL;
176		nexthop = inet->opt->faddr;
177	}
178
179	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
180			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181			       IPPROTO_TCP,
182			       inet->sport, usin->sin_port, sk);
183	if (tmp < 0)
184		return tmp;
185
186	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187		ip_rt_put(rt);
188		return -ENETUNREACH;
189	}
190
191	if (!inet->opt || !inet->opt->srr)
192		daddr = rt->rt_dst;
193
194	if (!inet->saddr)
195		inet->saddr = rt->rt_src;
196	inet->rcv_saddr = inet->saddr;
197
198	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199		/* Reset inherited state */
200		tp->rx_opt.ts_recent	   = 0;
201		tp->rx_opt.ts_recent_stamp = 0;
202		tp->write_seq		   = 0;
203	}
204
205	if (tcp_death_row.sysctl_tw_recycle &&
206	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207		struct inet_peer *peer = rt_get_peer(rt);
208
209		/* VJ's idea: we save the last timestamp seen from
210		 * the destination in the peer table when entering TIME-WAIT,
211		 * and initialize rx_opt.ts_recent from it when trying a new connection.
212		 */
213
214		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
215			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216			tp->rx_opt.ts_recent = peer->tcp_ts;
217		}
218	}
219
220	inet->dport = usin->sin_port;
221	inet->daddr = daddr;
222
223	inet_csk(sk)->icsk_ext_hdr_len = 0;
224	if (inet->opt)
225		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226
227	tp->rx_opt.mss_clamp = 536;
228
229	/* Socket identity is still unknown (sport may be zero).
230	 * However, we set the state to SYN-SENT and, without releasing the
231	 * socket lock, select a source port, enter ourselves into the hash
232	 * tables and complete initialization afterwards.
233	 */
234	tcp_set_state(sk, TCP_SYN_SENT);
235	err = inet_hash_connect(&tcp_death_row, sk);
236	if (err)
237		goto failure;
238
239	err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
240	if (err)
241		goto failure;
242
243	/* OK, now commit destination to socket.  */
244	sk->sk_gso_type = SKB_GSO_TCPV4;
245	sk_setup_caps(sk, &rt->u.dst);
246
247	if (!tp->write_seq)
248		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
249							   inet->daddr,
250							   inet->sport,
251							   usin->sin_port);
252
253	inet->id = tp->write_seq ^ jiffies;
254
255	err = tcp_connect(sk);
256	rt = NULL;
257	if (err)
258		goto failure;
259
260	return 0;
261
262failure:
263	/* This unhashes the socket and releases the local port, if necessary. */
264	tcp_set_state(sk, TCP_CLOSE);
265	ip_rt_put(rt);
266	sk->sk_route_caps = 0;
267	inet->dport = 0;
268	return err;
269}
270
271/*
272 * This routine does path mtu discovery as defined in RFC1191.
273 */
274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275{
276	struct dst_entry *dst;
277	struct inet_sock *inet = inet_sk(sk);
278
279	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280	 * sent out by Linux are always < 576 bytes, so they should go through
281	 * unfragmented).
282	 */
283	if (sk->sk_state == TCP_LISTEN)
284		return;
285
286	/* We don't check in the dst entry whether PMTU discovery is forbidden
287	 * on this route. We just assume that no packet-too-big packets
288	 * are sent back when PMTU discovery is not active.
289	 * There is a small race when the user changes this flag in the
290	 * route, but I think that's acceptable.
291	 */
292	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293		return;
294
295	dst->ops->update_pmtu(dst, mtu);
296
297	/* Something is about to go wrong... Remember the soft error
298	 * in case this connection is not able to recover.
299	 */
300	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301		sk->sk_err_soft = EMSGSIZE;
302
303	mtu = dst_mtu(dst);
304
305	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307		tcp_sync_mss(sk, mtu);
308
309		/* Resend the TCP packet because it's
310		 * clear that the old packet has been
311		 * dropped. This is the new "fast" path mtu
312		 * discovery.
313		 */
314		tcp_simple_retransmit(sk);
315	} /* else let the usual retransmit timer handle it */
316}
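/* A rough worked example of the fast path MTU discovery above
 * (illustrative numbers): if the path previously carried 1500-byte
 * packets and an ICMP_FRAG_NEEDED arrives reporting mtu = 1400,
 * icsk_pmtu_cookie (> 1400) causes tcp_sync_mss() to shrink the MSS to
 * roughly 1400 minus the IP and TCP header overhead (about 1360 bytes
 * with no options), and tcp_simple_retransmit() resends the queued
 * segments at the smaller size instead of waiting for the retransmit
 * timer.
 */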
317
318/*
319 * This routine is called by the ICMP module when it gets some
320 * sort of error condition.  If err < 0 then the socket should
321 * be closed and the error returned to the user.  If err > 0
322 * it's just the icmp type << 8 | icmp code.  After adjustment
323 * header points to the first 8 bytes of the tcp header.  We need
324 * to find the appropriate port.
325 *
326 * The locking strategy used here is very "optimistic". When
327 * someone else is accessing the socket, the ICMP is simply dropped,
328 * and for some paths there is no check at all.
329 * A more general error queue for queueing errors for later handling
330 * would probably be better.
331 *
332 */
333
334void tcp_v4_err(struct sk_buff *skb, u32 info)
335{
336	struct iphdr *iph = (struct iphdr *)skb->data;
337	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338	struct tcp_sock *tp;
339	struct inet_sock *inet;
340	int type = skb->h.icmph->type;
341	int code = skb->h.icmph->code;
342	struct sock *sk;
343	__u32 seq;
344	int err;
345
346	if (skb->len < (iph->ihl << 2) + 8) {
347		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
348		return;
349	}
350
351	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
352			 th->source, inet_iif(skb));
353	if (!sk) {
354		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
355		return;
356	}
357	if (sk->sk_state == TCP_TIME_WAIT) {
358		inet_twsk_put(inet_twsk(sk));
359		return;
360	}
361
362	bh_lock_sock(sk);
363	/* If too many ICMPs get dropped on busy
364	 * servers this needs to be solved differently.
365	 */
366	if (sock_owned_by_user(sk))
367		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
368
369	if (sk->sk_state == TCP_CLOSE)
370		goto out;
371
372	tp = tcp_sk(sk);
373	seq = ntohl(th->seq);
374	if (sk->sk_state != TCP_LISTEN &&
375	    !between(seq, tp->snd_una, tp->snd_nxt)) {
376		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
377		goto out;
378	}
379
380	switch (type) {
381	case ICMP_SOURCE_QUENCH:
382		/* Just silently ignore these. */
383		goto out;
384	case ICMP_PARAMETERPROB:
385		err = EPROTO;
386		break;
387	case ICMP_DEST_UNREACH:
388		if (code > NR_ICMP_UNREACH)
389			goto out;
390
391		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
392			if (!sock_owned_by_user(sk))
393				do_pmtu_discovery(sk, iph, info);
394			goto out;
395		}
396
397		err = icmp_err_convert[code].errno;
398		break;
399	case ICMP_TIME_EXCEEDED:
400		err = EHOSTUNREACH;
401		break;
402	default:
403		goto out;
404	}
405
406	switch (sk->sk_state) {
407		struct request_sock *req, **prev;
408	case TCP_LISTEN:
409		if (sock_owned_by_user(sk))
410			goto out;
411
412		req = inet_csk_search_req(sk, &prev, th->dest,
413					  iph->daddr, iph->saddr);
414		if (!req)
415			goto out;
416
417		/* ICMPs are not backlogged, hence we cannot get
418		   an established socket here.
419		 */
420		BUG_TRAP(!req->sk);
421
422		if (seq != tcp_rsk(req)->snt_isn) {
423			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
424			goto out;
425		}
426
427		/*
428		 * Still in SYN_RECV, just remove it silently.
429		 * There is no good way to pass the error to the newly
430		 * created socket, and POSIX does not want network
431		 * errors returned from accept().
432		 */
433		inet_csk_reqsk_queue_drop(sk, req, prev);
434		goto out;
435
436	case TCP_SYN_SENT:
437	case TCP_SYN_RECV:  /* Cannot happen.
438			       Well, it can, e.g. if SYNs crossed.
439			     */
440		if (!sock_owned_by_user(sk)) {
441			sk->sk_err = err;
442
443			sk->sk_error_report(sk);
444
445			tcp_done(sk);
446		} else {
447			sk->sk_err_soft = err;
448		}
449		goto out;
450	}
451
452	/* If we've already connected we will keep trying
453	 * until we time out, or the user gives up.
454	 *
455 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
456 * to be considered hard errors (well, FRAG_FAILED too,
457 * but it is obsoleted by PMTU discovery).
458 *
459 * Note that in the modern internet, where routing is unreliable
460 * and broken firewalls sit in every dark corner sending random
461 * errors ordered by their masters, even these two messages have finally
462 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
463	 *
464	 * Now we are in compliance with RFCs.
465	 *							--ANK (980905)
466	 */
467
468	inet = inet_sk(sk);
469	if (!sock_owned_by_user(sk) && inet->recverr) {
470		sk->sk_err = err;
471		sk->sk_error_report(sk);
472	} else	{ /* Only an error on timeout */
473		sk->sk_err_soft = err;
474	}
475
476out:
477	bh_unlock_sock(sk);
478	sock_put(sk);
479}
480
481/* This routine computes an IPv4 TCP checksum. */
482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
483{
484	struct inet_sock *inet = inet_sk(sk);
485	struct tcphdr *th = skb->h.th;
486
487	if (skb->ip_summed == CHECKSUM_PARTIAL) {
488		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
489		skb->csum = offsetof(struct tcphdr, check);
490	} else {
491		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
492					 csum_partial((char *)th,
493						      th->doff << 2,
494						      skb->csum));
495	}
496}
497
498int tcp_v4_gso_send_check(struct sk_buff *skb)
499{
500	struct iphdr *iph;
501	struct tcphdr *th;
502
503	if (!pskb_may_pull(skb, sizeof(*th)))
504		return -EINVAL;
505
506	iph = skb->nh.iph;
507	th = skb->h.th;
508
509	th->check = 0;
510	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
511	skb->csum = offsetof(struct tcphdr, check);
512	skb->ip_summed = CHECKSUM_PARTIAL;
513	return 0;
514}
515
516/*
517 *	This routine will send an RST to the other tcp.
518 *
519 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
520 *		      for the reset?
521 *	Answer: if a packet caused the RST, it is not for a socket
522 *		existing in our system; if it is matched to a socket,
523 *		it is just a duplicate segment or a bug in the other side's TCP.
524 *		So we build the reply based only on the parameters
525 *		that arrived with the segment.
526 *	Exception: precedence violation. We do not implement it in any case.
527 */
528
529static void tcp_v4_send_reset(struct sk_buff *skb)
530{
531	struct tcphdr *th = skb->h.th;
532	struct tcphdr rth;
533	struct ip_reply_arg arg;
534
535	/* Never send a reset in response to a reset. */
536	if (th->rst)
537		return;
538
539	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
540		return;
541
542	/* Swap the send and the receive. */
543	memset(&rth, 0, sizeof(struct tcphdr));
544	rth.dest   = th->source;
545	rth.source = th->dest;
546	rth.doff   = sizeof(struct tcphdr) / 4;
547	rth.rst    = 1;
548
549	if (th->ack) {
550		rth.seq = th->ack_seq;
551	} else {
552		rth.ack = 1;
553		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
554				    skb->len - (th->doff << 2));
555	}
556
557	memset(&arg, 0, sizeof arg);
558	arg.iov[0].iov_base = (unsigned char *)&rth;
559	arg.iov[0].iov_len  = sizeof rth;
560	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
561				      skb->nh.iph->saddr, /*XXX*/
562				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
563	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
564
565	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
566
567	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
568	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
569}
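/* Example of the ack_seq computation above (illustrative values): for
 * an incoming SYN with seq = 1000 and no data, syn + fin + payload
 * length is 1, so the RST acknowledges sequence 1001; for a pure data
 * segment with seq = 1000 carrying 100 bytes, it acknowledges 1100.
 */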
570
571/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
572   outside of socket context, is certainly ugly. What can I do?
573 */
574
575static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
576			    u32 win, u32 ts)
577{
578	struct tcphdr *th = skb->h.th;
579	struct {
580		struct tcphdr th;
581		u32 tsopt[TCPOLEN_TSTAMP_ALIGNED >> 2];
582	} rep;
583	struct ip_reply_arg arg;
584
585	memset(&rep.th, 0, sizeof(struct tcphdr));
586	memset(&arg, 0, sizeof arg);
587
588	arg.iov[0].iov_base = (unsigned char *)&rep;
589	arg.iov[0].iov_len  = sizeof(rep.th);
590	if (ts) {
591		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
592				     (TCPOPT_TIMESTAMP << 8) |
593				     TCPOLEN_TIMESTAMP);
594		rep.tsopt[1] = htonl(tcp_time_stamp);
595		rep.tsopt[2] = htonl(ts);
596		arg.iov[0].iov_len = sizeof(rep);
597	}
598
599	/* Swap the send and the receive. */
600	rep.th.dest    = th->source;
601	rep.th.source  = th->dest;
602	rep.th.doff    = arg.iov[0].iov_len / 4;
603	rep.th.seq     = htonl(seq);
604	rep.th.ack_seq = htonl(ack);
605	rep.th.ack     = 1;
606	rep.th.window  = htons(win);
607
608	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
609				      skb->nh.iph->saddr, /*XXX*/
610				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
611	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
612
613	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
614
615	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
616}
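/* Concrete encoding of the timestamp option built above: the first
 * 32-bit word is NOP, NOP, TIMESTAMP, length 10, i.e.
 * (1 << 24) | (1 << 16) | (8 << 8) | 10 == 0x0101080a, followed by
 * TSval (tcp_time_stamp) and TSecr (the peer's ts), padding the option
 * block out to the 12-byte TCPOLEN_TSTAMP_ALIGNED.
 */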
617
618static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
619{
620	struct inet_timewait_sock *tw = inet_twsk(sk);
621	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
622
623	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
624			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
625
626	inet_twsk_put(tw);
627}
628
629static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
630{
631	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
632			req->ts_recent);
633}
634
635/*
636 *	Send a SYN-ACK after having received a SYN.
637 *	This still operates on a request_sock only, not on a big
638 *	socket.
639 */
640static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
641			      struct dst_entry *dst)
642{
643	const struct inet_request_sock *ireq = inet_rsk(req);
644	int err = -1;
645	struct sk_buff * skb;
646
647	/* First, grab a route. */
648	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
649		goto out;
650
651	skb = tcp_make_synack(sk, dst, req);
652
653	if (skb) {
654		struct tcphdr *th = skb->h.th;
655
656		th->check = tcp_v4_check(th, skb->len,
657					 ireq->loc_addr,
658					 ireq->rmt_addr,
659					 csum_partial((char *)th, skb->len,
660						      skb->csum));
661
662		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
663					    ireq->rmt_addr,
664					    ireq->opt);
665		if (err == NET_XMIT_CN)
666			err = 0;
667	}
668
669out:
670	dst_release(dst);
671	return err;
672}
673
674/*
675 *	IPv4 request_sock destructor.
676 */
677static void tcp_v4_reqsk_destructor(struct request_sock *req)
678{
679	kfree(inet_rsk(req)->opt);
680}
681
682#ifdef CONFIG_SYN_COOKIES
683static void syn_flood_warning(struct sk_buff *skb)
684{
685	static unsigned long warntime;
686
687	if (time_after(jiffies, (warntime + HZ * 60))) {
688		warntime = jiffies;
689		printk(KERN_INFO
690		       "possible SYN flooding on port %d. Sending cookies.\n",
691		       ntohs(skb->h.th->dest));
692	}
693}
694#endif
695
696/*
697 * Save and compile IPv4 options into the request_sock if needed.
698 */
699static struct ip_options *tcp_v4_save_options(struct sock *sk,
700					      struct sk_buff *skb)
701{
702	struct ip_options *opt = &(IPCB(skb)->opt);
703	struct ip_options *dopt = NULL;
704
705	if (opt && opt->optlen) {
706		int opt_size = optlength(opt);
707		dopt = kmalloc(opt_size, GFP_ATOMIC);
708		if (dopt) {
709			if (ip_options_echo(dopt, skb)) {
710				kfree(dopt);
711				dopt = NULL;
712			}
713		}
714	}
715	return dopt;
716}
717
718struct request_sock_ops tcp_request_sock_ops = {
719	.family		=	PF_INET,
720	.obj_size	=	sizeof(struct tcp_request_sock),
721	.rtx_syn_ack	=	tcp_v4_send_synack,
722	.send_ack	=	tcp_v4_reqsk_send_ack,
723	.destructor	=	tcp_v4_reqsk_destructor,
724	.send_reset	=	tcp_v4_send_reset,
725};
726
727static struct timewait_sock_ops tcp_timewait_sock_ops = {
728	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
729	.twsk_unique	= tcp_twsk_unique,
730};
731
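/* Rough outline of the SYN handling in tcp_v4_conn_request() below:
 *   1. Drop SYNs addressed to broadcast/multicast.
 *   2. If the SYN queue is full, either switch to syncookies (when
 *      enabled) or drop the SYN.
 *   3. If the accept backlog is full and the SYN queue already holds
 *      young entries, drop the SYN.
 *   4. Otherwise allocate a request_sock, parse the options, pick an
 *      ISN (a cookie, a timestamp-validated value via tw_recycle, or a
 *      fresh secure ISN), send the SYN-ACK and hash the request into
 *      the SYN queue.
 */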
732int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
733{
734	struct inet_request_sock *ireq;
735	struct tcp_options_received tmp_opt;
736	struct request_sock *req;
737	__be32 saddr = skb->nh.iph->saddr;
738	__be32 daddr = skb->nh.iph->daddr;
739	__u32 isn = TCP_SKB_CB(skb)->when;
740	struct dst_entry *dst = NULL;
741#ifdef CONFIG_SYN_COOKIES
742	int want_cookie = 0;
743#else
744#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
745#endif
746
747	/* Never answer SYNs sent to broadcast or multicast addresses. */
748	if (((struct rtable *)skb->dst)->rt_flags &
749	    (RTCF_BROADCAST | RTCF_MULTICAST))
750		goto drop;
751
752	/* TW buckets are converted to open requests without
753	 * limitation; they conserve resources and the peer is
754	 * evidently a real one.
755	 */
756	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
757#ifdef CONFIG_SYN_COOKIES
758		if (sysctl_tcp_syncookies) {
759			want_cookie = 1;
760		} else
761#endif
762		goto drop;
763	}
764
765	/* The accept backlog is full. If we have already queued enough
766	 * warm entries in the syn queue, drop the request. That is better
767	 * than clogging the syn queue with openreqs whose timeouts increase
768	 * exponentially.
769	 */
770	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
771		goto drop;
772
773	req = reqsk_alloc(&tcp_request_sock_ops);
774	if (!req)
775		goto drop;
776
777	tcp_clear_options(&tmp_opt);
778	tmp_opt.mss_clamp = 536;
779	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
780
781	tcp_parse_options(skb, &tmp_opt, 0);
782
783	if (want_cookie) {
784		tcp_clear_options(&tmp_opt);
785		tmp_opt.saw_tstamp = 0;
786	}
787
788	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
789		/* Some OSes (unknown ones, but I see them on a web server
790		 * containing information interesting only to Windows
791		 * users) do not send their timestamp in the SYN. It is an easy
792		 * case: we simply do not advertise TS support.
793		 */
794		tmp_opt.saw_tstamp = 0;
795		tmp_opt.tstamp_ok  = 0;
796	}
797	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
798
799	tcp_openreq_init(req, &tmp_opt, skb);
800
801	if (security_inet_conn_request(sk, skb, req))
802		goto drop_and_free;
803
804	ireq = inet_rsk(req);
805	ireq->loc_addr = daddr;
806	ireq->rmt_addr = saddr;
807	ireq->opt = tcp_v4_save_options(sk, skb);
808	if (!want_cookie)
809		TCP_ECN_create_request(req, skb->h.th);
810
811	if (want_cookie) {
812#ifdef CONFIG_SYN_COOKIES
813		syn_flood_warning(skb);
814#endif
815		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
816	} else if (!isn) {
817		struct inet_peer *peer = NULL;
818
819		/* VJ's idea: we save the last timestamp seen
820		 * from the destination in the peer table when entering
821		 * TIME-WAIT, and check against it before
822		 * accepting a new connection request.
823		 *
824		 * If "isn" is not zero, this request hit a live
825		 * timewait bucket, so all the necessary checks
826		 * are made in the function processing the timewait state.
827		 */
828		if (tmp_opt.saw_tstamp &&
829		    tcp_death_row.sysctl_tw_recycle &&
830		    (dst = inet_csk_route_req(sk, req)) != NULL &&
831		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
832		    peer->v4daddr == saddr) {
833			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
834			    (s32)(peer->tcp_ts - req->ts_recent) >
835							TCP_PAWS_WINDOW) {
836				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
837				dst_release(dst);
838				goto drop_and_free;
839			}
840		}
841		/* Kill the following clause if you dislike this approach. */
842		else if (!sysctl_tcp_syncookies &&
843			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
844			  (sysctl_max_syn_backlog >> 2)) &&
845			 (!peer || !peer->tcp_ts_stamp) &&
846			 (!dst || !dst_metric(dst, RTAX_RTT))) {
847			/* Without syncookies the last quarter of the
848			 * backlog is reserved for destinations
849			 * proven to be alive.
850			 * It means that we continue to communicate only
851			 * with destinations already remembered
852			 * by the time the synflood started.
853			 */
854			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
855				       "request from %u.%u.%u.%u/%u\n",
856				       NIPQUAD(saddr),
857				       ntohs(skb->h.th->source));
858			dst_release(dst);
859			goto drop_and_free;
860		}
861
862		isn = tcp_v4_init_sequence(sk, skb);
863	}
864	tcp_rsk(req)->snt_isn = isn;
865
866	if (tcp_v4_send_synack(sk, req, dst))
867		goto drop_and_free;
868
869	if (want_cookie) {
870	   	reqsk_free(req);
871	} else {
872		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
873	}
874	return 0;
875
876drop_and_free:
877	reqsk_free(req);
878drop:
879	return 0;
880}
881
882
883/*
884 * The three way handshake has completed - we got a valid synack -
885 * now create the new socket.
886 */
887struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
888				  struct request_sock *req,
889				  struct dst_entry *dst)
890{
891	struct inet_request_sock *ireq;
892	struct inet_sock *newinet;
893	struct tcp_sock *newtp;
894	struct sock *newsk;
895
896	if (sk_acceptq_is_full(sk))
897		goto exit_overflow;
898
899	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
900		goto exit;
901
902	newsk = tcp_create_openreq_child(sk, req, skb);
903	if (!newsk)
904		goto exit;
905
906	newsk->sk_gso_type = SKB_GSO_TCPV4;
907	sk_setup_caps(newsk, dst);
908
909	newtp		      = tcp_sk(newsk);
910	newinet		      = inet_sk(newsk);
911	ireq		      = inet_rsk(req);
912	newinet->daddr	      = ireq->rmt_addr;
913	newinet->rcv_saddr    = ireq->loc_addr;
914	newinet->saddr	      = ireq->loc_addr;
915	newinet->opt	      = ireq->opt;
916	ireq->opt	      = NULL;
917	newinet->mc_index     = inet_iif(skb);
918	newinet->mc_ttl	      = skb->nh.iph->ttl;
919	inet_csk(newsk)->icsk_ext_hdr_len = 0;
920	if (newinet->opt)
921		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
922	newinet->id = newtp->write_seq ^ jiffies;
923
924	tcp_mtup_init(newsk);
925	tcp_sync_mss(newsk, dst_mtu(dst));
926	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
927	tcp_initialize_rcv_mss(newsk);
928
929	__inet_hash(&tcp_hashinfo, newsk, 0);
930	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
931
932	return newsk;
933
934exit_overflow:
935	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
936exit:
937	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
938	dst_release(dst);
939	return NULL;
940}
941
942static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
943{
944	struct tcphdr *th = skb->h.th;
945	struct iphdr *iph = skb->nh.iph;
946	struct sock *nsk;
947	struct request_sock **prev;
948	/* Find possible connection requests. */
949	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
950						       iph->saddr, iph->daddr);
951	if (req)
952		return tcp_check_req(sk, skb, req, prev);
953
954	nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
955				      th->source, skb->nh.iph->daddr,
956				      th->dest, inet_iif(skb));
957
958	if (nsk) {
959		if (nsk->sk_state != TCP_TIME_WAIT) {
960			bh_lock_sock(nsk);
961			return nsk;
962		}
963		inet_twsk_put(inet_twsk(nsk));
964		return NULL;
965	}
966
967#ifdef CONFIG_SYN_COOKIES
968	if (!th->rst && !th->syn && th->ack)
969		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
970#endif
971	return sk;
972}
973
974static int tcp_v4_checksum_init(struct sk_buff *skb)
975{
976	if (skb->ip_summed == CHECKSUM_COMPLETE) {
977		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
978				  skb->nh.iph->daddr, skb->csum)) {
979			skb->ip_summed = CHECKSUM_UNNECESSARY;
980			return 0;
981		}
982	}
983
984	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
985				       skb->len, IPPROTO_TCP, 0);
986
987	if (skb->len <= 76) {
988		return __skb_checksum_complete(skb);
989	}
990	return 0;
991}
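/* Note on the 76-byte cutoff above (a heuristic): short packets are
 * fully verified in software right away, since the check is cheap,
 * while for longer packets only the pseudo-header sum is stored in
 * skb->csum and full verification is deferred, typically until the
 * payload is checksummed while being copied to its destination.
 */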
992
993
994/* The socket must have its spinlock held when we get
995 * here.
996 *
997 * We have a potential double-lock case here, so even when
998 * doing backlog processing we use the BH locking scheme.
999 * This is because we cannot sleep with the original spinlock
1000 * held.
1001 */
1002int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1003{
1004	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1005		TCP_CHECK_TIMER(sk);
1006		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1007			goto reset;
1008		TCP_CHECK_TIMER(sk);
1009		return 0;
1010	}
1011
1012	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1013		goto csum_err;
1014
1015	if (sk->sk_state == TCP_LISTEN) {
1016		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1017		if (!nsk)
1018			goto discard;
1019
1020		if (nsk != sk) {
1021			if (tcp_child_process(sk, nsk, skb))
1022				goto reset;
1023			return 0;
1024		}
1025	}
1026
1027	TCP_CHECK_TIMER(sk);
1028	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1029		goto reset;
1030	TCP_CHECK_TIMER(sk);
1031	return 0;
1032
1033reset:
1034	tcp_v4_send_reset(skb);
1035discard:
1036	kfree_skb(skb);
1037	/* Be careful here. If this function gets more complicated and
1038	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1039	 * might be destroyed here. This current version compiles correctly,
1040	 * but you have been warned.
1041	 */
1042	return 0;
1043
1044csum_err:
1045	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1046	goto discard;
1047}
1048
1049/*
1050 *	From tcp_input.c
1051 */
1052
1053int tcp_v4_rcv(struct sk_buff *skb)
1054{
1055	struct tcphdr *th;
1056	struct sock *sk;
1057	int ret;
1058
1059	if (skb->pkt_type != PACKET_HOST)
1060		goto discard_it;
1061
1062	/* Count it even if it's bad */
1063	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1064
1065	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1066		goto discard_it;
1067
1068	th = skb->h.th;
1069
1070	if (th->doff < sizeof(struct tcphdr) / 4)
1071		goto bad_packet;
1072	if (!pskb_may_pull(skb, th->doff * 4))
1073		goto discard_it;
1074
1075	/* An explanation is required here, I think.
1076	 * Packet length and doff are validated by header prediction,
1077	 * provided the case of th->doff == 0 is eliminated.
1078	 * So, we defer the checks. */
1079	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1080	     tcp_v4_checksum_init(skb)))
1081		goto bad_packet;
1082
1083	th = skb->h.th;
1084	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1085	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1086				    skb->len - th->doff * 4);
1087	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1088	TCP_SKB_CB(skb)->when	 = 0;
1089	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1090	TCP_SKB_CB(skb)->sacked	 = 0;
1091
1092	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1093			   skb->nh.iph->daddr, th->dest,
1094			   inet_iif(skb));
1095
1096	if (!sk)
1097		goto no_tcp_socket;
1098
1099process:
1100	if (sk->sk_state == TCP_TIME_WAIT)
1101		goto do_time_wait;
1102
1103	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1104		goto discard_and_relse;
1105	nf_reset(skb);
1106
1107	if (sk_filter(sk, skb))
1108		goto discard_and_relse;
1109
1110	skb->dev = NULL;
1111
1112	bh_lock_sock_nested(sk);
1113	ret = 0;
1114	if (!sock_owned_by_user(sk)) {
1115#ifdef CONFIG_NET_DMA
1116		struct tcp_sock *tp = tcp_sk(sk);
1117		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1118			tp->ucopy.dma_chan = get_softnet_dma();
1119		if (tp->ucopy.dma_chan)
1120			ret = tcp_v4_do_rcv(sk, skb);
1121		else
1122#endif
1123		{
1124			if (!tcp_prequeue(sk, skb))
1125			ret = tcp_v4_do_rcv(sk, skb);
1126		}
1127	} else
1128		sk_add_backlog(sk, skb);
1129	bh_unlock_sock(sk);
1130
1131	sock_put(sk);
1132
1133	return ret;
1134
1135no_tcp_socket:
1136	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1137		goto discard_it;
1138
1139	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1140bad_packet:
1141		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1142	} else {
1143		tcp_v4_send_reset(skb);
1144	}
1145
1146discard_it:
1147	/* Discard frame. */
1148	kfree_skb(skb);
1149  	return 0;
1150
1151discard_and_relse:
1152	sock_put(sk);
1153	goto discard_it;
1154
1155do_time_wait:
1156	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1157		inet_twsk_put(inet_twsk(sk));
1158		goto discard_it;
1159	}
1160
1161	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1162		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1163		inet_twsk_put(inet_twsk(sk));
1164		goto discard_it;
1165	}
1166	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1167	case TCP_TW_SYN: {
1168		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1169							skb->nh.iph->daddr,
1170							th->dest,
1171							inet_iif(skb));
1172		if (sk2) {
1173			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1174			inet_twsk_put(inet_twsk(sk));
1175			sk = sk2;
1176			goto process;
1177		}
1178		/* Fall through to ACK */
1179	}
1180	case TCP_TW_ACK:
1181		tcp_v4_timewait_ack(sk, skb);
1182		break;
1183	case TCP_TW_RST:
1184		goto no_tcp_socket;
1185	case TCP_TW_SUCCESS:;
1186	}
1187	goto discard_it;
1188}
1189
1190/* VJ's idea: save the last timestamp seen from this destination
1191 * and hold it for at least the normal timewait interval, to use for duplicate
1192 * segment detection in subsequent connections before they enter the
1193 * synchronized state.
1194 */
1195
1196int tcp_v4_remember_stamp(struct sock *sk)
1197{
1198	struct inet_sock *inet = inet_sk(sk);
1199	struct tcp_sock *tp = tcp_sk(sk);
1200	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1201	struct inet_peer *peer = NULL;
1202	int release_it = 0;
1203
1204	if (!rt || rt->rt_dst != inet->daddr) {
1205		peer = inet_getpeer(inet->daddr, 1);
1206		release_it = 1;
1207	} else {
1208		if (!rt->peer)
1209			rt_bind_peer(rt, 1);
1210		peer = rt->peer;
1211	}
1212
1213	if (peer) {
1214		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1215		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1216		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1217			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1218			peer->tcp_ts = tp->rx_opt.ts_recent;
1219		}
1220		if (release_it)
1221			inet_putpeer(peer);
1222		return 1;
1223	}
1224
1225	return 0;
1226}
1227
1228int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1229{
1230	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1231
1232	if (peer) {
1233		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1234
1235		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1236		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1237		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1238			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1239			peer->tcp_ts	   = tcptw->tw_ts_recent;
1240		}
1241		inet_putpeer(peer);
1242		return 1;
1243	}
1244
1245	return 0;
1246}
1247
1248struct inet_connection_sock_af_ops ipv4_specific = {
1249	.queue_xmit	   = ip_queue_xmit,
1250	.send_check	   = tcp_v4_send_check,
1251	.rebuild_header	   = inet_sk_rebuild_header,
1252	.conn_request	   = tcp_v4_conn_request,
1253	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1254	.remember_stamp	   = tcp_v4_remember_stamp,
1255	.net_header_len	   = sizeof(struct iphdr),
1256	.setsockopt	   = ip_setsockopt,
1257	.getsockopt	   = ip_getsockopt,
1258	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1259	.sockaddr_len	   = sizeof(struct sockaddr_in),
1260#ifdef CONFIG_COMPAT
1261	.compat_setsockopt = compat_ip_setsockopt,
1262	.compat_getsockopt = compat_ip_getsockopt,
1263#endif
1264};
1265
1266/* NOTE: A lot of things are set to zero explicitly by the call to
1267 *       sk_alloc(), so they need not be done here.
1268 */
1269static int tcp_v4_init_sock(struct sock *sk)
1270{
1271	struct inet_connection_sock *icsk = inet_csk(sk);
1272	struct tcp_sock *tp = tcp_sk(sk);
1273
1274	skb_queue_head_init(&tp->out_of_order_queue);
1275	tcp_init_xmit_timers(sk);
1276	tcp_prequeue_init(tp);
1277
1278	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1279	tp->mdev = TCP_TIMEOUT_INIT;
1280
1281	/* So many TCP implementations out there (incorrectly) count the
1282	 * initial SYN frame in their delayed-ACK and congestion control
1283	 * algorithms that we must have the following bandaid to talk
1284	 * efficiently to them.  -DaveM
1285	 */
1286	tp->snd_cwnd = 2;
1287
1288	/* See draft-stevens-tcpca-spec-01 for discussion of the
1289	 * initialization of these values.
1290	 */
1291	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1292	tp->snd_cwnd_clamp = ~0;
1293	tp->mss_cache = 536;
1294
1295	tp->reordering = sysctl_tcp_reordering;
1296	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1297
1298	sk->sk_state = TCP_CLOSE;
1299
1300	sk->sk_write_space = sk_stream_write_space;
1301	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1302
1303	icsk->icsk_af_ops = &ipv4_specific;
1304	icsk->icsk_sync_mss = tcp_sync_mss;
1305
1306	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1307	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1308
1309	atomic_inc(&tcp_sockets_allocated);
1310
1311	return 0;
1312}
1313
1314int tcp_v4_destroy_sock(struct sock *sk)
1315{
1316	struct tcp_sock *tp = tcp_sk(sk);
1317
1318	tcp_clear_xmit_timers(sk);
1319
1320	tcp_cleanup_congestion_control(sk);
1321
1322	/* Clean up the write buffer. */
1323  	sk_stream_writequeue_purge(sk);
1324
1325	/* Cleans up our, hopefully empty, out_of_order_queue. */
1326  	__skb_queue_purge(&tp->out_of_order_queue);
1327
1328#ifdef CONFIG_NET_DMA
1329	/* Cleans up our sk_async_wait_queue */
1330  	__skb_queue_purge(&sk->sk_async_wait_queue);
1331#endif
1332
1333	/* Clean the prequeue; it really must be empty. */
1334	__skb_queue_purge(&tp->ucopy.prequeue);
1335
1336	/* Clean up a referenced TCP bind bucket. */
1337	if (inet_csk(sk)->icsk_bind_hash)
1338		inet_put_port(&tcp_hashinfo, sk);
1339
1340	/*
1341	 * If sendmsg cached page exists, toss it.
1342	 */
1343	if (sk->sk_sndmsg_page) {
1344		__free_page(sk->sk_sndmsg_page);
1345		sk->sk_sndmsg_page = NULL;
1346	}
1347
1348	atomic_dec(&tcp_sockets_allocated);
1349
1350	return 0;
1351}
1352
1353EXPORT_SYMBOL(tcp_v4_destroy_sock);
1354
1355#ifdef CONFIG_PROC_FS
1356/* Proc filesystem TCP sock list dumping. */
1357
1358static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1359{
1360	return hlist_empty(head) ? NULL :
1361		list_entry(head->first, struct inet_timewait_sock, tw_node);
1362}
1363
1364static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1365{
1366	return tw->tw_node.next ?
1367		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1368}
1369
1370static void *listening_get_next(struct seq_file *seq, void *cur)
1371{
1372	struct inet_connection_sock *icsk;
1373	struct hlist_node *node;
1374	struct sock *sk = cur;
1375	struct tcp_iter_state* st = seq->private;
1376
1377	if (!sk) {
1378		st->bucket = 0;
1379		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1380		goto get_sk;
1381	}
1382
1383	++st->num;
1384
1385	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1386		struct request_sock *req = cur;
1387
1388	       	icsk = inet_csk(st->syn_wait_sk);
1389		req = req->dl_next;
1390		while (1) {
1391			while (req) {
1392				if (req->rsk_ops->family == st->family) {
1393					cur = req;
1394					goto out;
1395				}
1396				req = req->dl_next;
1397			}
1398			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1399				break;
1400get_req:
1401			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1402		}
1403		sk	  = sk_next(st->syn_wait_sk);
1404		st->state = TCP_SEQ_STATE_LISTENING;
1405		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1406	} else {
1407	       	icsk = inet_csk(sk);
1408		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1409		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1410			goto start_req;
1411		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1412		sk = sk_next(sk);
1413	}
1414get_sk:
1415	sk_for_each_from(sk, node) {
1416		if (sk->sk_family == st->family) {
1417			cur = sk;
1418			goto out;
1419		}
1420	       	icsk = inet_csk(sk);
1421		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1422		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1423start_req:
1424			st->uid		= sock_i_uid(sk);
1425			st->syn_wait_sk = sk;
1426			st->state	= TCP_SEQ_STATE_OPENREQ;
1427			st->sbucket	= 0;
1428			goto get_req;
1429		}
1430		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1431	}
1432	if (++st->bucket < INET_LHTABLE_SIZE) {
1433		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1434		goto get_sk;
1435	}
1436	cur = NULL;
1437out:
1438	return cur;
1439}
1440
1441static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1442{
1443	void *rc = listening_get_next(seq, NULL);
1444
1445	while (rc && *pos) {
1446		rc = listening_get_next(seq, rc);
1447		--*pos;
1448	}
1449	return rc;
1450}
1451
1452static void *established_get_first(struct seq_file *seq)
1453{
1454	struct tcp_iter_state* st = seq->private;
1455	void *rc = NULL;
1456
1457	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1458		struct sock *sk;
1459		struct hlist_node *node;
1460		struct inet_timewait_sock *tw;
1461
1462		/* We can reschedule _before_ having picked the target: */
1463		cond_resched_softirq();
1464
1465		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1466		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1467			if (sk->sk_family != st->family) {
1468				continue;
1469			}
1470			rc = sk;
1471			goto out;
1472		}
1473		st->state = TCP_SEQ_STATE_TIME_WAIT;
1474		inet_twsk_for_each(tw, node,
1475				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1476			if (tw->tw_family != st->family) {
1477				continue;
1478			}
1479			rc = tw;
1480			goto out;
1481		}
1482		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1483		st->state = TCP_SEQ_STATE_ESTABLISHED;
1484	}
1485out:
1486	return rc;
1487}
1488
1489static void *established_get_next(struct seq_file *seq, void *cur)
1490{
1491	struct sock *sk = cur;
1492	struct inet_timewait_sock *tw;
1493	struct hlist_node *node;
1494	struct tcp_iter_state* st = seq->private;
1495
1496	++st->num;
1497
1498	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1499		tw = cur;
1500		tw = tw_next(tw);
1501get_tw:
1502		while (tw && tw->tw_family != st->family) {
1503			tw = tw_next(tw);
1504		}
1505		if (tw) {
1506			cur = tw;
1507			goto out;
1508		}
1509		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1510		st->state = TCP_SEQ_STATE_ESTABLISHED;
1511
1512		/* We can reschedule between buckets: */
1513		cond_resched_softirq();
1514
1515		if (++st->bucket < tcp_hashinfo.ehash_size) {
1516			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1517			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1518		} else {
1519			cur = NULL;
1520			goto out;
1521		}
1522	} else
1523		sk = sk_next(sk);
1524
1525	sk_for_each_from(sk, node) {
1526		if (sk->sk_family == st->family)
1527			goto found;
1528	}
1529
1530	st->state = TCP_SEQ_STATE_TIME_WAIT;
1531	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1532	goto get_tw;
1533found:
1534	cur = sk;
1535out:
1536	return cur;
1537}
1538
1539static void *established_get_idx(struct seq_file *seq, loff_t pos)
1540{
1541	void *rc = established_get_first(seq);
1542
1543	while (rc && pos) {
1544		rc = established_get_next(seq, rc);
1545		--pos;
1546	}
1547	return rc;
1548}
1549
1550static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1551{
1552	void *rc;
1553	struct tcp_iter_state* st = seq->private;
1554
1555	inet_listen_lock(&tcp_hashinfo);
1556	st->state = TCP_SEQ_STATE_LISTENING;
1557	rc	  = listening_get_idx(seq, &pos);
1558
1559	if (!rc) {
1560		inet_listen_unlock(&tcp_hashinfo);
1561		local_bh_disable();
1562		st->state = TCP_SEQ_STATE_ESTABLISHED;
1563		rc	  = established_get_idx(seq, pos);
1564	}
1565
1566	return rc;
1567}
1568
1569static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1570{
1571	struct tcp_iter_state* st = seq->private;
1572	st->state = TCP_SEQ_STATE_LISTENING;
1573	st->num = 0;
1574	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1575}
1576
1577static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1578{
1579	void *rc = NULL;
1580	struct tcp_iter_state* st;
1581
1582	if (v == SEQ_START_TOKEN) {
1583		rc = tcp_get_idx(seq, 0);
1584		goto out;
1585	}
1586	st = seq->private;
1587
1588	switch (st->state) {
1589	case TCP_SEQ_STATE_OPENREQ:
1590	case TCP_SEQ_STATE_LISTENING:
1591		rc = listening_get_next(seq, v);
1592		if (!rc) {
1593			inet_listen_unlock(&tcp_hashinfo);
1594			local_bh_disable();
1595			st->state = TCP_SEQ_STATE_ESTABLISHED;
1596			rc	  = established_get_first(seq);
1597		}
1598		break;
1599	case TCP_SEQ_STATE_ESTABLISHED:
1600	case TCP_SEQ_STATE_TIME_WAIT:
1601		rc = established_get_next(seq, v);
1602		break;
1603	}
1604out:
1605	++*pos;
1606	return rc;
1607}
1608
1609static void tcp_seq_stop(struct seq_file *seq, void *v)
1610{
1611	struct tcp_iter_state* st = seq->private;
1612
1613	switch (st->state) {
1614	case TCP_SEQ_STATE_OPENREQ:
1615		if (v) {
1616			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1617			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1618		}
1619	case TCP_SEQ_STATE_LISTENING:
1620		if (v != SEQ_START_TOKEN)
1621			inet_listen_unlock(&tcp_hashinfo);
1622		break;
1623	case TCP_SEQ_STATE_TIME_WAIT:
1624	case TCP_SEQ_STATE_ESTABLISHED:
1625		if (v)
1626			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1627		local_bh_enable();
1628		break;
1629	}
1630}
1631
1632static int tcp_seq_open(struct inode *inode, struct file *file)
1633{
1634	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1635	struct seq_file *seq;
1636	struct tcp_iter_state *s;
1637	int rc;
1638
1639	if (unlikely(afinfo == NULL))
1640		return -EINVAL;
1641
1642	s = kzalloc(sizeof(*s), GFP_KERNEL);
1643	if (!s)
1644		return -ENOMEM;
1645	s->family		= afinfo->family;
1646	s->seq_ops.start	= tcp_seq_start;
1647	s->seq_ops.next		= tcp_seq_next;
1648	s->seq_ops.show		= afinfo->seq_show;
1649	s->seq_ops.stop		= tcp_seq_stop;
1650
1651	rc = seq_open(file, &s->seq_ops);
1652	if (rc)
1653		goto out_kfree;
1654	seq	     = file->private_data;
1655	seq->private = s;
1656out:
1657	return rc;
1658out_kfree:
1659	kfree(s);
1660	goto out;
1661}
1662
1663int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1664{
1665	int rc = 0;
1666	struct proc_dir_entry *p;
1667
1668	if (!afinfo)
1669		return -EINVAL;
1670	afinfo->seq_fops->owner		= afinfo->owner;
1671	afinfo->seq_fops->open		= tcp_seq_open;
1672	afinfo->seq_fops->read		= seq_read;
1673	afinfo->seq_fops->llseek	= seq_lseek;
1674	afinfo->seq_fops->release	= seq_release_private;
1675
1676	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1677	if (p)
1678		p->data = afinfo;
1679	else
1680		rc = -ENOMEM;
1681	return rc;
1682}
1683
1684void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1685{
1686	if (!afinfo)
1687		return;
1688	proc_net_remove(afinfo->name);
1689	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1690}
1691
1692static void get_openreq4(struct sock *sk, struct request_sock *req,
1693			 char *tmpbuf, int i, int uid)
1694{
1695	const struct inet_request_sock *ireq = inet_rsk(req);
1696	int ttd = req->expires - jiffies;
1697
1698	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1699		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1700		i,
1701		ireq->loc_addr,
1702		ntohs(inet_sk(sk)->sport),
1703		ireq->rmt_addr,
1704		ntohs(ireq->rmt_port),
1705		TCP_SYN_RECV,
1706		0, 0, /* could print option size, but that is af dependent. */
1707		1,    /* timers active (only the expire timer) */
1708		jiffies_to_clock_t(ttd),
1709		req->retrans,
1710		uid,
1711		0,  /* non standard timer */
1712		0, /* open_requests have no inode */
1713		atomic_read(&sk->sk_refcnt),
1714		req);
1715}
1716
1717static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1718{
1719	int timer_active;
1720	unsigned long timer_expires;
1721	struct tcp_sock *tp = tcp_sk(sp);
1722	const struct inet_connection_sock *icsk = inet_csk(sp);
1723	struct inet_sock *inet = inet_sk(sp);
1724	unsigned int dest = inet->daddr;
1725	unsigned int src = inet->rcv_saddr;
1726	__u16 destp = ntohs(inet->dport);
1727	__u16 srcp = ntohs(inet->sport);
1728
1729	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1730		timer_active	= 1;
1731		timer_expires	= icsk->icsk_timeout;
1732	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1733		timer_active	= 4;
1734		timer_expires	= icsk->icsk_timeout;
1735	} else if (timer_pending(&sp->sk_timer)) {
1736		timer_active	= 2;
1737		timer_expires	= sp->sk_timer.expires;
1738	} else {
1739		timer_active	= 0;
1740		timer_expires = jiffies;
1741	}
1742
1743	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1744			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1745		i, src, srcp, dest, destp, sp->sk_state,
1746		tp->write_seq - tp->snd_una,
1747		(sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
1748		timer_active,
1749		jiffies_to_clock_t(timer_expires - jiffies),
1750		icsk->icsk_retransmits,
1751		sock_i_uid(sp),
1752		icsk->icsk_probes_out,
1753		sock_i_ino(sp),
1754		atomic_read(&sp->sk_refcnt), sp,
1755		icsk->icsk_rto,
1756		icsk->icsk_ack.ato,
1757		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1758		tp->snd_cwnd,
1759		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1760}
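/* An illustrative /proc/net/tcp line produced by the format above
 * (the field values are made up): a socket listening on 127.0.0.1:22
 * could appear roughly as
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c1a2b3c4 300 40 0 2 -1
 *
 * i.e. slot, local addr:port and remote addr:port in hex, state (0x0A
 * is TCP_LISTEN), tx_queue:rx_queue, timer info, retransmits, uid,
 * probes, inode, refcount, socket pointer, rto, ato, quick/pingpong,
 * cwnd and ssthresh (-1 while ssthresh is still "infinite").
 */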
1761
1762static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1763{
1764	__be32 dest, src;
1765	__u16 destp, srcp;
1766	int ttd = tw->tw_ttd - jiffies;
1767
1768	if (ttd < 0)
1769		ttd = 0;
1770
1771	dest  = tw->tw_daddr;
1772	src   = tw->tw_rcv_saddr;
1773	destp = ntohs(tw->tw_dport);
1774	srcp  = ntohs(tw->tw_sport);
1775
1776	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1777		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1778		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1779		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1780		atomic_read(&tw->tw_refcnt), tw);
1781}
1782
1783#define TMPSZ 150
1784
1785static int tcp4_seq_show(struct seq_file *seq, void *v)
1786{
1787	struct tcp_iter_state* st;
1788	char tmpbuf[TMPSZ + 1];
1789
1790	if (v == SEQ_START_TOKEN) {
1791		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1792			   "  sl  local_address rem_address   st tx_queue "
1793			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1794			   "inode");
1795		goto out;
1796	}
1797	st = seq->private;
1798
1799	switch (st->state) {
1800	case TCP_SEQ_STATE_LISTENING:
1801	case TCP_SEQ_STATE_ESTABLISHED:
1802		get_tcp4_sock(v, tmpbuf, st->num);
1803		break;
1804	case TCP_SEQ_STATE_OPENREQ:
1805		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1806		break;
1807	case TCP_SEQ_STATE_TIME_WAIT:
1808		get_timewait4_sock(v, tmpbuf, st->num);
1809		break;
1810	}
1811	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1812out:
1813	return 0;
1814}
1815
1816static struct file_operations tcp4_seq_fops;
1817static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1818	.owner		= THIS_MODULE,
1819	.name		= "tcp",
1820	.family		= AF_INET,
1821	.seq_show	= tcp4_seq_show,
1822	.seq_fops	= &tcp4_seq_fops,
1823};
1824
1825int __init tcp4_proc_init(void)
1826{
1827	return tcp_proc_register(&tcp4_seq_afinfo);
1828}
1829
1830void tcp4_proc_exit(void)
1831{
1832	tcp_proc_unregister(&tcp4_seq_afinfo);
1833}
1834#endif /* CONFIG_PROC_FS */
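/* Minimal userspace sketch (an illustration only, not part of the
 * kernel build) of how the /proc/net/tcp table registered above can be
 * consumed; it simply dumps each line and could be extended to decode
 * the hex address fields emitted by get_tcp4_sock():
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f) {
 *			perror("fopen");
 *			return 1;
 *		}
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */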
1835
1836struct proto tcp_prot = {
1837	.name			= "TCP",
1838	.owner			= THIS_MODULE,
1839	.close			= tcp_close,
1840	.connect		= tcp_v4_connect,
1841	.disconnect		= tcp_disconnect,
1842	.accept			= inet_csk_accept,
1843	.ioctl			= tcp_ioctl,
1844	.init			= tcp_v4_init_sock,
1845	.destroy		= tcp_v4_destroy_sock,
1846	.shutdown		= tcp_shutdown,
1847	.setsockopt		= tcp_setsockopt,
1848	.getsockopt		= tcp_getsockopt,
1849	.sendmsg		= tcp_sendmsg,
1850	.recvmsg		= tcp_recvmsg,
1851	.backlog_rcv		= tcp_v4_do_rcv,
1852	.hash			= tcp_v4_hash,
1853	.unhash			= tcp_unhash,
1854	.get_port		= tcp_v4_get_port,
1855	.enter_memory_pressure	= tcp_enter_memory_pressure,
1856	.sockets_allocated	= &tcp_sockets_allocated,
1857	.orphan_count		= &tcp_orphan_count,
1858	.memory_allocated	= &tcp_memory_allocated,
1859	.memory_pressure	= &tcp_memory_pressure,
1860	.sysctl_mem		= sysctl_tcp_mem,
1861	.sysctl_wmem		= sysctl_tcp_wmem,
1862	.sysctl_rmem		= sysctl_tcp_rmem,
1863	.max_header		= MAX_TCP_HEADER,
1864	.obj_size		= sizeof(struct tcp_sock),
1865	.twsk_prot		= &tcp_timewait_sock_ops,
1866	.rsk_prot		= &tcp_request_sock_ops,
1867#ifdef CONFIG_COMPAT
1868	.compat_setsockopt	= compat_tcp_setsockopt,
1869	.compat_getsockopt	= compat_tcp_getsockopt,
1870#endif
1871};
1872
1873void __init tcp_v4_init(struct net_proto_family *ops)
1874{
1875	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, IPPROTO_TCP) < 0)
1876		panic("Failed to create the TCP control socket.\n");
1877}
1878
1879EXPORT_SYMBOL(ipv4_specific);
1880EXPORT_SYMBOL(tcp_hashinfo);
1881EXPORT_SYMBOL(tcp_prot);
1882EXPORT_SYMBOL(tcp_unhash);
1883EXPORT_SYMBOL(tcp_v4_conn_request);
1884EXPORT_SYMBOL(tcp_v4_connect);
1885EXPORT_SYMBOL(tcp_v4_do_rcv);
1886EXPORT_SYMBOL(tcp_v4_remember_stamp);
1887EXPORT_SYMBOL(tcp_v4_send_check);
1888EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1889
1890#ifdef CONFIG_PROC_FS
1891EXPORT_SYMBOL(tcp_proc_register);
1892EXPORT_SYMBOL(tcp_proc_unregister);
1893#endif
1894EXPORT_SYMBOL(sysctl_local_port_range);
1895EXPORT_SYMBOL(sysctl_tcp_low_latency);
1896
1897