tcp_ipv4.c revision 3687b1dc6fe83a500ba4d3235704594f6a111a2d
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
47 *					coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81int sysctl_tcp_tw_reuse;
82int sysctl_tcp_low_latency;
83
84/* Check TCP sequence numbers in ICMP packets. */
85#define ICMP_MIN_LENGTH 8
86
87/* Socket used for sending RSTs */
88static struct socket *tcp_socket;
89
90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93	.lhash_lock	= __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
94	.lhash_users	= ATOMIC_INIT(0),
95	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96};
97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{
100	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101				 inet_csk_bind_conflict);
102}
103
104static void tcp_v4_hash(struct sock *sk)
105{
106	inet_hash(&tcp_hashinfo, sk);
107}
108
109void tcp_unhash(struct sock *sk)
110{
111	inet_unhash(&tcp_hashinfo, sk);
112}
113
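/*
 * Derive the initial sequence number for this connection from a
 * secure hash over the addresses and ports of the received segment.
 */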
114static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
115{
116	return secure_tcp_sequence_number(skb->nh.iph->daddr,
117					  skb->nh.iph->saddr,
118					  skb->h.th->dest,
119					  skb->h.th->source);
120}
121
122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123{
124	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125	struct tcp_sock *tp = tcp_sk(sk);
126
127	/* With PAWS, it is safe from the viewpoint
128	   of data integrity. Even without PAWS it is safe provided sequence
129	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
130
131	   Actually, the idea is close to VJ's one, only timestamp cache is
132	   held not per host, but per port pair and TW bucket is used as state
133	   holder.
134
135	   If the TW bucket has already been destroyed we fall back to VJ's
136	   scheme and use the initial timestamp retrieved from the peer table.
137	 */
138	if (tcptw->tw_ts_recent_stamp &&
139	    (twp == NULL || (sysctl_tcp_tw_reuse &&
140			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
141		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
142		if (tp->write_seq == 0)
143			tp->write_seq = 1;
144		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
145		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
146		sock_hold(sktw);
147		return 1;
148	}
149
150	return 0;
151}
152
153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154
155/* This will initiate an outgoing connection. */
156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157{
158	struct inet_sock *inet = inet_sk(sk);
159	struct tcp_sock *tp = tcp_sk(sk);
160	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161	struct rtable *rt;
162	u32 daddr, nexthop;
163	int tmp;
164	int err;
165
166	if (addr_len < sizeof(struct sockaddr_in))
167		return -EINVAL;
168
169	if (usin->sin_family != AF_INET)
170		return -EAFNOSUPPORT;
171
172	nexthop = daddr = usin->sin_addr.s_addr;
173	if (inet->opt && inet->opt->srr) {
174		if (!daddr)
175			return -EINVAL;
176		nexthop = inet->opt->faddr;
177	}
178
179	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
180			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181			       IPPROTO_TCP,
182			       inet->sport, usin->sin_port, sk);
183	if (tmp < 0)
184		return tmp;
185
186	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187		ip_rt_put(rt);
188		return -ENETUNREACH;
189	}
190
191	if (!inet->opt || !inet->opt->srr)
192		daddr = rt->rt_dst;
193
194	if (!inet->saddr)
195		inet->saddr = rt->rt_src;
196	inet->rcv_saddr = inet->saddr;
197
198	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199		/* Reset inherited state */
200		tp->rx_opt.ts_recent	   = 0;
201		tp->rx_opt.ts_recent_stamp = 0;
202		tp->write_seq		   = 0;
203	}
204
205	if (tcp_death_row.sysctl_tw_recycle &&
206	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207		struct inet_peer *peer = rt_get_peer(rt);
208
209		/* VJ's idea. We save the last timestamp seen from
210		 * the destination in the peer table when entering state TIME-WAIT,
211		 * and initialize rx_opt.ts_recent from it when trying a new connection.
212		 */
213
214		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
215			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216			tp->rx_opt.ts_recent = peer->tcp_ts;
217		}
218	}
219
220	inet->dport = usin->sin_port;
221	inet->daddr = daddr;
222
223	inet_csk(sk)->icsk_ext_hdr_len = 0;
224	if (inet->opt)
225		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226
227	tp->rx_opt.mss_clamp = 536;
228
229	/* Socket identity is still unknown (sport may be zero).
230	 * However, we set the state to SYN-SENT and, without releasing the
231	 * socket lock, select a source port, enter ourselves into the hash
232	 * tables and complete initialization after this.
233	 */
234	tcp_set_state(sk, TCP_SYN_SENT);
235	err = inet_hash_connect(&tcp_death_row, sk);
236	if (err)
237		goto failure;
238
239	err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
240	if (err)
241		goto failure;
242
243	/* OK, now commit destination to socket.  */
244	sk->sk_gso_type = SKB_GSO_TCPV4;
245	sk_setup_caps(sk, &rt->u.dst);
246
247	if (!tp->write_seq)
248		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
249							   inet->daddr,
250							   inet->sport,
251							   usin->sin_port);
252
253	inet->id = tp->write_seq ^ jiffies;
254
255	err = tcp_connect(sk);
256	rt = NULL;
257	if (err)
258		goto failure;
259
260	return 0;
261
262failure:
263	/* This unhashes the socket and releases the local port, if necessary. */
264	tcp_set_state(sk, TCP_CLOSE);
265	ip_rt_put(rt);
266	sk->sk_route_caps = 0;
267	inet->dport = 0;
268	return err;
269}
270
271/*
272 * This routine does path mtu discovery as defined in RFC1191.
273 */
274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275{
276	struct dst_entry *dst;
277	struct inet_sock *inet = inet_sk(sk);
278
279	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280	 * sent out by Linux are always < 576 bytes, so they should go through
281	 * unfragmented).
282	 */
283	if (sk->sk_state == TCP_LISTEN)
284		return;
285
286	/* We don't check in the dst entry if pmtu discovery is forbidden
287	 * on this route. We just assume that no packet-too-big packets
288	 * are sent back when pmtu discovery is not active.
289	 * There is a small race when the user changes this flag in the
290	 * route, but I think that's acceptable.
291	 */
292	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293		return;
294
295	dst->ops->update_pmtu(dst, mtu);
296
297	/* Something is about to go wrong... Remember the soft error
298	 * in case this connection is not able to recover.
299	 */
300	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301		sk->sk_err_soft = EMSGSIZE;
302
303	mtu = dst_mtu(dst);
304
305	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307		tcp_sync_mss(sk, mtu);
308
309		/* Resend the TCP packet because it's
310		 * clear that the old packet has been
311		 * dropped. This is the new "fast" path mtu
312		 * discovery.
313		 */
314		tcp_simple_retransmit(sk);
315	} /* else let the usual retransmit timer handle it */
316}
317
318/*
319 * This routine is called by the ICMP module when it gets some
320 * sort of error condition.  If err < 0 then the socket should
321 * be closed and the error returned to the user.  If err > 0
322 * it's just the icmp type << 8 | icmp code.  After adjustment
323 * header points to the first 8 bytes of the tcp header.  We need
324 * to find the appropriate port.
325 *
326 * The locking strategy used here is very "optimistic". When
327 * someone else accesses the socket the ICMP is just dropped
328 * and for some paths there is no check at all.
329 * A more general error queue to queue errors for later handling
330 * is probably better.
331 *
332 */
333
334void tcp_v4_err(struct sk_buff *skb, u32 info)
335{
336	struct iphdr *iph = (struct iphdr *)skb->data;
337	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338	struct tcp_sock *tp;
339	struct inet_sock *inet;
340	int type = skb->h.icmph->type;
341	int code = skb->h.icmph->code;
342	struct sock *sk;
343	__u32 seq;
344	int err;
345
346	if (skb->len < (iph->ihl << 2) + 8) {
347		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
348		return;
349	}
350
351	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
352			 th->source, inet_iif(skb));
353	if (!sk) {
354		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
355		return;
356	}
357	if (sk->sk_state == TCP_TIME_WAIT) {
358		inet_twsk_put((struct inet_timewait_sock *)sk);
359		return;
360	}
361
362	bh_lock_sock(sk);
363	/* If too many ICMPs get dropped on busy
364	 * servers this needs to be solved differently.
365	 */
366	if (sock_owned_by_user(sk))
367		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
368
369	if (sk->sk_state == TCP_CLOSE)
370		goto out;
371
372	tp = tcp_sk(sk);
373	seq = ntohl(th->seq);
374	if (sk->sk_state != TCP_LISTEN &&
375	    !between(seq, tp->snd_una, tp->snd_nxt)) {
376		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
377		goto out;
378	}
379
380	switch (type) {
381	case ICMP_SOURCE_QUENCH:
382		/* Just silently ignore these. */
383		goto out;
384	case ICMP_PARAMETERPROB:
385		err = EPROTO;
386		break;
387	case ICMP_DEST_UNREACH:
388		if (code > NR_ICMP_UNREACH)
389			goto out;
390
391		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
392			if (!sock_owned_by_user(sk))
393				do_pmtu_discovery(sk, iph, info);
394			goto out;
395		}
396
397		err = icmp_err_convert[code].errno;
398		break;
399	case ICMP_TIME_EXCEEDED:
400		err = EHOSTUNREACH;
401		break;
402	default:
403		goto out;
404	}
405
406	switch (sk->sk_state) {
407		struct request_sock *req, **prev;
408	case TCP_LISTEN:
409		if (sock_owned_by_user(sk))
410			goto out;
411
412		req = inet_csk_search_req(sk, &prev, th->dest,
413					  iph->daddr, iph->saddr);
414		if (!req)
415			goto out;
416
417		/* ICMPs are not backlogged, hence we cannot get
418		   an established socket here.
419		 */
420		BUG_TRAP(!req->sk);
421
422		if (seq != tcp_rsk(req)->snt_isn) {
423			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
424			goto out;
425		}
426
427		/*
428		 * Still in SYN_RECV, just remove it silently.
429		 * There is no good way to pass the error to the newly
430		 * created socket, and POSIX does not want network
431		 * errors returned from accept().
432		 */
433		inet_csk_reqsk_queue_drop(sk, req, prev);
434		goto out;
435
436	case TCP_SYN_SENT:
437	case TCP_SYN_RECV:  /* Cannot happen normally;
438			       it can, e.g., if SYNs crossed.
439			     */
440		if (!sock_owned_by_user(sk)) {
441			sk->sk_err = err;
442
443			sk->sk_error_report(sk);
444
445			tcp_done(sk);
446		} else {
447			sk->sk_err_soft = err;
448		}
449		goto out;
450	}
451
452	/* If we've already connected we will keep trying
453	 * until we time out, or the user gives up.
454	 *
455	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
456	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
457	 * but it is obsoleted by pmtu discovery).
458	 *
459	 * Note that in the modern internet, where routing is unreliable
460	 * and broken firewalls sit in every dark corner sending random
461	 * errors ordered by their masters, even these two messages finally
462	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
463	 *
464	 * Now we are in compliance with RFCs.
465	 *							--ANK (980905)
466	 */
467
468	inet = inet_sk(sk);
469	if (!sock_owned_by_user(sk) && inet->recverr) {
470		sk->sk_err = err;
471		sk->sk_error_report(sk);
472	} else { /* Only an error on timeout */
473		sk->sk_err_soft = err;
474	}
475
476out:
477	bh_unlock_sock(sk);
478	sock_put(sk);
479}
480
481/* This routine computes an IPv4 TCP checksum. */
482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
483{
484	struct inet_sock *inet = inet_sk(sk);
485	struct tcphdr *th = skb->h.th;
486
487	if (skb->ip_summed == CHECKSUM_HW) {
488		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
489		skb->csum = offsetof(struct tcphdr, check);
490	} else {
491		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
492					 csum_partial((char *)th,
493						      th->doff << 2,
494						      skb->csum));
495	}
496}
497
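/*
 * Prepare a GSO skb for checksum offload: seed th->check with the
 * complemented pseudo-header checksum and record where the device
 * (or the software GSO path) should place the final checksum.
 */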
498int tcp_v4_gso_send_check(struct sk_buff *skb)
499{
500	struct iphdr *iph;
501	struct tcphdr *th;
502
503	if (!pskb_may_pull(skb, sizeof(*th)))
504		return -EINVAL;
505
506	iph = skb->nh.iph;
507	th = skb->h.th;
508
509	th->check = 0;
510	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
511	skb->csum = offsetof(struct tcphdr, check);
512	skb->ip_summed = CHECKSUM_HW;
513	return 0;
514}
515
516/*
517 *	This routine will send an RST to the other tcp.
518 *
519 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
520 *		      for the reset?
521 *	Answer: if a packet caused the RST, it is not for a socket
522 *		existing in our system; if it is matched to a socket,
523 *		it is just a duplicate segment or a bug in the other side's
524 *		TCP. So we build the reply based only on the parameters
525 *		that arrived with the segment.
526 *	Exception: precedence violation. We do not implement it in any case.
527 */
528
529static void tcp_v4_send_reset(struct sk_buff *skb)
530{
531	struct tcphdr *th = skb->h.th;
532	struct tcphdr rth;
533	struct ip_reply_arg arg;
534
535	/* Never send a reset in response to a reset. */
536	if (th->rst)
537		return;
538
539	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
540		return;
541
542	/* Swap the send and the receive. */
543	memset(&rth, 0, sizeof(struct tcphdr));
544	rth.dest   = th->source;
545	rth.source = th->dest;
546	rth.doff   = sizeof(struct tcphdr) / 4;
547	rth.rst    = 1;
548
549	if (th->ack) {
550		rth.seq = th->ack_seq;
551	} else {
552		rth.ack = 1;
553		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
554				    skb->len - (th->doff << 2));
555	}
556
557	memset(&arg, 0, sizeof arg);
558	arg.iov[0].iov_base = (unsigned char *)&rth;
559	arg.iov[0].iov_len  = sizeof rth;
560	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
561				      skb->nh.iph->saddr, /*XXX*/
562				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
563	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
564
565	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
566
567	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
568	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
569}
570
571/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
572   outside of socket context, is certainly ugly. What can I do?
573 */
574
575static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
576			    u32 win, u32 ts)
577{
578	struct tcphdr *th = skb->h.th;
579	struct {
580		struct tcphdr th;
581		u32 tsopt[3];
582	} rep;
583	struct ip_reply_arg arg;
584
585	memset(&rep.th, 0, sizeof(struct tcphdr));
586	memset(&arg, 0, sizeof arg);
587
588	arg.iov[0].iov_base = (unsigned char *)&rep;
589	arg.iov[0].iov_len  = sizeof(rep.th);
590	if (ts) {
591		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
592				     (TCPOPT_TIMESTAMP << 8) |
593				     TCPOLEN_TIMESTAMP);
594		rep.tsopt[1] = htonl(tcp_time_stamp);
595		rep.tsopt[2] = htonl(ts);
596		arg.iov[0].iov_len = sizeof(rep);
597	}
598
599	/* Swap the send and the receive. */
600	rep.th.dest    = th->source;
601	rep.th.source  = th->dest;
602	rep.th.doff    = arg.iov[0].iov_len / 4;
603	rep.th.seq     = htonl(seq);
604	rep.th.ack_seq = htonl(ack);
605	rep.th.ack     = 1;
606	rep.th.window  = htons(win);
607
608	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
609				      skb->nh.iph->saddr, /*XXX*/
610				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
611	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
612
613	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
614
615	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
616}
617
618static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
619{
620	struct inet_timewait_sock *tw = inet_twsk(sk);
621	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
622
623	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
624			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
625
626	inet_twsk_put(tw);
627}
628
629static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
630{
631	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
632			req->ts_recent);
633}
634
635/*
636 *	Send a SYN-ACK after having received a SYN.
637 *	This still operates on a request_sock only, not on a big
638 *	socket.
639 */
640static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
641			      struct dst_entry *dst)
642{
643	const struct inet_request_sock *ireq = inet_rsk(req);
644	int err = -1;
645	struct sk_buff * skb;
646
647	/* First, grab a route. */
648	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
649		goto out;
650
651	skb = tcp_make_synack(sk, dst, req);
652
653	if (skb) {
654		struct tcphdr *th = skb->h.th;
655
656		th->check = tcp_v4_check(th, skb->len,
657					 ireq->loc_addr,
658					 ireq->rmt_addr,
659					 csum_partial((char *)th, skb->len,
660						      skb->csum));
661
662		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
663					    ireq->rmt_addr,
664					    ireq->opt);
665		if (err == NET_XMIT_CN)
666			err = 0;
667	}
668
669out:
670	dst_release(dst);
671	return err;
672}
673
674/*
675 *	IPv4 request_sock destructor.
676 */
677static void tcp_v4_reqsk_destructor(struct request_sock *req)
678{
679	kfree(inet_rsk(req)->opt);
680}
681
682#ifdef CONFIG_SYN_COOKIES
683static void syn_flood_warning(struct sk_buff *skb)
684{
685	static unsigned long warntime;
686
687	if (time_after(jiffies, (warntime + HZ * 60))) {
688		warntime = jiffies;
689		printk(KERN_INFO
690		       "possible SYN flooding on port %d. Sending cookies.\n",
691		       ntohs(skb->h.th->dest));
692	}
693}
694#endif
695
696/*
697 * Save and compile IPv4 options into the request_sock if needed.
698 */
699static struct ip_options *tcp_v4_save_options(struct sock *sk,
700					      struct sk_buff *skb)
701{
702	struct ip_options *opt = &(IPCB(skb)->opt);
703	struct ip_options *dopt = NULL;
704
705	if (opt && opt->optlen) {
706		int opt_size = optlength(opt);
707		dopt = kmalloc(opt_size, GFP_ATOMIC);
708		if (dopt) {
709			if (ip_options_echo(dopt, skb)) {
710				kfree(dopt);
711				dopt = NULL;
712			}
713		}
714	}
715	return dopt;
716}
717
718struct request_sock_ops tcp_request_sock_ops = {
719	.family		=	PF_INET,
720	.obj_size	=	sizeof(struct tcp_request_sock),
721	.rtx_syn_ack	=	tcp_v4_send_synack,
722	.send_ack	=	tcp_v4_reqsk_send_ack,
723	.destructor	=	tcp_v4_reqsk_destructor,
724	.send_reset	=	tcp_v4_send_reset,
725};
726
727static struct timewait_sock_ops tcp_timewait_sock_ops = {
728	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
729	.twsk_unique	= tcp_twsk_unique,
730};
731
732int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
733{
734	struct inet_request_sock *ireq;
735	struct tcp_options_received tmp_opt;
736	struct request_sock *req;
737	__u32 saddr = skb->nh.iph->saddr;
738	__u32 daddr = skb->nh.iph->daddr;
739	__u32 isn = TCP_SKB_CB(skb)->when;
740	struct dst_entry *dst = NULL;
741#ifdef CONFIG_SYN_COOKIES
742	int want_cookie = 0;
743#else
744#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
745#endif
746
747	/* Never answer SYNs sent to broadcast or multicast addresses. */
748	if (((struct rtable *)skb->dst)->rt_flags &
749	    (RTCF_BROADCAST | RTCF_MULTICAST))
750		goto drop;
751
752	/* TW buckets are converted to open requests without
753	 * limitation; they conserve resources and the peer is
754	 * evidently a real one.
755	 */
756	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
757#ifdef CONFIG_SYN_COOKIES
758		if (sysctl_tcp_syncookies) {
759			want_cookie = 1;
760		} else
761#endif
762		goto drop;
763	}
764
765	/* The accept backlog is full. If we have already queued enough
766	 * warm entries in the syn queue, drop the request. That is better than
767	 * clogging the syn queue with open requests whose timeouts increase
768	 * exponentially.
769	 */
770	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
771		goto drop;
772
773	req = reqsk_alloc(&tcp_request_sock_ops);
774	if (!req)
775		goto drop;
776
777	tcp_clear_options(&tmp_opt);
778	tmp_opt.mss_clamp = 536;
779	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
780
781	tcp_parse_options(skb, &tmp_opt, 0);
782
783	if (want_cookie) {
784		tcp_clear_options(&tmp_opt);
785		tmp_opt.saw_tstamp = 0;
786	}
787
788	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
789		/* Some OSes (unknown ones, but I see them on a web server that
790		 * contains information interesting only for Windows
791		 * users) do not send their timestamp in the SYN. It is an easy
792		 * case: we simply do not advertise TS support.
793		 */
794		tmp_opt.saw_tstamp = 0;
795		tmp_opt.tstamp_ok  = 0;
796	}
797	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
798
799	tcp_openreq_init(req, &tmp_opt, skb);
800
801	ireq = inet_rsk(req);
802	ireq->loc_addr = daddr;
803	ireq->rmt_addr = saddr;
804	ireq->opt = tcp_v4_save_options(sk, skb);
805	if (!want_cookie)
806		TCP_ECN_create_request(req, skb->h.th);
807
808	if (want_cookie) {
809#ifdef CONFIG_SYN_COOKIES
810		syn_flood_warning(skb);
811#endif
812		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
813	} else if (!isn) {
814		struct inet_peer *peer = NULL;
815
816		/* VJ's idea. We save the last timestamp seen
817		 * from the destination in the peer table when entering
818		 * state TIME-WAIT, and check against it before
819		 * accepting a new connection request.
820		 *
821		 * If "isn" is not zero, this request hit an alive
822		 * timewait bucket, so all the necessary checks
823		 * are made in the function processing the timewait state.
824		 */
825		if (tmp_opt.saw_tstamp &&
826		    tcp_death_row.sysctl_tw_recycle &&
827		    (dst = inet_csk_route_req(sk, req)) != NULL &&
828		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
829		    peer->v4daddr == saddr) {
830			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
831			    (s32)(peer->tcp_ts - req->ts_recent) >
832							TCP_PAWS_WINDOW) {
833				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
834				dst_release(dst);
835				goto drop_and_free;
836			}
837		}
838		/* Kill the following clause, if you dislike this way. */
839		else if (!sysctl_tcp_syncookies &&
840			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
841			  (sysctl_max_syn_backlog >> 2)) &&
842			 (!peer || !peer->tcp_ts_stamp) &&
843			 (!dst || !dst_metric(dst, RTAX_RTT))) {
844			/* Without syncookies the last quarter of the
845			 * backlog is filled only with destinations
846			 * proven to be alive.
847			 * It means that we continue to communicate
848			 * with destinations already remembered
849			 * from before the moment of the synflood.
850			 */
851			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
852				       "request from %u.%u.%u.%u/%u\n",
853				       NIPQUAD(saddr),
854				       ntohs(skb->h.th->source));
855			dst_release(dst);
856			goto drop_and_free;
857		}
858
859		isn = tcp_v4_init_sequence(sk, skb);
860	}
861	tcp_rsk(req)->snt_isn = isn;
862
863	if (tcp_v4_send_synack(sk, req, dst))
864		goto drop_and_free;
865
866	if (want_cookie) {
867		reqsk_free(req);
868	} else {
869		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
870	}
871	return 0;
872
873drop_and_free:
874	reqsk_free(req);
875drop:
876	return 0;
877}
878
879
880/*
881 * The three way handshake has completed - we got a valid synack -
882 * now create the new socket.
883 */
884struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
885				  struct request_sock *req,
886				  struct dst_entry *dst)
887{
888	struct inet_request_sock *ireq;
889	struct inet_sock *newinet;
890	struct tcp_sock *newtp;
891	struct sock *newsk;
892
893	if (sk_acceptq_is_full(sk))
894		goto exit_overflow;
895
896	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
897		goto exit;
898
899	newsk = tcp_create_openreq_child(sk, req, skb);
900	if (!newsk)
901		goto exit;
902
903	newsk->sk_gso_type = SKB_GSO_TCPV4;
904	sk_setup_caps(newsk, dst);
905
906	newtp		      = tcp_sk(newsk);
907	newinet		      = inet_sk(newsk);
908	ireq		      = inet_rsk(req);
909	newinet->daddr	      = ireq->rmt_addr;
910	newinet->rcv_saddr    = ireq->loc_addr;
911	newinet->saddr	      = ireq->loc_addr;
912	newinet->opt	      = ireq->opt;
913	ireq->opt	      = NULL;
914	newinet->mc_index     = inet_iif(skb);
915	newinet->mc_ttl	      = skb->nh.iph->ttl;
916	inet_csk(newsk)->icsk_ext_hdr_len = 0;
917	if (newinet->opt)
918		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
919	newinet->id = newtp->write_seq ^ jiffies;
920
921	tcp_mtup_init(newsk);
922	tcp_sync_mss(newsk, dst_mtu(dst));
923	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
924	tcp_initialize_rcv_mss(newsk);
925
926	__inet_hash(&tcp_hashinfo, newsk, 0);
927	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
928
929	return newsk;
930
931exit_overflow:
932	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
933exit:
934	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
935	dst_release(dst);
936	return NULL;
937}
938
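/*
 * Handle a segment arriving on a listening socket: first look for a
 * matching open request and let tcp_check_req() validate it, then
 * check the established hash in case the handshake already completed,
 * and finally fall back to syncookie validation when enabled.
 */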
939static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
940{
941	struct tcphdr *th = skb->h.th;
942	struct iphdr *iph = skb->nh.iph;
943	struct sock *nsk;
944	struct request_sock **prev;
945	/* Find possible connection requests. */
946	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
947						       iph->saddr, iph->daddr);
948	if (req)
949		return tcp_check_req(sk, skb, req, prev);
950
951	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
952					th->source, skb->nh.iph->daddr,
953					ntohs(th->dest), inet_iif(skb));
954
955	if (nsk) {
956		if (nsk->sk_state != TCP_TIME_WAIT) {
957			bh_lock_sock(nsk);
958			return nsk;
959		}
960		inet_twsk_put((struct inet_timewait_sock *)nsk);
961		return NULL;
962	}
963
964#ifdef CONFIG_SYN_COOKIES
965	if (!th->rst && !th->syn && th->ack)
966		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
967#endif
968	return sk;
969}
970
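/*
 * Validate or set up the TCP checksum on receive: trust a correct
 * hardware checksum, otherwise seed skb->csum with the pseudo-header
 * sum and verify short packets immediately, deferring the rest.
 */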
971static int tcp_v4_checksum_init(struct sk_buff *skb)
972{
973	if (skb->ip_summed == CHECKSUM_HW) {
974		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
975				  skb->nh.iph->daddr, skb->csum)) {
976			skb->ip_summed = CHECKSUM_UNNECESSARY;
977			return 0;
978		}
979	}
980
981	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
982				       skb->len, IPPROTO_TCP, 0);
983
984	if (skb->len <= 76) {
985		return __skb_checksum_complete(skb);
986	}
987	return 0;
988}
989
990
991/* The socket must have its spinlock held when we get
992 * here.
993 *
994 * We have a potential double-lock case here, so even when
995 * doing backlog processing we use the BH locking scheme.
996 * This is because we cannot sleep with the original spinlock
997 * held.
998 */
999int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1000{
1001	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1002		TCP_CHECK_TIMER(sk);
1003		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1004			goto reset;
1005		TCP_CHECK_TIMER(sk);
1006		return 0;
1007	}
1008
1009	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1010		goto csum_err;
1011
1012	if (sk->sk_state == TCP_LISTEN) {
1013		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1014		if (!nsk)
1015			goto discard;
1016
1017		if (nsk != sk) {
1018			if (tcp_child_process(sk, nsk, skb))
1019				goto reset;
1020			return 0;
1021		}
1022	}
1023
1024	TCP_CHECK_TIMER(sk);
1025	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1026		goto reset;
1027	TCP_CHECK_TIMER(sk);
1028	return 0;
1029
1030reset:
1031	tcp_v4_send_reset(skb);
1032discard:
1033	kfree_skb(skb);
1034	/* Be careful here. If this function gets more complicated and
1035	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1036	 * might be destroyed here. This current version compiles correctly,
1037	 * but you have been warned.
1038	 */
1039	return 0;
1040
1041csum_err:
1042	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1043	goto discard;
1044}
1045
1046/*
1047 *	From tcp_input.c
1048 */
1049
1050int tcp_v4_rcv(struct sk_buff *skb)
1051{
1052	struct tcphdr *th;
1053	struct sock *sk;
1054	int ret;
1055
1056	if (skb->pkt_type != PACKET_HOST)
1057		goto discard_it;
1058
1059	/* Count it even if it's bad */
1060	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1061
1062	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1063		goto discard_it;
1064
1065	th = skb->h.th;
1066
1067	if (th->doff < sizeof(struct tcphdr) / 4)
1068		goto bad_packet;
1069	if (!pskb_may_pull(skb, th->doff * 4))
1070		goto discard_it;
1071
1072	/* An explanation is required here, I think.
1073	 * Packet length and doff are validated by header prediction,
1074	 * provided the case of th->doff == 0 is eliminated.
1075	 * So, we defer the checks. */
1076	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1077	     tcp_v4_checksum_init(skb)))
1078		goto bad_packet;
1079
1080	th = skb->h.th;
1081	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1082	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1083				    skb->len - th->doff * 4);
1084	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1085	TCP_SKB_CB(skb)->when	 = 0;
1086	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1087	TCP_SKB_CB(skb)->sacked	 = 0;
1088
1089	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1090			   skb->nh.iph->daddr, ntohs(th->dest),
1091			   inet_iif(skb));
1092
1093	if (!sk)
1094		goto no_tcp_socket;
1095
1096process:
1097	if (sk->sk_state == TCP_TIME_WAIT)
1098		goto do_time_wait;
1099
1100	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1101		goto discard_and_relse;
1102	nf_reset(skb);
1103
1104	if (sk_filter(sk, skb, 0))
1105		goto discard_and_relse;
1106
1107	skb->dev = NULL;
1108
1109	bh_lock_sock_nested(sk);
1110	ret = 0;
1111	if (!sock_owned_by_user(sk)) {
1112#ifdef CONFIG_NET_DMA
1113		struct tcp_sock *tp = tcp_sk(sk);
1114		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1115			tp->ucopy.dma_chan = get_softnet_dma();
1116		if (tp->ucopy.dma_chan)
1117			ret = tcp_v4_do_rcv(sk, skb);
1118		else
1119#endif
1120		{
1121			if (!tcp_prequeue(sk, skb))
1122			ret = tcp_v4_do_rcv(sk, skb);
1123		}
1124	} else
1125		sk_add_backlog(sk, skb);
1126	bh_unlock_sock(sk);
1127
1128	sock_put(sk);
1129
1130	return ret;
1131
1132no_tcp_socket:
1133	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1134		goto discard_it;
1135
1136	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1137bad_packet:
1138		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1139	} else {
1140		tcp_v4_send_reset(skb);
1141	}
1142
1143discard_it:
1144	/* Discard frame. */
1145	kfree_skb(skb);
1146  	return 0;
1147
1148discard_and_relse:
1149	sock_put(sk);
1150	goto discard_it;
1151
1152do_time_wait:
1153	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1154		inet_twsk_put((struct inet_timewait_sock *) sk);
1155		goto discard_it;
1156	}
1157
1158	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1159		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1160		inet_twsk_put((struct inet_timewait_sock *) sk);
1161		goto discard_it;
1162	}
1163	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1164					   skb, th)) {
1165	case TCP_TW_SYN: {
1166		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1167							skb->nh.iph->daddr,
1168							ntohs(th->dest),
1169							inet_iif(skb));
1170		if (sk2) {
1171			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1172					     &tcp_death_row);
1173			inet_twsk_put((struct inet_timewait_sock *)sk);
1174			sk = sk2;
1175			goto process;
1176		}
1177		/* Fall through to ACK */
1178	}
1179	case TCP_TW_ACK:
1180		tcp_v4_timewait_ack(sk, skb);
1181		break;
1182	case TCP_TW_RST:
1183		goto no_tcp_socket;
1184	case TCP_TW_SUCCESS:;
1185	}
1186	goto discard_it;
1187}
1188
1189/* VJ's idea. Save the last timestamp seen from this destination
1190 * and hold it at least for the normal timewait interval, to use for
1191 * duplicate segment detection in subsequent connections before they
1192 * enter the synchronized state.
1193 */
1194
1195int tcp_v4_remember_stamp(struct sock *sk)
1196{
1197	struct inet_sock *inet = inet_sk(sk);
1198	struct tcp_sock *tp = tcp_sk(sk);
1199	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1200	struct inet_peer *peer = NULL;
1201	int release_it = 0;
1202
1203	if (!rt || rt->rt_dst != inet->daddr) {
1204		peer = inet_getpeer(inet->daddr, 1);
1205		release_it = 1;
1206	} else {
1207		if (!rt->peer)
1208			rt_bind_peer(rt, 1);
1209		peer = rt->peer;
1210	}
1211
1212	if (peer) {
1213		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1214		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1215		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1216			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1217			peer->tcp_ts = tp->rx_opt.ts_recent;
1218		}
1219		if (release_it)
1220			inet_putpeer(peer);
1221		return 1;
1222	}
1223
1224	return 0;
1225}
1226
1227int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1228{
1229	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1230
1231	if (peer) {
1232		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1233
1234		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1235		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1236		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1237			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1238			peer->tcp_ts	   = tcptw->tw_ts_recent;
1239		}
1240		inet_putpeer(peer);
1241		return 1;
1242	}
1243
1244	return 0;
1245}
1246
1247struct inet_connection_sock_af_ops ipv4_specific = {
1248	.queue_xmit	   = ip_queue_xmit,
1249	.send_check	   = tcp_v4_send_check,
1250	.rebuild_header	   = inet_sk_rebuild_header,
1251	.conn_request	   = tcp_v4_conn_request,
1252	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1253	.remember_stamp	   = tcp_v4_remember_stamp,
1254	.net_header_len	   = sizeof(struct iphdr),
1255	.setsockopt	   = ip_setsockopt,
1256	.getsockopt	   = ip_getsockopt,
1257	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1258	.sockaddr_len	   = sizeof(struct sockaddr_in),
1259#ifdef CONFIG_COMPAT
1260	.compat_setsockopt = compat_ip_setsockopt,
1261	.compat_getsockopt = compat_ip_getsockopt,
1262#endif
1263};
1264
1265/* NOTE: A lot of things are set to zero explicitly by the call to
1266 *       sk_alloc(), so they need not be done here.
1267 */
1268static int tcp_v4_init_sock(struct sock *sk)
1269{
1270	struct inet_connection_sock *icsk = inet_csk(sk);
1271	struct tcp_sock *tp = tcp_sk(sk);
1272
1273	skb_queue_head_init(&tp->out_of_order_queue);
1274	tcp_init_xmit_timers(sk);
1275	tcp_prequeue_init(tp);
1276
1277	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1278	tp->mdev = TCP_TIMEOUT_INIT;
1279
1280	/* So many TCP implementations out there (incorrectly) count the
1281	 * initial SYN frame in their delayed-ACK and congestion control
1282	 * algorithms that we must have the following bandaid to talk
1283	 * efficiently to them.  -DaveM
1284	 */
1285	tp->snd_cwnd = 2;
1286
1287	/* See draft-stevens-tcpca-spec-01 for discussion of the
1288	 * initialization of these values.
1289	 */
1290	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1291	tp->snd_cwnd_clamp = ~0;
1292	tp->mss_cache = 536;
1293
1294	tp->reordering = sysctl_tcp_reordering;
1295	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1296
1297	sk->sk_state = TCP_CLOSE;
1298
1299	sk->sk_write_space = sk_stream_write_space;
1300	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1301
1302	icsk->icsk_af_ops = &ipv4_specific;
1303	icsk->icsk_sync_mss = tcp_sync_mss;
1304
1305	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1306	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1307
1308	atomic_inc(&tcp_sockets_allocated);
1309
1310	return 0;
1311}
1312
1313int tcp_v4_destroy_sock(struct sock *sk)
1314{
1315	struct tcp_sock *tp = tcp_sk(sk);
1316
1317	tcp_clear_xmit_timers(sk);
1318
1319	tcp_cleanup_congestion_control(sk);
1320
1321	/* Clean up the write buffer. */
1322  	sk_stream_writequeue_purge(sk);
1323
1324	/* Cleans up our, hopefully empty, out_of_order_queue. */
1325  	__skb_queue_purge(&tp->out_of_order_queue);
1326
1327#ifdef CONFIG_NET_DMA
1328	/* Cleans up our sk_async_wait_queue */
1329  	__skb_queue_purge(&sk->sk_async_wait_queue);
1330#endif
1331
1332	/* Clean the prequeue; it really should be empty. */
1333	__skb_queue_purge(&tp->ucopy.prequeue);
1334
1335	/* Clean up a referenced TCP bind bucket. */
1336	if (inet_csk(sk)->icsk_bind_hash)
1337		inet_put_port(&tcp_hashinfo, sk);
1338
1339	/*
1340	 * If sendmsg cached page exists, toss it.
1341	 */
1342	if (sk->sk_sndmsg_page) {
1343		__free_page(sk->sk_sndmsg_page);
1344		sk->sk_sndmsg_page = NULL;
1345	}
1346
1347	atomic_dec(&tcp_sockets_allocated);
1348
1349	return 0;
1350}
1351
1352EXPORT_SYMBOL(tcp_v4_destroy_sock);
1353
1354#ifdef CONFIG_PROC_FS
1355/* Proc filesystem TCP sock list dumping. */
1356
1357static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1358{
1359	return hlist_empty(head) ? NULL :
1360		list_entry(head->first, struct inet_timewait_sock, tw_node);
1361}
1362
1363static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1364{
1365	return tw->tw_node.next ?
1366		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1367}
1368
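/*
 * Iterate over the listening hash: walk each listener of the requested
 * family and, for every listener, also walk its SYN (open request)
 * queue, recording the position in struct tcp_iter_state.
 */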
1369static void *listening_get_next(struct seq_file *seq, void *cur)
1370{
1371	struct inet_connection_sock *icsk;
1372	struct hlist_node *node;
1373	struct sock *sk = cur;
1374	struct tcp_iter_state* st = seq->private;
1375
1376	if (!sk) {
1377		st->bucket = 0;
1378		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1379		goto get_sk;
1380	}
1381
1382	++st->num;
1383
1384	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1385		struct request_sock *req = cur;
1386
1387	       	icsk = inet_csk(st->syn_wait_sk);
1388		req = req->dl_next;
1389		while (1) {
1390			while (req) {
1391				if (req->rsk_ops->family == st->family) {
1392					cur = req;
1393					goto out;
1394				}
1395				req = req->dl_next;
1396			}
1397			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1398				break;
1399get_req:
1400			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1401		}
1402		sk	  = sk_next(st->syn_wait_sk);
1403		st->state = TCP_SEQ_STATE_LISTENING;
1404		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1405	} else {
1406	       	icsk = inet_csk(sk);
1407		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1408		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1409			goto start_req;
1410		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1411		sk = sk_next(sk);
1412	}
1413get_sk:
1414	sk_for_each_from(sk, node) {
1415		if (sk->sk_family == st->family) {
1416			cur = sk;
1417			goto out;
1418		}
1419	       	icsk = inet_csk(sk);
1420		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1421		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1422start_req:
1423			st->uid		= sock_i_uid(sk);
1424			st->syn_wait_sk = sk;
1425			st->state	= TCP_SEQ_STATE_OPENREQ;
1426			st->sbucket	= 0;
1427			goto get_req;
1428		}
1429		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1430	}
1431	if (++st->bucket < INET_LHTABLE_SIZE) {
1432		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1433		goto get_sk;
1434	}
1435	cur = NULL;
1436out:
1437	return cur;
1438}
1439
1440static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1441{
1442	void *rc = listening_get_next(seq, NULL);
1443
1444	while (rc && *pos) {
1445		rc = listening_get_next(seq, rc);
1446		--*pos;
1447	}
1448	return rc;
1449}
1450
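/*
 * Find the first established (or time-wait) socket of the requested
 * family, scanning each ehash bucket and then the time-wait half of
 * the same bucket under its read lock.
 */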
1451static void *established_get_first(struct seq_file *seq)
1452{
1453	struct tcp_iter_state* st = seq->private;
1454	void *rc = NULL;
1455
1456	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1457		struct sock *sk;
1458		struct hlist_node *node;
1459		struct inet_timewait_sock *tw;
1460
1461		/* We can reschedule _before_ having picked the target: */
1462		cond_resched_softirq();
1463
1464		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1465		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1466			if (sk->sk_family != st->family) {
1467				continue;
1468			}
1469			rc = sk;
1470			goto out;
1471		}
1472		st->state = TCP_SEQ_STATE_TIME_WAIT;
1473		inet_twsk_for_each(tw, node,
1474				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1475			if (tw->tw_family != st->family) {
1476				continue;
1477			}
1478			rc = tw;
1479			goto out;
1480		}
1481		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1482		st->state = TCP_SEQ_STATE_ESTABLISHED;
1483	}
1484out:
1485	return rc;
1486}
1487
1488static void *established_get_next(struct seq_file *seq, void *cur)
1489{
1490	struct sock *sk = cur;
1491	struct inet_timewait_sock *tw;
1492	struct hlist_node *node;
1493	struct tcp_iter_state* st = seq->private;
1494
1495	++st->num;
1496
1497	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1498		tw = cur;
1499		tw = tw_next(tw);
1500get_tw:
1501		while (tw && tw->tw_family != st->family) {
1502			tw = tw_next(tw);
1503		}
1504		if (tw) {
1505			cur = tw;
1506			goto out;
1507		}
1508		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1509		st->state = TCP_SEQ_STATE_ESTABLISHED;
1510
1511		/* We can reschedule between buckets: */
1512		cond_resched_softirq();
1513
1514		if (++st->bucket < tcp_hashinfo.ehash_size) {
1515			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1516			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1517		} else {
1518			cur = NULL;
1519			goto out;
1520		}
1521	} else
1522		sk = sk_next(sk);
1523
1524	sk_for_each_from(sk, node) {
1525		if (sk->sk_family == st->family)
1526			goto found;
1527	}
1528
1529	st->state = TCP_SEQ_STATE_TIME_WAIT;
1530	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1531	goto get_tw;
1532found:
1533	cur = sk;
1534out:
1535	return cur;
1536}
1537
1538static void *established_get_idx(struct seq_file *seq, loff_t pos)
1539{
1540	void *rc = established_get_first(seq);
1541
1542	while (rc && pos) {
1543		rc = established_get_next(seq, rc);
1544		--pos;
1545	}
1546	return rc;
1547}
1548
1549static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1550{
1551	void *rc;
1552	struct tcp_iter_state* st = seq->private;
1553
1554	inet_listen_lock(&tcp_hashinfo);
1555	st->state = TCP_SEQ_STATE_LISTENING;
1556	rc	  = listening_get_idx(seq, &pos);
1557
1558	if (!rc) {
1559		inet_listen_unlock(&tcp_hashinfo);
1560		local_bh_disable();
1561		st->state = TCP_SEQ_STATE_ESTABLISHED;
1562		rc	  = established_get_idx(seq, pos);
1563	}
1564
1565	return rc;
1566}
1567
1568static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1569{
1570	struct tcp_iter_state* st = seq->private;
1571	st->state = TCP_SEQ_STATE_LISTENING;
1572	st->num = 0;
1573	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1574}
1575
1576static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1577{
1578	void *rc = NULL;
1579	struct tcp_iter_state* st;
1580
1581	if (v == SEQ_START_TOKEN) {
1582		rc = tcp_get_idx(seq, 0);
1583		goto out;
1584	}
1585	st = seq->private;
1586
1587	switch (st->state) {
1588	case TCP_SEQ_STATE_OPENREQ:
1589	case TCP_SEQ_STATE_LISTENING:
1590		rc = listening_get_next(seq, v);
1591		if (!rc) {
1592			inet_listen_unlock(&tcp_hashinfo);
1593			local_bh_disable();
1594			st->state = TCP_SEQ_STATE_ESTABLISHED;
1595			rc	  = established_get_first(seq);
1596		}
1597		break;
1598	case TCP_SEQ_STATE_ESTABLISHED:
1599	case TCP_SEQ_STATE_TIME_WAIT:
1600		rc = established_get_next(seq, v);
1601		break;
1602	}
1603out:
1604	++*pos;
1605	return rc;
1606}
1607
1608static void tcp_seq_stop(struct seq_file *seq, void *v)
1609{
1610	struct tcp_iter_state* st = seq->private;
1611
1612	switch (st->state) {
1613	case TCP_SEQ_STATE_OPENREQ:
1614		if (v) {
1615			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1616			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1617		}
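		/* fall through: the listen lock is also held in OPENREQ state */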
1618	case TCP_SEQ_STATE_LISTENING:
1619		if (v != SEQ_START_TOKEN)
1620			inet_listen_unlock(&tcp_hashinfo);
1621		break;
1622	case TCP_SEQ_STATE_TIME_WAIT:
1623	case TCP_SEQ_STATE_ESTABLISHED:
1624		if (v)
1625			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1626		local_bh_enable();
1627		break;
1628	}
1629}
1630
1631static int tcp_seq_open(struct inode *inode, struct file *file)
1632{
1633	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1634	struct seq_file *seq;
1635	struct tcp_iter_state *s;
1636	int rc;
1637
1638	if (unlikely(afinfo == NULL))
1639		return -EINVAL;
1640
1641	s = kzalloc(sizeof(*s), GFP_KERNEL);
1642	if (!s)
1643		return -ENOMEM;
1644	s->family		= afinfo->family;
1645	s->seq_ops.start	= tcp_seq_start;
1646	s->seq_ops.next		= tcp_seq_next;
1647	s->seq_ops.show		= afinfo->seq_show;
1648	s->seq_ops.stop		= tcp_seq_stop;
1649
1650	rc = seq_open(file, &s->seq_ops);
1651	if (rc)
1652		goto out_kfree;
1653	seq	     = file->private_data;
1654	seq->private = s;
1655out:
1656	return rc;
1657out_kfree:
1658	kfree(s);
1659	goto out;
1660}
1661
1662int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1663{
1664	int rc = 0;
1665	struct proc_dir_entry *p;
1666
1667	if (!afinfo)
1668		return -EINVAL;
1669	afinfo->seq_fops->owner		= afinfo->owner;
1670	afinfo->seq_fops->open		= tcp_seq_open;
1671	afinfo->seq_fops->read		= seq_read;
1672	afinfo->seq_fops->llseek	= seq_lseek;
1673	afinfo->seq_fops->release	= seq_release_private;
1674
1675	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1676	if (p)
1677		p->data = afinfo;
1678	else
1679		rc = -ENOMEM;
1680	return rc;
1681}
1682
1683void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1684{
1685	if (!afinfo)
1686		return;
1687	proc_net_remove(afinfo->name);
1688	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1689}
1690
1691static void get_openreq4(struct sock *sk, struct request_sock *req,
1692			 char *tmpbuf, int i, int uid)
1693{
1694	const struct inet_request_sock *ireq = inet_rsk(req);
1695	int ttd = req->expires - jiffies;
1696
1697	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1698		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1699		i,
1700		ireq->loc_addr,
1701		ntohs(inet_sk(sk)->sport),
1702		ireq->rmt_addr,
1703		ntohs(ireq->rmt_port),
1704		TCP_SYN_RECV,
1705		0, 0, /* could print option size, but that is af dependent. */
1706		1,    /* timers active (only the expire timer) */
1707		jiffies_to_clock_t(ttd),
1708		req->retrans,
1709		uid,
1710		0,  /* non standard timer */
1711		0, /* open_requests have no inode */
1712		atomic_read(&sk->sk_refcnt),
1713		req);
1714}
1715
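/*
 * Format one /proc/net/tcp line for an established or listening socket.
 * timer_active encodes which timer is pending: 1 retransmit, 2 keepalive
 * (sk_timer), 4 zero-window probe, 0 none.
 */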
1716static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1717{
1718	int timer_active;
1719	unsigned long timer_expires;
1720	struct tcp_sock *tp = tcp_sk(sp);
1721	const struct inet_connection_sock *icsk = inet_csk(sp);
1722	struct inet_sock *inet = inet_sk(sp);
1723	unsigned int dest = inet->daddr;
1724	unsigned int src = inet->rcv_saddr;
1725	__u16 destp = ntohs(inet->dport);
1726	__u16 srcp = ntohs(inet->sport);
1727
1728	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1729		timer_active	= 1;
1730		timer_expires	= icsk->icsk_timeout;
1731	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1732		timer_active	= 4;
1733		timer_expires	= icsk->icsk_timeout;
1734	} else if (timer_pending(&sp->sk_timer)) {
1735		timer_active	= 2;
1736		timer_expires	= sp->sk_timer.expires;
1737	} else {
1738		timer_active	= 0;
1739		timer_expires = jiffies;
1740	}
1741
1742	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1743			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1744		i, src, srcp, dest, destp, sp->sk_state,
1745		tp->write_seq - tp->snd_una,
1746		(sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
1747		timer_active,
1748		jiffies_to_clock_t(timer_expires - jiffies),
1749		icsk->icsk_retransmits,
1750		sock_i_uid(sp),
1751		icsk->icsk_probes_out,
1752		sock_i_ino(sp),
1753		atomic_read(&sp->sk_refcnt), sp,
1754		icsk->icsk_rto,
1755		icsk->icsk_ack.ato,
1756		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1757		tp->snd_cwnd,
1758		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1759}
1760
1761static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1762{
1763	unsigned int dest, src;
1764	__u16 destp, srcp;
1765	int ttd = tw->tw_ttd - jiffies;
1766
1767	if (ttd < 0)
1768		ttd = 0;
1769
1770	dest  = tw->tw_daddr;
1771	src   = tw->tw_rcv_saddr;
1772	destp = ntohs(tw->tw_dport);
1773	srcp  = ntohs(tw->tw_sport);
1774
1775	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1776		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1777		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1778		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1779		atomic_read(&tw->tw_refcnt), tw);
1780}
1781
1782#define TMPSZ 150
1783
1784static int tcp4_seq_show(struct seq_file *seq, void *v)
1785{
1786	struct tcp_iter_state* st;
1787	char tmpbuf[TMPSZ + 1];
1788
1789	if (v == SEQ_START_TOKEN) {
1790		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1791			   "  sl  local_address rem_address   st tx_queue "
1792			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1793			   "inode");
1794		goto out;
1795	}
1796	st = seq->private;
1797
1798	switch (st->state) {
1799	case TCP_SEQ_STATE_LISTENING:
1800	case TCP_SEQ_STATE_ESTABLISHED:
1801		get_tcp4_sock(v, tmpbuf, st->num);
1802		break;
1803	case TCP_SEQ_STATE_OPENREQ:
1804		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1805		break;
1806	case TCP_SEQ_STATE_TIME_WAIT:
1807		get_timewait4_sock(v, tmpbuf, st->num);
1808		break;
1809	}
1810	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1811out:
1812	return 0;
1813}
1814
1815static struct file_operations tcp4_seq_fops;
1816static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1817	.owner		= THIS_MODULE,
1818	.name		= "tcp",
1819	.family		= AF_INET,
1820	.seq_show	= tcp4_seq_show,
1821	.seq_fops	= &tcp4_seq_fops,
1822};
1823
1824int __init tcp4_proc_init(void)
1825{
1826	return tcp_proc_register(&tcp4_seq_afinfo);
1827}
1828
1829void tcp4_proc_exit(void)
1830{
1831	tcp_proc_unregister(&tcp4_seq_afinfo);
1832}
1833#endif /* CONFIG_PROC_FS */
1834
1835struct proto tcp_prot = {
1836	.name			= "TCP",
1837	.owner			= THIS_MODULE,
1838	.close			= tcp_close,
1839	.connect		= tcp_v4_connect,
1840	.disconnect		= tcp_disconnect,
1841	.accept			= inet_csk_accept,
1842	.ioctl			= tcp_ioctl,
1843	.init			= tcp_v4_init_sock,
1844	.destroy		= tcp_v4_destroy_sock,
1845	.shutdown		= tcp_shutdown,
1846	.setsockopt		= tcp_setsockopt,
1847	.getsockopt		= tcp_getsockopt,
1848	.sendmsg		= tcp_sendmsg,
1849	.recvmsg		= tcp_recvmsg,
1850	.backlog_rcv		= tcp_v4_do_rcv,
1851	.hash			= tcp_v4_hash,
1852	.unhash			= tcp_unhash,
1853	.get_port		= tcp_v4_get_port,
1854	.enter_memory_pressure	= tcp_enter_memory_pressure,
1855	.sockets_allocated	= &tcp_sockets_allocated,
1856	.orphan_count		= &tcp_orphan_count,
1857	.memory_allocated	= &tcp_memory_allocated,
1858	.memory_pressure	= &tcp_memory_pressure,
1859	.sysctl_mem		= sysctl_tcp_mem,
1860	.sysctl_wmem		= sysctl_tcp_wmem,
1861	.sysctl_rmem		= sysctl_tcp_rmem,
1862	.max_header		= MAX_TCP_HEADER,
1863	.obj_size		= sizeof(struct tcp_sock),
1864	.twsk_prot		= &tcp_timewait_sock_ops,
1865	.rsk_prot		= &tcp_request_sock_ops,
1866#ifdef CONFIG_COMPAT
1867	.compat_setsockopt	= compat_tcp_setsockopt,
1868	.compat_getsockopt	= compat_tcp_getsockopt,
1869#endif
1870};
1871
1872void __init tcp_v4_init(struct net_proto_family *ops)
1873{
1874	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, IPPROTO_TCP) < 0)
1875		panic("Failed to create the TCP control socket.\n");
1876}
1877
1878EXPORT_SYMBOL(ipv4_specific);
1879EXPORT_SYMBOL(tcp_hashinfo);
1880EXPORT_SYMBOL(tcp_prot);
1881EXPORT_SYMBOL(tcp_unhash);
1882EXPORT_SYMBOL(tcp_v4_conn_request);
1883EXPORT_SYMBOL(tcp_v4_connect);
1884EXPORT_SYMBOL(tcp_v4_do_rcv);
1885EXPORT_SYMBOL(tcp_v4_remember_stamp);
1886EXPORT_SYMBOL(tcp_v4_send_check);
1887EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1888
1889#ifdef CONFIG_PROC_FS
1890EXPORT_SYMBOL(tcp_proc_register);
1891EXPORT_SYMBOL(tcp_proc_unregister);
1892#endif
1893EXPORT_SYMBOL(sysctl_local_port_range);
1894EXPORT_SYMBOL(sysctl_tcp_low_latency);
1895
1896