tcp_ipv4.c revision b9df3cb8cf9a96e63dfdcd3056a9cbc71f2459e7
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
47 *					coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81int sysctl_tcp_tw_reuse __read_mostly;
82int sysctl_tcp_low_latency __read_mostly;
83
84/* Check TCP sequence numbers in ICMP packets. */
85#define ICMP_MIN_LENGTH 8
86
87/* Socket used for sending RSTs */
88static struct socket *tcp_socket;
89
90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93	.lhash_lock	= __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
94	.lhash_users	= ATOMIC_INIT(0),
95	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96};
97
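/* Port allocation and (un)hashing: thin wrappers that delegate to the
 * generic inet connection-socket helpers, using the TCP hash tables.
 */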
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{
100	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101				 inet_csk_bind_conflict);
102}
103
104static void tcp_v4_hash(struct sock *sk)
105{
106	inet_hash(&tcp_hashinfo, sk);
107}
108
109void tcp_unhash(struct sock *sk)
110{
111	inet_unhash(&tcp_hashinfo, sk);
112}
113
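/* Derive the initial sequence number for an incoming connection from the
 * addresses and ports of the triggering segment.
 */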
114static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
115{
116	return secure_tcp_sequence_number(skb->nh.iph->daddr,
117					  skb->nh.iph->saddr,
118					  skb->h.th->dest,
119					  skb->h.th->source);
120}
121
122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123{
124	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125	struct tcp_sock *tp = tcp_sk(sk);
126
127	/* With PAWS, this is safe from the viewpoint
128	   of data integrity. Even without PAWS it is safe provided sequence
129	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
130
131	   Actually, the idea is close to VJ's: only the timestamp cache is
132	   held not per host but per port pair, and the TW bucket is used as
133	   the state holder.
134
135	   If the TW bucket has already been destroyed we fall back to VJ's
136	   scheme and use the initial timestamp retrieved from the peer table.
137	 */
138	if (tcptw->tw_ts_recent_stamp &&
139	    (twp == NULL || (sysctl_tcp_tw_reuse &&
140			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
141		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
142		if (tp->write_seq == 0)
143			tp->write_seq = 1;
144		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
145		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
146		sock_hold(sktw);
147		return 1;
148	}
149
150	return 0;
151}
152
153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154
155/* This will initiate an outgoing connection. */
156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157{
158	struct inet_sock *inet = inet_sk(sk);
159	struct tcp_sock *tp = tcp_sk(sk);
160	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161	struct rtable *rt;
162	__be32 daddr, nexthop;
163	int tmp;
164	int err;
165
166	if (addr_len < sizeof(struct sockaddr_in))
167		return -EINVAL;
168
169	if (usin->sin_family != AF_INET)
170		return -EAFNOSUPPORT;
171
172	nexthop = daddr = usin->sin_addr.s_addr;
173	if (inet->opt && inet->opt->srr) {
174		if (!daddr)
175			return -EINVAL;
176		nexthop = inet->opt->faddr;
177	}
178
179	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
180			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181			       IPPROTO_TCP,
182			       inet->sport, usin->sin_port, sk);
183	if (tmp < 0)
184		return tmp;
185
186	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187		ip_rt_put(rt);
188		return -ENETUNREACH;
189	}
190
191	if (!inet->opt || !inet->opt->srr)
192		daddr = rt->rt_dst;
193
194	if (!inet->saddr)
195		inet->saddr = rt->rt_src;
196	inet->rcv_saddr = inet->saddr;
197
198	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199		/* Reset inherited state */
200		tp->rx_opt.ts_recent	   = 0;
201		tp->rx_opt.ts_recent_stamp = 0;
202		tp->write_seq		   = 0;
203	}
204
205	if (tcp_death_row.sysctl_tw_recycle &&
206	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207		struct inet_peer *peer = rt_get_peer(rt);
208
209		/* VJ's idea. We save the last timestamp seen from
210		 * the destination in the peer table when entering TIME-WAIT state,
211		 * and initialize rx_opt.ts_recent from it when trying a new connection.
212		 */
213
214		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
215			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216			tp->rx_opt.ts_recent = peer->tcp_ts;
217		}
218	}
219
220	inet->dport = usin->sin_port;
221	inet->daddr = daddr;
222
223	inet_csk(sk)->icsk_ext_hdr_len = 0;
224	if (inet->opt)
225		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226
227	tp->rx_opt.mss_clamp = 536;
228
229	/* Socket identity is still unknown (sport may be zero).
230	 * However we set the state to SYN-SENT and, without releasing the
231	 * socket lock, select a source port, enter ourselves into the hash
232	 * tables and complete initialization after this.
233	 */
234	tcp_set_state(sk, TCP_SYN_SENT);
235	err = inet_hash_connect(&tcp_death_row, sk);
236	if (err)
237		goto failure;
238
239	err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
240	if (err)
241		goto failure;
242
243	/* OK, now commit destination to socket.  */
244	sk->sk_gso_type = SKB_GSO_TCPV4;
245	sk_setup_caps(sk, &rt->u.dst);
246
247	if (!tp->write_seq)
248		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
249							   inet->daddr,
250							   inet->sport,
251							   usin->sin_port);
252
253	inet->id = tp->write_seq ^ jiffies;
254
255	err = tcp_connect(sk);
256	rt = NULL;
257	if (err)
258		goto failure;
259
260	return 0;
261
262failure:
263	/* This unhashes the socket and releases the local port, if necessary. */
264	tcp_set_state(sk, TCP_CLOSE);
265	ip_rt_put(rt);
266	sk->sk_route_caps = 0;
267	inet->dport = 0;
268	return err;
269}
270
271/*
272 * This routine does path mtu discovery as defined in RFC1191.
273 */
274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275{
276	struct dst_entry *dst;
277	struct inet_sock *inet = inet_sk(sk);
278
279	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280	 * sent out by Linux are always < 576 bytes, so they should go through
281	 * unfragmented).
282	 */
283	if (sk->sk_state == TCP_LISTEN)
284		return;
285
286	/* We don't check in the dst entry whether PMTU discovery is forbidden
287	 * on this route. We just assume that no packet-too-big packets
288	 * are sent back when PMTU discovery is not active.
289	 * There is a small race when the user changes this flag in the
290	 * route, but I think that's acceptable.
291	 */
292	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293		return;
294
295	dst->ops->update_pmtu(dst, mtu);
296
297	/* Something is about to go wrong... Remember the soft error
298	 * for the case that this connection is not able to recover.
299	 */
300	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301		sk->sk_err_soft = EMSGSIZE;
302
303	mtu = dst_mtu(dst);
304
305	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307		tcp_sync_mss(sk, mtu);
308
309		/* Resend the TCP packet because it's
310		 * clear that the old packet has been
311		 * dropped. This is the new "fast" path mtu
312		 * discovery.
313		 */
314		tcp_simple_retransmit(sk);
315	} /* else let the usual retransmit timer handle it */
316}
317
318/*
319 * This routine is called by the ICMP module when it gets some
320 * sort of error condition.  If err < 0 then the socket should
321 * be closed and the error returned to the user.  If err > 0
322 * it's just the ICMP type << 8 | ICMP code.  After adjustment, the
323 * header points to the first 8 bytes of the TCP header.  We need
324 * to find the appropriate port.
325 *
326 * The locking strategy used here is very "optimistic". When
327 * someone else accesses the socket the ICMP is just dropped
328 * and for some paths there is no check at all.
329 * A more general error queue to queue errors for later handling
330 * is probably better.
331 *
332 */
333
334void tcp_v4_err(struct sk_buff *skb, u32 info)
335{
336	struct iphdr *iph = (struct iphdr *)skb->data;
337	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338	struct tcp_sock *tp;
339	struct inet_sock *inet;
340	int type = skb->h.icmph->type;
341	int code = skb->h.icmph->code;
342	struct sock *sk;
343	__u32 seq;
344	int err;
345
346	if (skb->len < (iph->ihl << 2) + 8) {
347		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
348		return;
349	}
350
351	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
352			 th->source, inet_iif(skb));
353	if (!sk) {
354		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
355		return;
356	}
357	if (sk->sk_state == TCP_TIME_WAIT) {
358		inet_twsk_put(inet_twsk(sk));
359		return;
360	}
361
362	bh_lock_sock(sk);
363	/* If too many ICMPs get dropped on busy
364	 * servers this needs to be solved differently.
365	 */
366	if (sock_owned_by_user(sk))
367		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
368
369	if (sk->sk_state == TCP_CLOSE)
370		goto out;
371
372	tp = tcp_sk(sk);
373	seq = ntohl(th->seq);
374	if (sk->sk_state != TCP_LISTEN &&
375	    !between(seq, tp->snd_una, tp->snd_nxt)) {
376		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
377		goto out;
378	}
379
380	switch (type) {
381	case ICMP_SOURCE_QUENCH:
382		/* Just silently ignore these. */
383		goto out;
384	case ICMP_PARAMETERPROB:
385		err = EPROTO;
386		break;
387	case ICMP_DEST_UNREACH:
388		if (code > NR_ICMP_UNREACH)
389			goto out;
390
391		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
392			if (!sock_owned_by_user(sk))
393				do_pmtu_discovery(sk, iph, info);
394			goto out;
395		}
396
397		err = icmp_err_convert[code].errno;
398		break;
399	case ICMP_TIME_EXCEEDED:
400		err = EHOSTUNREACH;
401		break;
402	default:
403		goto out;
404	}
405
406	switch (sk->sk_state) {
407		struct request_sock *req, **prev;
408	case TCP_LISTEN:
409		if (sock_owned_by_user(sk))
410			goto out;
411
412		req = inet_csk_search_req(sk, &prev, th->dest,
413					  iph->daddr, iph->saddr);
414		if (!req)
415			goto out;
416
417		/* ICMPs are not backlogged, hence we cannot get
418		   an established socket here.
419		 */
420		BUG_TRAP(!req->sk);
421
422		if (seq != tcp_rsk(req)->snt_isn) {
423			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
424			goto out;
425		}
426
427		/*
428		 * Still in SYN_RECV, just remove it silently.
429		 * There is no good way to pass the error to the newly
430		 * created socket, and POSIX does not want network
431		 * errors returned from accept().
432		 */
433		inet_csk_reqsk_queue_drop(sk, req, prev);
434		goto out;
435
436	case TCP_SYN_SENT:
437	case TCP_SYN_RECV:  /* Cannot happen normally.
438			       It can, for example, if SYNs crossed.
439			     */
440		if (!sock_owned_by_user(sk)) {
441			sk->sk_err = err;
442
443			sk->sk_error_report(sk);
444
445			tcp_done(sk);
446		} else {
447			sk->sk_err_soft = err;
448		}
449		goto out;
450	}
451
452	/* If we've already connected we will keep trying
453	 * until we time out, or the user gives up.
454	 *
455	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
456	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
457	 * but that is obsoleted by PMTU discovery).
458	 *
459	 * Note that in the modern internet, where routing is unreliable
460	 * and broken firewalls sit in every dark corner sending random
461	 * errors on behalf of their masters, even these two messages finally
462	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
463	 *
464	 * Now we are in compliance with RFCs.
465	 *							--ANK (980905)
466	 */
467
468	inet = inet_sk(sk);
469	if (!sock_owned_by_user(sk) && inet->recverr) {
470		sk->sk_err = err;
471		sk->sk_error_report(sk);
472	} else	{ /* Only an error on timeout */
473		sk->sk_err_soft = err;
474	}
475
476out:
477	bh_unlock_sock(sk);
478	sock_put(sk);
479}
480
481/* This routine computes an IPv4 TCP checksum. */
482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
483{
484	struct inet_sock *inet = inet_sk(sk);
485	struct tcphdr *th = skb->h.th;
486
487	if (skb->ip_summed == CHECKSUM_PARTIAL) {
488		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
489		skb->csum = offsetof(struct tcphdr, check);
490	} else {
491		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
492					 csum_partial((char *)th,
493						      th->doff << 2,
494						      skb->csum));
495	}
496}
497
498int tcp_v4_gso_send_check(struct sk_buff *skb)
499{
500	struct iphdr *iph;
501	struct tcphdr *th;
502
503	if (!pskb_may_pull(skb, sizeof(*th)))
504		return -EINVAL;
505
506	iph = skb->nh.iph;
507	th = skb->h.th;
508
509	th->check = 0;
510	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
511	skb->csum = offsetof(struct tcphdr, check);
512	skb->ip_summed = CHECKSUM_PARTIAL;
513	return 0;
514}
515
516/*
517 *	This routine will send an RST to the other tcp.
518 *
519 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
520 *		      for the reset?
521 *	Answer: if a packet caused an RST, it is not for a socket
522 *		existing in our system; if it is matched to a socket,
523 *		it is just a duplicate segment or a bug in the other
524 *		side's TCP.  So we build the reply based only on the
525 *		parameters that arrived with the segment.
526 *	Exception: precedence violation. We do not implement it in any case.
527 */
528
529static void tcp_v4_send_reset(struct sk_buff *skb)
530{
531	struct tcphdr *th = skb->h.th;
532	struct tcphdr rth;
533	struct ip_reply_arg arg;
534
535	/* Never send a reset in response to a reset. */
536	if (th->rst)
537		return;
538
539	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
540		return;
541
542	/* Swap the send and the receive. */
543	memset(&rth, 0, sizeof(struct tcphdr));
544	rth.dest   = th->source;
545	rth.source = th->dest;
546	rth.doff   = sizeof(struct tcphdr) / 4;
547	rth.rst    = 1;
548
549	if (th->ack) {
550		rth.seq = th->ack_seq;
551	} else {
552		rth.ack = 1;
553		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
554				    skb->len - (th->doff << 2));
555	}
556
557	memset(&arg, 0, sizeof arg);
558	arg.iov[0].iov_base = (unsigned char *)&rth;
559	arg.iov[0].iov_len  = sizeof rth;
560	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
561				      skb->nh.iph->saddr, /*XXX*/
562				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
563	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
564
565	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
566
567	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
568	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
569}
570
571/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
572   outside of socket context, is certainly ugly. What can I do?
573 */
574
575static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
576			    u32 win, u32 ts)
577{
578	struct tcphdr *th = skb->h.th;
579	struct {
580		struct tcphdr th;
581		u32 tsopt[TCPOLEN_TSTAMP_ALIGNED >> 2];
582	} rep;
583	struct ip_reply_arg arg;
584
585	memset(&rep.th, 0, sizeof(struct tcphdr));
586	memset(&arg, 0, sizeof arg);
587
588	arg.iov[0].iov_base = (unsigned char *)&rep;
589	arg.iov[0].iov_len  = sizeof(rep.th);
590	if (ts) {
591		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
592				     (TCPOPT_TIMESTAMP << 8) |
593				     TCPOLEN_TIMESTAMP);
594		rep.tsopt[1] = htonl(tcp_time_stamp);
595		rep.tsopt[2] = htonl(ts);
596		arg.iov[0].iov_len = sizeof(rep);
597	}
598
599	/* Swap the send and the receive. */
600	rep.th.dest    = th->source;
601	rep.th.source  = th->dest;
602	rep.th.doff    = arg.iov[0].iov_len / 4;
603	rep.th.seq     = htonl(seq);
604	rep.th.ack_seq = htonl(ack);
605	rep.th.ack     = 1;
606	rep.th.window  = htons(win);
607
608	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
609				      skb->nh.iph->saddr, /*XXX*/
610				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
611	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
612
613	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
614
615	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
616}
617
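/* ACK a segment that matched a TIME-WAIT socket, using the sequence and
 * timestamp state preserved in the timewait bucket.
 */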
618static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
619{
620	struct inet_timewait_sock *tw = inet_twsk(sk);
621	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
622
623	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
624			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
625
626	inet_twsk_put(tw);
627}
628
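/* ACK a segment that matched a pending open request (SYN-RECV state). */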
629static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
630{
631	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
632			req->ts_recent);
633}
634
635/*
636 *	Send a SYN-ACK after having received an ACK.
637 *	This still operates on a request_sock only, not on a big
638 *	socket.
639 */
640static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
641			      struct dst_entry *dst)
642{
643	const struct inet_request_sock *ireq = inet_rsk(req);
644	int err = -1;
645	struct sk_buff * skb;
646
647	/* First, grab a route. */
648	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
649		goto out;
650
651	skb = tcp_make_synack(sk, dst, req);
652
653	if (skb) {
654		struct tcphdr *th = skb->h.th;
655
656		th->check = tcp_v4_check(th, skb->len,
657					 ireq->loc_addr,
658					 ireq->rmt_addr,
659					 csum_partial((char *)th, skb->len,
660						      skb->csum));
661
662		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
663					    ireq->rmt_addr,
664					    ireq->opt);
665		err = net_xmit_eval(err);
666	}
667
668out:
669	dst_release(dst);
670	return err;
671}
672
673/*
674 *	IPv4 request_sock destructor.
675 */
676static void tcp_v4_reqsk_destructor(struct request_sock *req)
677{
678	kfree(inet_rsk(req)->opt);
679}
680
681#ifdef CONFIG_SYN_COOKIES
682static void syn_flood_warning(struct sk_buff *skb)
683{
684	static unsigned long warntime;
685
686	if (time_after(jiffies, (warntime + HZ * 60))) {
687		warntime = jiffies;
688		printk(KERN_INFO
689		       "possible SYN flooding on port %d. Sending cookies.\n",
690		       ntohs(skb->h.th->dest));
691	}
692}
693#endif
694
695/*
696 * Save and compile IPv4 options into the request_sock if needed.
697 */
698static struct ip_options *tcp_v4_save_options(struct sock *sk,
699					      struct sk_buff *skb)
700{
701	struct ip_options *opt = &(IPCB(skb)->opt);
702	struct ip_options *dopt = NULL;
703
704	if (opt && opt->optlen) {
705		int opt_size = optlength(opt);
706		dopt = kmalloc(opt_size, GFP_ATOMIC);
707		if (dopt) {
708			if (ip_options_echo(dopt, skb)) {
709				kfree(dopt);
710				dopt = NULL;
711			}
712		}
713	}
714	return dopt;
715}
716
717struct request_sock_ops tcp_request_sock_ops __read_mostly = {
718	.family		=	PF_INET,
719	.obj_size	=	sizeof(struct tcp_request_sock),
720	.rtx_syn_ack	=	tcp_v4_send_synack,
721	.send_ack	=	tcp_v4_reqsk_send_ack,
722	.destructor	=	tcp_v4_reqsk_destructor,
723	.send_reset	=	tcp_v4_send_reset,
724};
725
726static struct timewait_sock_ops tcp_timewait_sock_ops = {
727	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
728	.twsk_unique	= tcp_twsk_unique,
729};
730
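/* Handle an incoming SYN on a listening socket: allocate an open request,
 * record the peer's options and, unless syncookies are in use, queue it
 * while the SYN-ACK is sent.
 */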
731int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
732{
733	struct inet_request_sock *ireq;
734	struct tcp_options_received tmp_opt;
735	struct request_sock *req;
736	__be32 saddr = skb->nh.iph->saddr;
737	__be32 daddr = skb->nh.iph->daddr;
738	__u32 isn = TCP_SKB_CB(skb)->when;
739	struct dst_entry *dst = NULL;
740#ifdef CONFIG_SYN_COOKIES
741	int want_cookie = 0;
742#else
743#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
744#endif
745
746	/* Never answer SYNs sent to broadcast or multicast addresses. */
747	if (((struct rtable *)skb->dst)->rt_flags &
748	    (RTCF_BROADCAST | RTCF_MULTICAST))
749		goto drop;
750
751	/* TW buckets are converted to open requests without
752	 * limitation: they conserve resources and the peer is
753	 * evidently a real one.
754	 */
755	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
756#ifdef CONFIG_SYN_COOKIES
757		if (sysctl_tcp_syncookies) {
758			want_cookie = 1;
759		} else
760#endif
761		goto drop;
762	}
763
764	/* The accept backlog is full. If we have already queued enough
765	 * warm entries in the SYN queue, drop the request. That is better
766	 * than clogging the SYN queue with openreqs whose timeouts increase
767	 * exponentially.
768	 */
769	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
770		goto drop;
771
772	req = reqsk_alloc(&tcp_request_sock_ops);
773	if (!req)
774		goto drop;
775
776	tcp_clear_options(&tmp_opt);
777	tmp_opt.mss_clamp = 536;
778	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
779
780	tcp_parse_options(skb, &tmp_opt, 0);
781
782	if (want_cookie) {
783		tcp_clear_options(&tmp_opt);
784		tmp_opt.saw_tstamp = 0;
785	}
786
787	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
788		/* Some OSes (unknown ones, but I see them on a web server that
789		 * carries information interesting only to Windows users) do not
790		 * send their timestamp in the SYN. That is the easy case:
791		 * we simply do not advertise TS support.
792		 */
793		tmp_opt.saw_tstamp = 0;
794		tmp_opt.tstamp_ok  = 0;
795	}
796	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
797
798	tcp_openreq_init(req, &tmp_opt, skb);
799
800	if (security_inet_conn_request(sk, skb, req))
801		goto drop_and_free;
802
803	ireq = inet_rsk(req);
804	ireq->loc_addr = daddr;
805	ireq->rmt_addr = saddr;
806	ireq->opt = tcp_v4_save_options(sk, skb);
807	if (!want_cookie)
808		TCP_ECN_create_request(req, skb->h.th);
809
810	if (want_cookie) {
811#ifdef CONFIG_SYN_COOKIES
812		syn_flood_warning(skb);
813#endif
814		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
815	} else if (!isn) {
816		struct inet_peer *peer = NULL;
817
818		/* VJ's idea. We save the last timestamp seen
819		 * from the destination in the peer table when entering
820		 * TIME-WAIT state, and check against it before
821		 * accepting a new connection request.
822		 *
823		 * If "isn" is not zero, this request hit a live
824		 * timewait bucket, so all the necessary checks
825		 * are made in the function processing the timewait state.
826		 */
827		if (tmp_opt.saw_tstamp &&
828		    tcp_death_row.sysctl_tw_recycle &&
829		    (dst = inet_csk_route_req(sk, req)) != NULL &&
830		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
831		    peer->v4daddr == saddr) {
832			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
833			    (s32)(peer->tcp_ts - req->ts_recent) >
834							TCP_PAWS_WINDOW) {
835				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
836				dst_release(dst);
837				goto drop_and_free;
838			}
839		}
840		/* Kill the following clause, if you dislike this way. */
841		else if (!sysctl_tcp_syncookies &&
842			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
843			  (sysctl_max_syn_backlog >> 2)) &&
844			 (!peer || !peer->tcp_ts_stamp) &&
845			 (!dst || !dst_metric(dst, RTAX_RTT))) {
846			/* Without syncookies the last quarter of
847			 * the backlog is reserved for destinations
848			 * proven to be alive.
849			 * It means that we keep communicating with
850			 * destinations already remembered at the
851			 * moment the SYN flood started.
852			 */
853			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
854				       "request from %u.%u.%u.%u/%u\n",
855				       NIPQUAD(saddr),
856				       ntohs(skb->h.th->source));
857			dst_release(dst);
858			goto drop_and_free;
859		}
860
861		isn = tcp_v4_init_sequence(skb);
862	}
863	tcp_rsk(req)->snt_isn = isn;
864
865	if (tcp_v4_send_synack(sk, req, dst))
866		goto drop_and_free;
867
868	if (want_cookie) {
869	   	reqsk_free(req);
870	} else {
871		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
872	}
873	return 0;
874
875drop_and_free:
876	reqsk_free(req);
877drop:
878	return 0;
879}
880
881
882/*
883 * The three way handshake has completed - we got a valid synack -
884 * now create the new socket.
885 */
886struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
887				  struct request_sock *req,
888				  struct dst_entry *dst)
889{
890	struct inet_request_sock *ireq;
891	struct inet_sock *newinet;
892	struct tcp_sock *newtp;
893	struct sock *newsk;
894
895	if (sk_acceptq_is_full(sk))
896		goto exit_overflow;
897
898	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
899		goto exit;
900
901	newsk = tcp_create_openreq_child(sk, req, skb);
902	if (!newsk)
903		goto exit;
904
905	newsk->sk_gso_type = SKB_GSO_TCPV4;
906	sk_setup_caps(newsk, dst);
907
908	newtp		      = tcp_sk(newsk);
909	newinet		      = inet_sk(newsk);
910	ireq		      = inet_rsk(req);
911	newinet->daddr	      = ireq->rmt_addr;
912	newinet->rcv_saddr    = ireq->loc_addr;
913	newinet->saddr	      = ireq->loc_addr;
914	newinet->opt	      = ireq->opt;
915	ireq->opt	      = NULL;
916	newinet->mc_index     = inet_iif(skb);
917	newinet->mc_ttl	      = skb->nh.iph->ttl;
918	inet_csk(newsk)->icsk_ext_hdr_len = 0;
919	if (newinet->opt)
920		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
921	newinet->id = newtp->write_seq ^ jiffies;
922
923	tcp_mtup_init(newsk);
924	tcp_sync_mss(newsk, dst_mtu(dst));
925	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
926	tcp_initialize_rcv_mss(newsk);
927
928	__inet_hash(&tcp_hashinfo, newsk, 0);
929	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
930
931	return newsk;
932
933exit_overflow:
934	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
935exit:
936	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
937	dst_release(dst);
938	return NULL;
939}
940
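/* A segment arrived on a listening socket: look for a matching open
 * request or an already established child socket, falling back to
 * syncookie validation of a bare ACK.
 */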
941static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
942{
943	struct tcphdr *th = skb->h.th;
944	struct iphdr *iph = skb->nh.iph;
945	struct sock *nsk;
946	struct request_sock **prev;
947	/* Find possible connection requests. */
948	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
949						       iph->saddr, iph->daddr);
950	if (req)
951		return tcp_check_req(sk, skb, req, prev);
952
953	nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
954				      th->source, skb->nh.iph->daddr,
955				      th->dest, inet_iif(skb));
956
957	if (nsk) {
958		if (nsk->sk_state != TCP_TIME_WAIT) {
959			bh_lock_sock(nsk);
960			return nsk;
961		}
962		inet_twsk_put(inet_twsk(nsk));
963		return NULL;
964	}
965
966#ifdef CONFIG_SYN_COOKIES
967	if (!th->rst && !th->syn && th->ack)
968		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
969#endif
970	return sk;
971}
972
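/* Initialize or verify the TCP checksum on receive.  Hardware-verified and
 * short packets are checked here; for longer packets the final check is
 * deferred until the data is copied.
 */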
973static int tcp_v4_checksum_init(struct sk_buff *skb)
974{
975	if (skb->ip_summed == CHECKSUM_COMPLETE) {
976		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
977				  skb->nh.iph->daddr, skb->csum)) {
978			skb->ip_summed = CHECKSUM_UNNECESSARY;
979			return 0;
980		}
981	}
982
983	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
984				       skb->len, IPPROTO_TCP, 0);
985
986	if (skb->len <= 76) {
987		return __skb_checksum_complete(skb);
988	}
989	return 0;
990}
991
992
993/* The socket must have its spinlock held when we get
994 * here.
995 *
996 * We have a potential double-lock case here, so even when
997 * doing backlog processing we use the BH locking scheme.
998 * This is because we cannot sleep with the original spinlock
999 * held.
1000 */
1001int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1002{
1003	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1004		TCP_CHECK_TIMER(sk);
1005		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1006			goto reset;
1007		TCP_CHECK_TIMER(sk);
1008		return 0;
1009	}
1010
1011	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1012		goto csum_err;
1013
1014	if (sk->sk_state == TCP_LISTEN) {
1015		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1016		if (!nsk)
1017			goto discard;
1018
1019		if (nsk != sk) {
1020			if (tcp_child_process(sk, nsk, skb))
1021				goto reset;
1022			return 0;
1023		}
1024	}
1025
1026	TCP_CHECK_TIMER(sk);
1027	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1028		goto reset;
1029	TCP_CHECK_TIMER(sk);
1030	return 0;
1031
1032reset:
1033	tcp_v4_send_reset(skb);
1034discard:
1035	kfree_skb(skb);
1036	/* Be careful here. If this function gets more complicated and
1037	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1038	 * might be destroyed here. This current version compiles correctly,
1039	 * but you have been warned.
1040	 */
1041	return 0;
1042
1043csum_err:
1044	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1045	goto discard;
1046}
1047
1048/*
1049 *	From tcp_input.c
1050 */
1051
1052int tcp_v4_rcv(struct sk_buff *skb)
1053{
1054	struct tcphdr *th;
1055	struct sock *sk;
1056	int ret;
1057
1058	if (skb->pkt_type != PACKET_HOST)
1059		goto discard_it;
1060
1061	/* Count it even if it's bad */
1062	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1063
1064	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1065		goto discard_it;
1066
1067	th = skb->h.th;
1068
1069	if (th->doff < sizeof(struct tcphdr) / 4)
1070		goto bad_packet;
1071	if (!pskb_may_pull(skb, th->doff * 4))
1072		goto discard_it;
1073
1074	/* An explanation is required here, I think.
1075	 * Packet length and doff are validated by header prediction,
1076	 * provided the case of th->doff==0 is eliminated.
1077	 * So, we defer the checks. */
1078	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1079	     tcp_v4_checksum_init(skb)))
1080		goto bad_packet;
1081
1082	th = skb->h.th;
1083	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1084	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1085				    skb->len - th->doff * 4);
1086	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1087	TCP_SKB_CB(skb)->when	 = 0;
1088	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1089	TCP_SKB_CB(skb)->sacked	 = 0;
1090
1091	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1092			   skb->nh.iph->daddr, th->dest,
1093			   inet_iif(skb));
1094
1095	if (!sk)
1096		goto no_tcp_socket;
1097
1098process:
1099	if (sk->sk_state == TCP_TIME_WAIT)
1100		goto do_time_wait;
1101
1102	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1103		goto discard_and_relse;
1104	nf_reset(skb);
1105
1106	if (sk_filter(sk, skb))
1107		goto discard_and_relse;
1108
1109	skb->dev = NULL;
1110
1111	bh_lock_sock_nested(sk);
1112	ret = 0;
1113	if (!sock_owned_by_user(sk)) {
1114#ifdef CONFIG_NET_DMA
1115		struct tcp_sock *tp = tcp_sk(sk);
1116		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1117			tp->ucopy.dma_chan = get_softnet_dma();
1118		if (tp->ucopy.dma_chan)
1119			ret = tcp_v4_do_rcv(sk, skb);
1120		else
1121#endif
1122		{
1123			if (!tcp_prequeue(sk, skb))
1124			ret = tcp_v4_do_rcv(sk, skb);
1125		}
1126	} else
1127		sk_add_backlog(sk, skb);
1128	bh_unlock_sock(sk);
1129
1130	sock_put(sk);
1131
1132	return ret;
1133
1134no_tcp_socket:
1135	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1136		goto discard_it;
1137
1138	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1139bad_packet:
1140		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1141	} else {
1142		tcp_v4_send_reset(skb);
1143	}
1144
1145discard_it:
1146	/* Discard frame. */
1147	kfree_skb(skb);
1148  	return 0;
1149
1150discard_and_relse:
1151	sock_put(sk);
1152	goto discard_it;
1153
1154do_time_wait:
1155	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1156		inet_twsk_put(inet_twsk(sk));
1157		goto discard_it;
1158	}
1159
1160	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1161		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1162		inet_twsk_put(inet_twsk(sk));
1163		goto discard_it;
1164	}
1165	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1166	case TCP_TW_SYN: {
1167		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1168							skb->nh.iph->daddr,
1169							th->dest,
1170							inet_iif(skb));
1171		if (sk2) {
1172			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1173			inet_twsk_put(inet_twsk(sk));
1174			sk = sk2;
1175			goto process;
1176		}
1177		/* Fall through to ACK */
1178	}
1179	case TCP_TW_ACK:
1180		tcp_v4_timewait_ack(sk, skb);
1181		break;
1182	case TCP_TW_RST:
1183		goto no_tcp_socket;
1184	case TCP_TW_SUCCESS:;
1185	}
1186	goto discard_it;
1187}
1188
1189/* VJ's idea. Save the last timestamp seen from this destination
1190 * and hold it at least for the normal timewait interval, to use for
1191 * duplicate segment detection in subsequent connections before they enter
1192 * the synchronized state.
1193 */
1194
1195int tcp_v4_remember_stamp(struct sock *sk)
1196{
1197	struct inet_sock *inet = inet_sk(sk);
1198	struct tcp_sock *tp = tcp_sk(sk);
1199	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1200	struct inet_peer *peer = NULL;
1201	int release_it = 0;
1202
1203	if (!rt || rt->rt_dst != inet->daddr) {
1204		peer = inet_getpeer(inet->daddr, 1);
1205		release_it = 1;
1206	} else {
1207		if (!rt->peer)
1208			rt_bind_peer(rt, 1);
1209		peer = rt->peer;
1210	}
1211
1212	if (peer) {
1213		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1214		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1215		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1216			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1217			peer->tcp_ts = tp->rx_opt.ts_recent;
1218		}
1219		if (release_it)
1220			inet_putpeer(peer);
1221		return 1;
1222	}
1223
1224	return 0;
1225}
1226
1227int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1228{
1229	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1230
1231	if (peer) {
1232		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1233
1234		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1235		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1236		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1237			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1238			peer->tcp_ts	   = tcptw->tw_ts_recent;
1239		}
1240		inet_putpeer(peer);
1241		return 1;
1242	}
1243
1244	return 0;
1245}
1246
1247struct inet_connection_sock_af_ops ipv4_specific = {
1248	.queue_xmit	   = ip_queue_xmit,
1249	.send_check	   = tcp_v4_send_check,
1250	.rebuild_header	   = inet_sk_rebuild_header,
1251	.conn_request	   = tcp_v4_conn_request,
1252	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1253	.remember_stamp	   = tcp_v4_remember_stamp,
1254	.net_header_len	   = sizeof(struct iphdr),
1255	.setsockopt	   = ip_setsockopt,
1256	.getsockopt	   = ip_getsockopt,
1257	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1258	.sockaddr_len	   = sizeof(struct sockaddr_in),
1259#ifdef CONFIG_COMPAT
1260	.compat_setsockopt = compat_ip_setsockopt,
1261	.compat_getsockopt = compat_ip_getsockopt,
1262#endif
1263};
1264
1265/* NOTE: A lot of things are set to zero explicitly by the call to
1266 *       sk_alloc(), so they need not be done here.
1267 */
1268static int tcp_v4_init_sock(struct sock *sk)
1269{
1270	struct inet_connection_sock *icsk = inet_csk(sk);
1271	struct tcp_sock *tp = tcp_sk(sk);
1272
1273	skb_queue_head_init(&tp->out_of_order_queue);
1274	tcp_init_xmit_timers(sk);
1275	tcp_prequeue_init(tp);
1276
1277	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1278	tp->mdev = TCP_TIMEOUT_INIT;
1279
1280	/* So many TCP implementations out there (incorrectly) count the
1281	 * initial SYN frame in their delayed-ACK and congestion control
1282	 * algorithms that we must have the following bandaid to talk
1283	 * efficiently to them.  -DaveM
1284	 */
1285	tp->snd_cwnd = 2;
1286
1287	/* See draft-stevens-tcpca-spec-01 for discussion of the
1288	 * initialization of these values.
1289	 */
1290	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1291	tp->snd_cwnd_clamp = ~0;
1292	tp->mss_cache = 536;
1293
1294	tp->reordering = sysctl_tcp_reordering;
1295	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1296
1297	sk->sk_state = TCP_CLOSE;
1298
1299	sk->sk_write_space = sk_stream_write_space;
1300	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1301
1302	icsk->icsk_af_ops = &ipv4_specific;
1303	icsk->icsk_sync_mss = tcp_sync_mss;
1304
1305	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1306	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1307
1308	atomic_inc(&tcp_sockets_allocated);
1309
1310	return 0;
1311}
1312
1313int tcp_v4_destroy_sock(struct sock *sk)
1314{
1315	struct tcp_sock *tp = tcp_sk(sk);
1316
1317	tcp_clear_xmit_timers(sk);
1318
1319	tcp_cleanup_congestion_control(sk);
1320
1321	/* Clean up the write buffer. */
1322  	sk_stream_writequeue_purge(sk);
1323
1324	/* Cleans up our, hopefully empty, out_of_order_queue. */
1325  	__skb_queue_purge(&tp->out_of_order_queue);
1326
1327#ifdef CONFIG_NET_DMA
1328	/* Cleans up our sk_async_wait_queue */
1329  	__skb_queue_purge(&sk->sk_async_wait_queue);
1330#endif
1331
1332	/* Clean the prequeue; it really must be empty. */
1333	__skb_queue_purge(&tp->ucopy.prequeue);
1334
1335	/* Clean up a referenced TCP bind bucket. */
1336	if (inet_csk(sk)->icsk_bind_hash)
1337		inet_put_port(&tcp_hashinfo, sk);
1338
1339	/*
1340	 * If sendmsg cached page exists, toss it.
1341	 */
1342	if (sk->sk_sndmsg_page) {
1343		__free_page(sk->sk_sndmsg_page);
1344		sk->sk_sndmsg_page = NULL;
1345	}
1346
1347	atomic_dec(&tcp_sockets_allocated);
1348
1349	return 0;
1350}
1351
1352EXPORT_SYMBOL(tcp_v4_destroy_sock);
1353
1354#ifdef CONFIG_PROC_FS
1355/* Proc filesystem TCP sock list dumping. */
1356
1357static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1358{
1359	return hlist_empty(head) ? NULL :
1360		list_entry(head->first, struct inet_timewait_sock, tw_node);
1361}
1362
1363static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1364{
1365	return tw->tw_node.next ?
1366		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1367}
1368
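/* Advance the iterator over the listening hash, descending into each
 * listener's SYN (open request) queue along the way.
 */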
1369static void *listening_get_next(struct seq_file *seq, void *cur)
1370{
1371	struct inet_connection_sock *icsk;
1372	struct hlist_node *node;
1373	struct sock *sk = cur;
1374	struct tcp_iter_state* st = seq->private;
1375
1376	if (!sk) {
1377		st->bucket = 0;
1378		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1379		goto get_sk;
1380	}
1381
1382	++st->num;
1383
1384	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1385		struct request_sock *req = cur;
1386
1387		icsk = inet_csk(st->syn_wait_sk);
1388		req = req->dl_next;
1389		while (1) {
1390			while (req) {
1391				if (req->rsk_ops->family == st->family) {
1392					cur = req;
1393					goto out;
1394				}
1395				req = req->dl_next;
1396			}
1397			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1398				break;
1399get_req:
1400			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1401		}
1402		sk	  = sk_next(st->syn_wait_sk);
1403		st->state = TCP_SEQ_STATE_LISTENING;
1404		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1405	} else {
1406	       	icsk = inet_csk(sk);
1407		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1408		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1409			goto start_req;
1410		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1411		sk = sk_next(sk);
1412	}
1413get_sk:
1414	sk_for_each_from(sk, node) {
1415		if (sk->sk_family == st->family) {
1416			cur = sk;
1417			goto out;
1418		}
1419	       	icsk = inet_csk(sk);
1420		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1421		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1422start_req:
1423			st->uid		= sock_i_uid(sk);
1424			st->syn_wait_sk = sk;
1425			st->state	= TCP_SEQ_STATE_OPENREQ;
1426			st->sbucket	= 0;
1427			goto get_req;
1428		}
1429		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1430	}
1431	if (++st->bucket < INET_LHTABLE_SIZE) {
1432		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1433		goto get_sk;
1434	}
1435	cur = NULL;
1436out:
1437	return cur;
1438}
1439
1440static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1441{
1442	void *rc = listening_get_next(seq, NULL);
1443
1444	while (rc && *pos) {
1445		rc = listening_get_next(seq, rc);
1446		--*pos;
1447	}
1448	return rc;
1449}
1450
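/* Find the first socket in the established hash; TIME-WAIT sockets live
 * in the second half of the same table.
 */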
1451static void *established_get_first(struct seq_file *seq)
1452{
1453	struct tcp_iter_state* st = seq->private;
1454	void *rc = NULL;
1455
1456	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1457		struct sock *sk;
1458		struct hlist_node *node;
1459		struct inet_timewait_sock *tw;
1460
1461		/* We can reschedule _before_ having picked the target: */
1462		cond_resched_softirq();
1463
1464		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1465		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1466			if (sk->sk_family != st->family) {
1467				continue;
1468			}
1469			rc = sk;
1470			goto out;
1471		}
1472		st->state = TCP_SEQ_STATE_TIME_WAIT;
1473		inet_twsk_for_each(tw, node,
1474				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1475			if (tw->tw_family != st->family) {
1476				continue;
1477			}
1478			rc = tw;
1479			goto out;
1480		}
1481		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1482		st->state = TCP_SEQ_STATE_ESTABLISHED;
1483	}
1484out:
1485	return rc;
1486}
1487
1488static void *established_get_next(struct seq_file *seq, void *cur)
1489{
1490	struct sock *sk = cur;
1491	struct inet_timewait_sock *tw;
1492	struct hlist_node *node;
1493	struct tcp_iter_state* st = seq->private;
1494
1495	++st->num;
1496
1497	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1498		tw = cur;
1499		tw = tw_next(tw);
1500get_tw:
1501		while (tw && tw->tw_family != st->family) {
1502			tw = tw_next(tw);
1503		}
1504		if (tw) {
1505			cur = tw;
1506			goto out;
1507		}
1508		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1509		st->state = TCP_SEQ_STATE_ESTABLISHED;
1510
1511		/* We can reschedule between buckets: */
1512		cond_resched_softirq();
1513
1514		if (++st->bucket < tcp_hashinfo.ehash_size) {
1515			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1516			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1517		} else {
1518			cur = NULL;
1519			goto out;
1520		}
1521	} else
1522		sk = sk_next(sk);
1523
1524	sk_for_each_from(sk, node) {
1525		if (sk->sk_family == st->family)
1526			goto found;
1527	}
1528
1529	st->state = TCP_SEQ_STATE_TIME_WAIT;
1530	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1531	goto get_tw;
1532found:
1533	cur = sk;
1534out:
1535	return cur;
1536}
1537
1538static void *established_get_idx(struct seq_file *seq, loff_t pos)
1539{
1540	void *rc = established_get_first(seq);
1541
1542	while (rc && pos) {
1543		rc = established_get_next(seq, rc);
1544		--pos;
1545	}
1546	return rc;
1547}
1548
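/* Position the iterator at entry 'pos': listening sockets come first,
 * then the established and TIME-WAIT tables.
 */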
1549static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1550{
1551	void *rc;
1552	struct tcp_iter_state* st = seq->private;
1553
1554	inet_listen_lock(&tcp_hashinfo);
1555	st->state = TCP_SEQ_STATE_LISTENING;
1556	rc	  = listening_get_idx(seq, &pos);
1557
1558	if (!rc) {
1559		inet_listen_unlock(&tcp_hashinfo);
1560		local_bh_disable();
1561		st->state = TCP_SEQ_STATE_ESTABLISHED;
1562		rc	  = established_get_idx(seq, pos);
1563	}
1564
1565	return rc;
1566}
1567
1568static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1569{
1570	struct tcp_iter_state* st = seq->private;
1571	st->state = TCP_SEQ_STATE_LISTENING;
1572	st->num = 0;
1573	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1574}
1575
1576static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1577{
1578	void *rc = NULL;
1579	struct tcp_iter_state* st;
1580
1581	if (v == SEQ_START_TOKEN) {
1582		rc = tcp_get_idx(seq, 0);
1583		goto out;
1584	}
1585	st = seq->private;
1586
1587	switch (st->state) {
1588	case TCP_SEQ_STATE_OPENREQ:
1589	case TCP_SEQ_STATE_LISTENING:
1590		rc = listening_get_next(seq, v);
1591		if (!rc) {
1592			inet_listen_unlock(&tcp_hashinfo);
1593			local_bh_disable();
1594			st->state = TCP_SEQ_STATE_ESTABLISHED;
1595			rc	  = established_get_first(seq);
1596		}
1597		break;
1598	case TCP_SEQ_STATE_ESTABLISHED:
1599	case TCP_SEQ_STATE_TIME_WAIT:
1600		rc = established_get_next(seq, v);
1601		break;
1602	}
1603out:
1604	++*pos;
1605	return rc;
1606}
1607
1608static void tcp_seq_stop(struct seq_file *seq, void *v)
1609{
1610	struct tcp_iter_state* st = seq->private;
1611
1612	switch (st->state) {
1613	case TCP_SEQ_STATE_OPENREQ:
1614		if (v) {
1615			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1616			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1617		}
1618	case TCP_SEQ_STATE_LISTENING:
1619		if (v != SEQ_START_TOKEN)
1620			inet_listen_unlock(&tcp_hashinfo);
1621		break;
1622	case TCP_SEQ_STATE_TIME_WAIT:
1623	case TCP_SEQ_STATE_ESTABLISHED:
1624		if (v)
1625			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1626		local_bh_enable();
1627		break;
1628	}
1629}
1630
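/* seq_file open handler shared by all /proc/net/tcp-style files: allocate
 * per-file iterator state and hook up the seq operations.
 */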
1631static int tcp_seq_open(struct inode *inode, struct file *file)
1632{
1633	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1634	struct seq_file *seq;
1635	struct tcp_iter_state *s;
1636	int rc;
1637
1638	if (unlikely(afinfo == NULL))
1639		return -EINVAL;
1640
1641	s = kzalloc(sizeof(*s), GFP_KERNEL);
1642	if (!s)
1643		return -ENOMEM;
1644	s->family		= afinfo->family;
1645	s->seq_ops.start	= tcp_seq_start;
1646	s->seq_ops.next		= tcp_seq_next;
1647	s->seq_ops.show		= afinfo->seq_show;
1648	s->seq_ops.stop		= tcp_seq_stop;
1649
1650	rc = seq_open(file, &s->seq_ops);
1651	if (rc)
1652		goto out_kfree;
1653	seq	     = file->private_data;
1654	seq->private = s;
1655out:
1656	return rc;
1657out_kfree:
1658	kfree(s);
1659	goto out;
1660}
1661
1662int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1663{
1664	int rc = 0;
1665	struct proc_dir_entry *p;
1666
1667	if (!afinfo)
1668		return -EINVAL;
1669	afinfo->seq_fops->owner		= afinfo->owner;
1670	afinfo->seq_fops->open		= tcp_seq_open;
1671	afinfo->seq_fops->read		= seq_read;
1672	afinfo->seq_fops->llseek	= seq_lseek;
1673	afinfo->seq_fops->release	= seq_release_private;
1674
1675	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1676	if (p)
1677		p->data = afinfo;
1678	else
1679		rc = -ENOMEM;
1680	return rc;
1681}
1682
1683void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1684{
1685	if (!afinfo)
1686		return;
1687	proc_net_remove(afinfo->name);
1688	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1689}
1690
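/* Format one open request (SYN-RECV) as a /proc/net/tcp line. */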
1691static void get_openreq4(struct sock *sk, struct request_sock *req,
1692			 char *tmpbuf, int i, int uid)
1693{
1694	const struct inet_request_sock *ireq = inet_rsk(req);
1695	int ttd = req->expires - jiffies;
1696
1697	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1698		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1699		i,
1700		ireq->loc_addr,
1701		ntohs(inet_sk(sk)->sport),
1702		ireq->rmt_addr,
1703		ntohs(ireq->rmt_port),
1704		TCP_SYN_RECV,
1705		0, 0, /* could print option size, but that is af dependent. */
1706		1,    /* timers active (only the expire timer) */
1707		jiffies_to_clock_t(ttd),
1708		req->retrans,
1709		uid,
1710		0,  /* non standard timer */
1711		0, /* open_requests have no inode */
1712		atomic_read(&sk->sk_refcnt),
1713		req);
1714}
1715
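/* Format one full TCP socket as a /proc/net/tcp line. */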
1716static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1717{
1718	int timer_active;
1719	unsigned long timer_expires;
1720	struct tcp_sock *tp = tcp_sk(sp);
1721	const struct inet_connection_sock *icsk = inet_csk(sp);
1722	struct inet_sock *inet = inet_sk(sp);
1723	unsigned int dest = inet->daddr;
1724	unsigned int src = inet->rcv_saddr;
1725	__u16 destp = ntohs(inet->dport);
1726	__u16 srcp = ntohs(inet->sport);
1727
1728	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1729		timer_active	= 1;
1730		timer_expires	= icsk->icsk_timeout;
1731	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1732		timer_active	= 4;
1733		timer_expires	= icsk->icsk_timeout;
1734	} else if (timer_pending(&sp->sk_timer)) {
1735		timer_active	= 2;
1736		timer_expires	= sp->sk_timer.expires;
1737	} else {
1738		timer_active	= 0;
1739		timer_expires = jiffies;
1740	}
1741
1742	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1743			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1744		i, src, srcp, dest, destp, sp->sk_state,
1745		tp->write_seq - tp->snd_una,
1746		(sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
1747		timer_active,
1748		jiffies_to_clock_t(timer_expires - jiffies),
1749		icsk->icsk_retransmits,
1750		sock_i_uid(sp),
1751		icsk->icsk_probes_out,
1752		sock_i_ino(sp),
1753		atomic_read(&sp->sk_refcnt), sp,
1754		icsk->icsk_rto,
1755		icsk->icsk_ack.ato,
1756		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1757		tp->snd_cwnd,
1758		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1759}
1760
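/* Format one TIME-WAIT socket as a /proc/net/tcp line. */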
1761static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1762{
1763	__be32 dest, src;
1764	__u16 destp, srcp;
1765	int ttd = tw->tw_ttd - jiffies;
1766
1767	if (ttd < 0)
1768		ttd = 0;
1769
1770	dest  = tw->tw_daddr;
1771	src   = tw->tw_rcv_saddr;
1772	destp = ntohs(tw->tw_dport);
1773	srcp  = ntohs(tw->tw_sport);
1774
1775	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1776		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1777		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1778		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1779		atomic_read(&tw->tw_refcnt), tw);
1780}
1781
1782#define TMPSZ 150
1783
1784static int tcp4_seq_show(struct seq_file *seq, void *v)
1785{
1786	struct tcp_iter_state* st;
1787	char tmpbuf[TMPSZ + 1];
1788
1789	if (v == SEQ_START_TOKEN) {
1790		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1791			   "  sl  local_address rem_address   st tx_queue "
1792			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1793			   "inode");
1794		goto out;
1795	}
1796	st = seq->private;
1797
1798	switch (st->state) {
1799	case TCP_SEQ_STATE_LISTENING:
1800	case TCP_SEQ_STATE_ESTABLISHED:
1801		get_tcp4_sock(v, tmpbuf, st->num);
1802		break;
1803	case TCP_SEQ_STATE_OPENREQ:
1804		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1805		break;
1806	case TCP_SEQ_STATE_TIME_WAIT:
1807		get_timewait4_sock(v, tmpbuf, st->num);
1808		break;
1809	}
1810	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1811out:
1812	return 0;
1813}
1814
1815static struct file_operations tcp4_seq_fops;
1816static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1817	.owner		= THIS_MODULE,
1818	.name		= "tcp",
1819	.family		= AF_INET,
1820	.seq_show	= tcp4_seq_show,
1821	.seq_fops	= &tcp4_seq_fops,
1822};
1823
1824int __init tcp4_proc_init(void)
1825{
1826	return tcp_proc_register(&tcp4_seq_afinfo);
1827}
1828
1829void tcp4_proc_exit(void)
1830{
1831	tcp_proc_unregister(&tcp4_seq_afinfo);
1832}
1833#endif /* CONFIG_PROC_FS */
1834
1835struct proto tcp_prot = {
1836	.name			= "TCP",
1837	.owner			= THIS_MODULE,
1838	.close			= tcp_close,
1839	.connect		= tcp_v4_connect,
1840	.disconnect		= tcp_disconnect,
1841	.accept			= inet_csk_accept,
1842	.ioctl			= tcp_ioctl,
1843	.init			= tcp_v4_init_sock,
1844	.destroy		= tcp_v4_destroy_sock,
1845	.shutdown		= tcp_shutdown,
1846	.setsockopt		= tcp_setsockopt,
1847	.getsockopt		= tcp_getsockopt,
1848	.sendmsg		= tcp_sendmsg,
1849	.recvmsg		= tcp_recvmsg,
1850	.backlog_rcv		= tcp_v4_do_rcv,
1851	.hash			= tcp_v4_hash,
1852	.unhash			= tcp_unhash,
1853	.get_port		= tcp_v4_get_port,
1854	.enter_memory_pressure	= tcp_enter_memory_pressure,
1855	.sockets_allocated	= &tcp_sockets_allocated,
1856	.orphan_count		= &tcp_orphan_count,
1857	.memory_allocated	= &tcp_memory_allocated,
1858	.memory_pressure	= &tcp_memory_pressure,
1859	.sysctl_mem		= sysctl_tcp_mem,
1860	.sysctl_wmem		= sysctl_tcp_wmem,
1861	.sysctl_rmem		= sysctl_tcp_rmem,
1862	.max_header		= MAX_TCP_HEADER,
1863	.obj_size		= sizeof(struct tcp_sock),
1864	.twsk_prot		= &tcp_timewait_sock_ops,
1865	.rsk_prot		= &tcp_request_sock_ops,
1866#ifdef CONFIG_COMPAT
1867	.compat_setsockopt	= compat_tcp_setsockopt,
1868	.compat_getsockopt	= compat_tcp_getsockopt,
1869#endif
1870};
1871
1872void __init tcp_v4_init(struct net_proto_family *ops)
1873{
1874	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, IPPROTO_TCP) < 0)
1875		panic("Failed to create the TCP control socket.\n");
1876}
1877
1878EXPORT_SYMBOL(ipv4_specific);
1879EXPORT_SYMBOL(tcp_hashinfo);
1880EXPORT_SYMBOL(tcp_prot);
1881EXPORT_SYMBOL(tcp_unhash);
1882EXPORT_SYMBOL(tcp_v4_conn_request);
1883EXPORT_SYMBOL(tcp_v4_connect);
1884EXPORT_SYMBOL(tcp_v4_do_rcv);
1885EXPORT_SYMBOL(tcp_v4_remember_stamp);
1886EXPORT_SYMBOL(tcp_v4_send_check);
1887EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1888
1889#ifdef CONFIG_PROC_FS
1890EXPORT_SYMBOL(tcp_proc_register);
1891EXPORT_SYMBOL(tcp_proc_unregister);
1892#endif
1893EXPORT_SYMBOL(sysctl_local_port_range);
1894EXPORT_SYMBOL(sysctl_tcp_low_latency);
1895
1896