tcp_ipv4.c revision 5d424d5a674f782d0659a3b66d951f412901faee
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
47 *					year-long coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					to a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81int sysctl_tcp_tw_reuse;
82int sysctl_tcp_low_latency;
83
84/* Check TCP sequence numbers in ICMP packets. */
85#define ICMP_MIN_LENGTH 8
86
87/* Socket used for sending RSTs */
88static struct socket *tcp_socket;
89
90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93	.lhash_lock	= RW_LOCK_UNLOCKED,
94	.lhash_users	= ATOMIC_INIT(0),
95	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96};
97
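/* Thin wrappers that bind the generic inet connection-sock helpers to TCP's
 * global hash tables (tcp_hashinfo).
 */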
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{
100	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101				 inet_csk_bind_conflict);
102}
103
104static void tcp_v4_hash(struct sock *sk)
105{
106	inet_hash(&tcp_hashinfo, sk);
107}
108
109void tcp_unhash(struct sock *sk)
110{
111	inet_unhash(&tcp_hashinfo, sk);
112}
113
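/* Choose the initial sequence number for a passive open, keyed on the
 * four-tuple of the incoming SYN (the segment's daddr/dest are our local
 * address and port).
 */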
114static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
115{
116	return secure_tcp_sequence_number(skb->nh.iph->daddr,
117					  skb->nh.iph->saddr,
118					  skb->h.th->dest,
119					  skb->h.th->source);
120}
121
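/* Decide whether a TIME_WAIT bucket may be reused for a new outgoing
 * connection to the same peer; if so, seed write_seq and the timestamp
 * state of the new socket from the old bucket.
 */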
122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123{
124	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125	struct tcp_sock *tp = tcp_sk(sk);
126
127	/* With PAWS, it is safe from the viewpoint
128	   of data integrity. Even without PAWS it is safe provided the sequence
129	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
130
131	   Actually, the idea is close to VJ's, except that the timestamp cache
132	   is held not per host but per port pair, and the TW bucket is used as
133	   the state holder.
134
135	   If the TW bucket has already been destroyed we fall back to VJ's
136	   scheme and use the initial timestamp retrieved from the peer table.
137	 */
138	if (tcptw->tw_ts_recent_stamp &&
139	    (twp == NULL || (sysctl_tcp_tw_reuse &&
140			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
141		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
142		if (tp->write_seq == 0)
143			tp->write_seq = 1;
144		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
145		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
146		sock_hold(sktw);
147		return 1;
148	}
149
150	return 0;
151}
152
153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154
155/* This will initiate an outgoing connection. */
156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157{
158	struct inet_sock *inet = inet_sk(sk);
159	struct tcp_sock *tp = tcp_sk(sk);
160	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161	struct rtable *rt;
162	u32 daddr, nexthop;
163	int tmp;
164	int err;
165
166	if (addr_len < sizeof(struct sockaddr_in))
167		return -EINVAL;
168
169	if (usin->sin_family != AF_INET)
170		return -EAFNOSUPPORT;
171
172	nexthop = daddr = usin->sin_addr.s_addr;
173	if (inet->opt && inet->opt->srr) {
174		if (!daddr)
175			return -EINVAL;
176		nexthop = inet->opt->faddr;
177	}
178
179	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
180			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181			       IPPROTO_TCP,
182			       inet->sport, usin->sin_port, sk);
183	if (tmp < 0)
184		return tmp;
185
186	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187		ip_rt_put(rt);
188		return -ENETUNREACH;
189	}
190
191	if (!inet->opt || !inet->opt->srr)
192		daddr = rt->rt_dst;
193
194	if (!inet->saddr)
195		inet->saddr = rt->rt_src;
196	inet->rcv_saddr = inet->saddr;
197
198	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199		/* Reset inherited state */
200		tp->rx_opt.ts_recent	   = 0;
201		tp->rx_opt.ts_recent_stamp = 0;
202		tp->write_seq		   = 0;
203	}
204
205	if (tcp_death_row.sysctl_tw_recycle &&
206	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207		struct inet_peer *peer = rt_get_peer(rt);
208
209		/* VJ's idea. We save the last timestamp seen from
210		 * the destination in the peer table when entering TIME-WAIT,
211		 * and initialize rx_opt.ts_recent from it when trying a new connection.
212		 */
213
214		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
215			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216			tp->rx_opt.ts_recent = peer->tcp_ts;
217		}
218	}
219
220	inet->dport = usin->sin_port;
221	inet->daddr = daddr;
222
223	inet_csk(sk)->icsk_ext_hdr_len = 0;
224	if (inet->opt)
225		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226
227	tp->rx_opt.mss_clamp = 536;
228
229	/* Socket identity is still unknown (sport may be zero).
230	 * However we set the state to SYN-SENT and, without releasing the
231	 * socket lock, select a source port, enter ourselves into the hash
232	 * tables and complete initialization after this.
233	 */
234	tcp_set_state(sk, TCP_SYN_SENT);
235	err = inet_hash_connect(&tcp_death_row, sk);
236	if (err)
237		goto failure;
238
239	err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
240	if (err)
241		goto failure;
242
243	/* OK, now commit destination to socket.  */
244	sk_setup_caps(sk, &rt->u.dst);
245
246	if (!tp->write_seq)
247		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
248							   inet->daddr,
249							   inet->sport,
250							   usin->sin_port);
251
252	inet->id = tp->write_seq ^ jiffies;
253
254	err = tcp_connect(sk);
255	rt = NULL;
256	if (err)
257		goto failure;
258
259	return 0;
260
261failure:
262	/* This unhashes the socket and releases the local port, if necessary. */
263	tcp_set_state(sk, TCP_CLOSE);
264	ip_rt_put(rt);
265	sk->sk_route_caps = 0;
266	inet->dport = 0;
267	return err;
268}
269
270/*
271 * This routine does path mtu discovery as defined in RFC1191.
272 */
273static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
274{
275	struct dst_entry *dst;
276	struct inet_sock *inet = inet_sk(sk);
277
278	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
279	 * sent out by Linux are always < 576 bytes so they should go through
280	 * unfragmented).
281	 */
282	if (sk->sk_state == TCP_LISTEN)
283		return;
284
285	/* We don't check in the dst entry whether pmtu discovery is forbidden
286	 * on this route. We just assume that no packet-too-big packets
287	 * are sent back when pmtu discovery is not active.
288	 * There is a small race when the user changes this flag in the
289	 * route, but I think that's acceptable.
290	 */
291	if ((dst = __sk_dst_check(sk, 0)) == NULL)
292		return;
293
294	dst->ops->update_pmtu(dst, mtu);
295
296	/* Something is about to go wrong... Remember the soft error
297	 * in case this connection is not able to recover.
298	 */
299	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
300		sk->sk_err_soft = EMSGSIZE;
301
302	mtu = dst_mtu(dst);
303
304	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
305	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306		tcp_sync_mss(sk, mtu);
307
308		/* Resend the TCP packet because it's
309		 * clear that the old packet has been
310		 * dropped. This is the new "fast" path mtu
311		 * discovery.
312		 */
313		tcp_simple_retransmit(sk);
314	} /* else let the usual retransmit timer handle it */
315}
316
317/*
318 * This routine is called by the ICMP module when it gets some
319 * sort of error condition.  If err < 0 then the socket should
320 * be closed and the error returned to the user.  If err > 0
321 * it's just the icmp type << 8 | icmp code.  After adjustment
322 * header points to the first 8 bytes of the tcp header.  We need
323 * to find the appropriate port.
324 *
325 * The locking strategy used here is very "optimistic". When
326 * someone else accesses the socket the ICMP is just dropped
327 * and for some paths there is no check at all.
328 * A more general error queue to queue errors for later handling
329 * is probably better.
330 *
331 */
332
333void tcp_v4_err(struct sk_buff *skb, u32 info)
334{
335	struct iphdr *iph = (struct iphdr *)skb->data;
336	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
337	struct tcp_sock *tp;
338	struct inet_sock *inet;
339	int type = skb->h.icmph->type;
340	int code = skb->h.icmph->code;
341	struct sock *sk;
342	__u32 seq;
343	int err;
344
345	if (skb->len < (iph->ihl << 2) + 8) {
346		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
347		return;
348	}
349
350	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
351			 th->source, inet_iif(skb));
352	if (!sk) {
353		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
354		return;
355	}
356	if (sk->sk_state == TCP_TIME_WAIT) {
357		inet_twsk_put((struct inet_timewait_sock *)sk);
358		return;
359	}
360
361	bh_lock_sock(sk);
362	/* If too many ICMPs get dropped on busy
363	 * servers this needs to be solved differently.
364	 */
365	if (sock_owned_by_user(sk))
366		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
367
368	if (sk->sk_state == TCP_CLOSE)
369		goto out;
370
371	tp = tcp_sk(sk);
372	seq = ntohl(th->seq);
373	if (sk->sk_state != TCP_LISTEN &&
374	    !between(seq, tp->snd_una, tp->snd_nxt)) {
375		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
376		goto out;
377	}
378
379	switch (type) {
380	case ICMP_SOURCE_QUENCH:
381		/* Just silently ignore these. */
382		goto out;
383	case ICMP_PARAMETERPROB:
384		err = EPROTO;
385		break;
386	case ICMP_DEST_UNREACH:
387		if (code > NR_ICMP_UNREACH)
388			goto out;
389
390		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
391			if (!sock_owned_by_user(sk))
392				do_pmtu_discovery(sk, iph, info);
393			goto out;
394		}
395
396		err = icmp_err_convert[code].errno;
397		break;
398	case ICMP_TIME_EXCEEDED:
399		err = EHOSTUNREACH;
400		break;
401	default:
402		goto out;
403	}
404
405	switch (sk->sk_state) {
406		struct request_sock *req, **prev;
407	case TCP_LISTEN:
408		if (sock_owned_by_user(sk))
409			goto out;
410
411		req = inet_csk_search_req(sk, &prev, th->dest,
412					  iph->daddr, iph->saddr);
413		if (!req)
414			goto out;
415
416		/* ICMPs are not backlogged, hence we cannot get
417		   an established socket here.
418		 */
419		BUG_TRAP(!req->sk);
420
421		if (seq != tcp_rsk(req)->snt_isn) {
422			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
423			goto out;
424		}
425
426		/*
427		 * Still in SYN_RECV, just remove it silently.
428		 * There is no good way to pass the error to the newly
429		 * created socket, and POSIX does not want network
430		 * errors returned from accept().
431		 */
432		inet_csk_reqsk_queue_drop(sk, req, prev);
433		goto out;
434
435	case TCP_SYN_SENT:
436	case TCP_SYN_RECV:  /* Cannot happen normally.
437			       It can, e.g., if SYNs crossed.
438			     */
439		if (!sock_owned_by_user(sk)) {
440			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
441			sk->sk_err = err;
442
443			sk->sk_error_report(sk);
444
445			tcp_done(sk);
446		} else {
447			sk->sk_err_soft = err;
448		}
449		goto out;
450	}
451
452	/* If we've already connected we will keep trying
453	 * until we time out, or the user gives up.
454	 *
455	 * rfc1122 4.2.3.9 allows us to consider as hard errors
456	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
457	 * but it is obsoleted by pmtu discovery).
458	 *
459	 * Note that in the modern internet, where routing is unreliable
460	 * and broken firewalls sit in every dark corner sending random
461	 * errors ordered by their masters, even these two messages finally lose
462	 * their original sense (even Linux sends invalid PORT_UNREACHs).
463	 *
464	 * Now we are in compliance with the RFCs.
465	 *							--ANK (980905)
466	 */
467
468	inet = inet_sk(sk);
469	if (!sock_owned_by_user(sk) && inet->recverr) {
470		sk->sk_err = err;
471		sk->sk_error_report(sk);
472	} else	{ /* Only an error on timeout */
473		sk->sk_err_soft = err;
474	}
475
476out:
477	bh_unlock_sock(sk);
478	sock_put(sk);
479}
480
481/* This routine computes an IPv4 TCP checksum. */
482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
483{
484	struct inet_sock *inet = inet_sk(sk);
485	struct tcphdr *th = skb->h.th;
486
487	if (skb->ip_summed == CHECKSUM_HW) {
488		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
489		skb->csum = offsetof(struct tcphdr, check);
490	} else {
491		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
492					 csum_partial((char *)th,
493						      th->doff << 2,
494						      skb->csum));
495	}
496}
497
498/*
499 *	This routine will send an RST to the other tcp.
500 *
501 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
502 *		      for the reset?
503 *	Answer: if a packet caused the RST, it is not for a socket
504 *		existing in our system; if it is matched to a socket,
505 *		it is just a duplicate segment or a bug in the other side's TCP.
506 *		So we build the reply based only on the parameters that
507 *		arrived with the segment.
508 *	Exception: precedence violation. We do not implement it in any case.
509 */
510
511static void tcp_v4_send_reset(struct sk_buff *skb)
512{
513	struct tcphdr *th = skb->h.th;
514	struct tcphdr rth;
515	struct ip_reply_arg arg;
516
517	/* Never send a reset in response to a reset. */
518	if (th->rst)
519		return;
520
521	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
522		return;
523
524	/* Swap the send and the receive. */
525	memset(&rth, 0, sizeof(struct tcphdr));
526	rth.dest   = th->source;
527	rth.source = th->dest;
528	rth.doff   = sizeof(struct tcphdr) / 4;
529	rth.rst    = 1;
530
531	if (th->ack) {
532		rth.seq = th->ack_seq;
533	} else {
534		rth.ack = 1;
535		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
536				    skb->len - (th->doff << 2));
537	}
538
539	memset(&arg, 0, sizeof arg);
540	arg.iov[0].iov_base = (unsigned char *)&rth;
541	arg.iov[0].iov_len  = sizeof rth;
542	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
543				      skb->nh.iph->saddr, /*XXX*/
544				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
545	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
546
547	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
548
549	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
550	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
551}
552
553/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
554   outside of socket context, is certainly ugly. What can I do?
555 */
556
557static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
558			    u32 win, u32 ts)
559{
560	struct tcphdr *th = skb->h.th;
561	struct {
562		struct tcphdr th;
563		u32 tsopt[3];
564	} rep;
565	struct ip_reply_arg arg;
566
567	memset(&rep.th, 0, sizeof(struct tcphdr));
568	memset(&arg, 0, sizeof arg);
569
570	arg.iov[0].iov_base = (unsigned char *)&rep;
571	arg.iov[0].iov_len  = sizeof(rep.th);
572	if (ts) {
573		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
574				     (TCPOPT_TIMESTAMP << 8) |
575				     TCPOLEN_TIMESTAMP);
576		rep.tsopt[1] = htonl(tcp_time_stamp);
577		rep.tsopt[2] = htonl(ts);
578		arg.iov[0].iov_len = sizeof(rep);
579	}
580
581	/* Swap the send and the receive. */
582	rep.th.dest    = th->source;
583	rep.th.source  = th->dest;
584	rep.th.doff    = arg.iov[0].iov_len / 4;
585	rep.th.seq     = htonl(seq);
586	rep.th.ack_seq = htonl(ack);
587	rep.th.ack     = 1;
588	rep.th.window  = htons(win);
589
590	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
591				      skb->nh.iph->saddr, /*XXX*/
592				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
593	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594
595	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
596
597	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
598}
599
600static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
601{
602	struct inet_timewait_sock *tw = inet_twsk(sk);
603	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
604
605	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
606			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
607
608	inet_twsk_put(tw);
609}
610
611static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
612{
613	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
614			req->ts_recent);
615}
616
617/*
618 *	Send a SYN-ACK after having received an ACK.
619 *	This still operates on a request_sock only, not on a big
620 *	socket.
621 */
622static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
623			      struct dst_entry *dst)
624{
625	const struct inet_request_sock *ireq = inet_rsk(req);
626	int err = -1;
627	struct sk_buff * skb;
628
629	/* First, grab a route. */
630	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
631		goto out;
632
633	skb = tcp_make_synack(sk, dst, req);
634
635	if (skb) {
636		struct tcphdr *th = skb->h.th;
637
638		th->check = tcp_v4_check(th, skb->len,
639					 ireq->loc_addr,
640					 ireq->rmt_addr,
641					 csum_partial((char *)th, skb->len,
642						      skb->csum));
643
644		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
645					    ireq->rmt_addr,
646					    ireq->opt);
647		if (err == NET_XMIT_CN)
648			err = 0;
649	}
650
651out:
652	dst_release(dst);
653	return err;
654}
655
656/*
657 *	IPv4 request_sock destructor.
658 */
659static void tcp_v4_reqsk_destructor(struct request_sock *req)
660{
661	kfree(inet_rsk(req)->opt);
662}
663
664#ifdef CONFIG_SYN_COOKIES
665static void syn_flood_warning(struct sk_buff *skb)
666{
667	static unsigned long warntime;
668
669	if (time_after(jiffies, (warntime + HZ * 60))) {
670		warntime = jiffies;
671		printk(KERN_INFO
672		       "possible SYN flooding on port %d. Sending cookies.\n",
673		       ntohs(skb->h.th->dest));
674	}
675}
676#endif
677
678/*
679 * Save and compile IPv4 options into the request_sock if needed.
680 */
681static struct ip_options *tcp_v4_save_options(struct sock *sk,
682					      struct sk_buff *skb)
683{
684	struct ip_options *opt = &(IPCB(skb)->opt);
685	struct ip_options *dopt = NULL;
686
687	if (opt && opt->optlen) {
688		int opt_size = optlength(opt);
689		dopt = kmalloc(opt_size, GFP_ATOMIC);
690		if (dopt) {
691			if (ip_options_echo(dopt, skb)) {
692				kfree(dopt);
693				dopt = NULL;
694			}
695		}
696	}
697	return dopt;
698}
699
700struct request_sock_ops tcp_request_sock_ops = {
701	.family		=	PF_INET,
702	.obj_size	=	sizeof(struct tcp_request_sock),
703	.rtx_syn_ack	=	tcp_v4_send_synack,
704	.send_ack	=	tcp_v4_reqsk_send_ack,
705	.destructor	=	tcp_v4_reqsk_destructor,
706	.send_reset	=	tcp_v4_send_reset,
707};
708
709static struct timewait_sock_ops tcp_timewait_sock_ops = {
710	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
711	.twsk_unique	= tcp_twsk_unique,
712};
713
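/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the options, pick an ISN (or a syncookie under synflood), send the
 * SYN-ACK and queue the request.
 */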
714int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
715{
716	struct inet_request_sock *ireq;
717	struct tcp_options_received tmp_opt;
718	struct request_sock *req;
719	__u32 saddr = skb->nh.iph->saddr;
720	__u32 daddr = skb->nh.iph->daddr;
721	__u32 isn = TCP_SKB_CB(skb)->when;
722	struct dst_entry *dst = NULL;
723#ifdef CONFIG_SYN_COOKIES
724	int want_cookie = 0;
725#else
726#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
727#endif
728
729	/* Never answer SYNs sent to broadcast or multicast */
730	if (((struct rtable *)skb->dst)->rt_flags &
731	    (RTCF_BROADCAST | RTCF_MULTICAST))
732		goto drop;
733
734	/* TW buckets are converted to open requests without
735	 * limitation; they conserve resources and the peer is
736	 * evidently a real one.
737	 */
738	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
739#ifdef CONFIG_SYN_COOKIES
740		if (sysctl_tcp_syncookies) {
741			want_cookie = 1;
742		} else
743#endif
744		goto drop;
745	}
746
747	/* The accept backlog is full. If we have already queued enough
748	 * warm entries in the syn queue, drop the request. That is better than
749	 * clogging the syn queue with openreqs with exponentially increasing
750	 * timeouts.
751	 */
752	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
753		goto drop;
754
755	req = reqsk_alloc(&tcp_request_sock_ops);
756	if (!req)
757		goto drop;
758
759	tcp_clear_options(&tmp_opt);
760	tmp_opt.mss_clamp = 536;
761	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
762
763	tcp_parse_options(skb, &tmp_opt, 0);
764
765	if (want_cookie) {
766		tcp_clear_options(&tmp_opt);
767		tmp_opt.saw_tstamp = 0;
768	}
769
770	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
771		/* Some OSes (unknown ones, but I see them on web servers which
772		 * contain information interesting only to Windows
773		 * users) do not send their stamp in the SYN. It is an easy case:
774		 * we simply do not advertise TS support.
775		 */
776		tmp_opt.saw_tstamp = 0;
777		tmp_opt.tstamp_ok  = 0;
778	}
779	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
780
781	tcp_openreq_init(req, &tmp_opt, skb);
782
783	ireq = inet_rsk(req);
784	ireq->loc_addr = daddr;
785	ireq->rmt_addr = saddr;
786	ireq->opt = tcp_v4_save_options(sk, skb);
787	if (!want_cookie)
788		TCP_ECN_create_request(req, skb->h.th);
789
790	if (want_cookie) {
791#ifdef CONFIG_SYN_COOKIES
792		syn_flood_warning(skb);
793#endif
794		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
795	} else if (!isn) {
796		struct inet_peer *peer = NULL;
797
798		/* VJ's idea. We save the last timestamp seen
799		 * from the destination in the peer table when entering
800		 * state TIME-WAIT, and check against it before
801		 * accepting a new connection request.
802		 *
803		 * If "isn" is not zero, this request hit an alive
804		 * timewait bucket, so all the necessary checks
805		 * are made in the function processing the timewait state.
806		 */
807		if (tmp_opt.saw_tstamp &&
808		    tcp_death_row.sysctl_tw_recycle &&
809		    (dst = inet_csk_route_req(sk, req)) != NULL &&
810		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
811		    peer->v4daddr == saddr) {
812			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
813			    (s32)(peer->tcp_ts - req->ts_recent) >
814							TCP_PAWS_WINDOW) {
815				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
816				dst_release(dst);
817				goto drop_and_free;
818			}
819		}
820		/* Kill the following clause if you dislike this approach. */
821		else if (!sysctl_tcp_syncookies &&
822			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
823			  (sysctl_max_syn_backlog >> 2)) &&
824			 (!peer || !peer->tcp_ts_stamp) &&
825			 (!dst || !dst_metric(dst, RTAX_RTT))) {
826			/* Without syncookies the last quarter of
827			 * the backlog is filled with destinations
828			 * proven to be alive.
829			 * It means that we continue to communicate
830			 * with destinations already remembered
831			 * as of the moment of the synflood.
832			 */
833			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
834				       "request from %u.%u.%u.%u/%u\n",
835				       NIPQUAD(saddr),
836				       ntohs(skb->h.th->source));
837			dst_release(dst);
838			goto drop_and_free;
839		}
840
841		isn = tcp_v4_init_sequence(sk, skb);
842	}
843	tcp_rsk(req)->snt_isn = isn;
844
845	if (tcp_v4_send_synack(sk, req, dst))
846		goto drop_and_free;
847
848	if (want_cookie) {
849	   	reqsk_free(req);
850	} else {
851		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
852	}
853	return 0;
854
855drop_and_free:
856	reqsk_free(req);
857drop:
858	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
859	return 0;
860}
861
862
863/*
864 * The three way handshake has completed - we got a valid synack -
865 * now create the new socket.
866 */
867struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
868				  struct request_sock *req,
869				  struct dst_entry *dst)
870{
871	struct inet_request_sock *ireq;
872	struct inet_sock *newinet;
873	struct tcp_sock *newtp;
874	struct sock *newsk;
875
876	if (sk_acceptq_is_full(sk))
877		goto exit_overflow;
878
879	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
880		goto exit;
881
882	newsk = tcp_create_openreq_child(sk, req, skb);
883	if (!newsk)
884		goto exit;
885
886	sk_setup_caps(newsk, dst);
887
888	newtp		      = tcp_sk(newsk);
889	newinet		      = inet_sk(newsk);
890	ireq		      = inet_rsk(req);
891	newinet->daddr	      = ireq->rmt_addr;
892	newinet->rcv_saddr    = ireq->loc_addr;
893	newinet->saddr	      = ireq->loc_addr;
894	newinet->opt	      = ireq->opt;
895	ireq->opt	      = NULL;
896	newinet->mc_index     = inet_iif(skb);
897	newinet->mc_ttl	      = skb->nh.iph->ttl;
898	inet_csk(newsk)->icsk_ext_hdr_len = 0;
899	if (newinet->opt)
900		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
901	newinet->id = newtp->write_seq ^ jiffies;
902
903	tcp_mtup_init(newsk);
904	tcp_sync_mss(newsk, dst_mtu(dst));
905	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
906	tcp_initialize_rcv_mss(newsk);
907
908	__inet_hash(&tcp_hashinfo, newsk, 0);
909	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
910
911	return newsk;
912
913exit_overflow:
914	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
915exit:
916	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
917	dst_release(dst);
918	return NULL;
919}
920
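/* For a segment arriving on a listening socket, find the socket that should
 * really handle it: first the SYN queue of pending open requests, then the
 * established hash, and finally (with syncookies enabled) validation of a
 * bare ACK.
 */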
921static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
922{
923	struct tcphdr *th = skb->h.th;
924	struct iphdr *iph = skb->nh.iph;
925	struct sock *nsk;
926	struct request_sock **prev;
927	/* Find possible connection requests. */
928	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
929						       iph->saddr, iph->daddr);
930	if (req)
931		return tcp_check_req(sk, skb, req, prev);
932
933	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
934					th->source, skb->nh.iph->daddr,
935					ntohs(th->dest), inet_iif(skb));
936
937	if (nsk) {
938		if (nsk->sk_state != TCP_TIME_WAIT) {
939			bh_lock_sock(nsk);
940			return nsk;
941		}
942		inet_twsk_put((struct inet_timewait_sock *)nsk);
943		return NULL;
944	}
945
946#ifdef CONFIG_SYN_COOKIES
947	if (!th->rst && !th->syn && th->ack)
948		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
949#endif
950	return sk;
951}
952
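/* Initialize checksum state for an incoming segment: accept a verified
 * hardware checksum, otherwise seed skb->csum with the pseudo-header sum and
 * fully verify only short packets (<= 76 bytes) here, deferring the rest.
 */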
953static int tcp_v4_checksum_init(struct sk_buff *skb)
954{
955	if (skb->ip_summed == CHECKSUM_HW) {
956		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
957				  skb->nh.iph->daddr, skb->csum)) {
958			skb->ip_summed = CHECKSUM_UNNECESSARY;
959			return 0;
960		}
961	}
962
963	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
964				       skb->len, IPPROTO_TCP, 0);
965
966	if (skb->len <= 76) {
967		return __skb_checksum_complete(skb);
968	}
969	return 0;
970}
971
972
973/* The socket must have it's spinlock held when we get
974 * here.
975 *
976 * We have a potential double-lock case here, so even when
977 * doing backlog processing we use the BH locking scheme.
978 * This is because we cannot sleep with the original spinlock
979 * held.
980 */
981int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
982{
983	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
984		TCP_CHECK_TIMER(sk);
985		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
986			goto reset;
987		TCP_CHECK_TIMER(sk);
988		return 0;
989	}
990
991	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
992		goto csum_err;
993
994	if (sk->sk_state == TCP_LISTEN) {
995		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
996		if (!nsk)
997			goto discard;
998
999		if (nsk != sk) {
1000			if (tcp_child_process(sk, nsk, skb))
1001				goto reset;
1002			return 0;
1003		}
1004	}
1005
1006	TCP_CHECK_TIMER(sk);
1007	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1008		goto reset;
1009	TCP_CHECK_TIMER(sk);
1010	return 0;
1011
1012reset:
1013	tcp_v4_send_reset(skb);
1014discard:
1015	kfree_skb(skb);
1016	/* Be careful here. If this function gets more complicated and
1017	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1018	 * might be destroyed here. This current version compiles correctly,
1019	 * but you have been warned.
1020	 */
1021	return 0;
1022
1023csum_err:
1024	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1025	goto discard;
1026}
1027
1028/*
1029 *	From tcp_input.c
1030 */
1031
1032int tcp_v4_rcv(struct sk_buff *skb)
1033{
1034	struct tcphdr *th;
1035	struct sock *sk;
1036	int ret;
1037
1038	if (skb->pkt_type != PACKET_HOST)
1039		goto discard_it;
1040
1041	/* Count it even if it's bad */
1042	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1043
1044	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1045		goto discard_it;
1046
1047	th = skb->h.th;
1048
1049	if (th->doff < sizeof(struct tcphdr) / 4)
1050		goto bad_packet;
1051	if (!pskb_may_pull(skb, th->doff * 4))
1052		goto discard_it;
1053
1054	/* An explanation is required here, I think.
1055	 * Packet length and doff are validated by header prediction,
1056	 * provided the case of th->doff==0 is eliminated.
1057	 * So, we defer the checks. */
1058	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1059	     tcp_v4_checksum_init(skb)))
1060		goto bad_packet;
1061
1062	th = skb->h.th;
1063	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1064	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1065				    skb->len - th->doff * 4);
1066	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1067	TCP_SKB_CB(skb)->when	 = 0;
1068	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1069	TCP_SKB_CB(skb)->sacked	 = 0;
1070
1071	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1072			   skb->nh.iph->daddr, ntohs(th->dest),
1073			   inet_iif(skb));
1074
1075	if (!sk)
1076		goto no_tcp_socket;
1077
1078process:
1079	if (sk->sk_state == TCP_TIME_WAIT)
1080		goto do_time_wait;
1081
1082	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1083		goto discard_and_relse;
1084	nf_reset(skb);
1085
1086	if (sk_filter(sk, skb, 0))
1087		goto discard_and_relse;
1088
1089	skb->dev = NULL;
1090
1091	bh_lock_sock(sk);
1092	ret = 0;
1093	if (!sock_owned_by_user(sk)) {
1094		if (!tcp_prequeue(sk, skb))
1095			ret = tcp_v4_do_rcv(sk, skb);
1096	} else
1097		sk_add_backlog(sk, skb);
1098	bh_unlock_sock(sk);
1099
1100	sock_put(sk);
1101
1102	return ret;
1103
1104no_tcp_socket:
1105	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1106		goto discard_it;
1107
1108	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1109bad_packet:
1110		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1111	} else {
1112		tcp_v4_send_reset(skb);
1113	}
1114
1115discard_it:
1116	/* Discard frame. */
1117	kfree_skb(skb);
1118  	return 0;
1119
1120discard_and_relse:
1121	sock_put(sk);
1122	goto discard_it;
1123
1124do_time_wait:
1125	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1126		inet_twsk_put((struct inet_timewait_sock *) sk);
1127		goto discard_it;
1128	}
1129
1130	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1131		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1132		inet_twsk_put((struct inet_timewait_sock *) sk);
1133		goto discard_it;
1134	}
1135	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1136					   skb, th)) {
1137	case TCP_TW_SYN: {
1138		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1139							skb->nh.iph->daddr,
1140							ntohs(th->dest),
1141							inet_iif(skb));
1142		if (sk2) {
1143			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1144					     &tcp_death_row);
1145			inet_twsk_put((struct inet_timewait_sock *)sk);
1146			sk = sk2;
1147			goto process;
1148		}
1149		/* Fall through to ACK */
1150	}
1151	case TCP_TW_ACK:
1152		tcp_v4_timewait_ack(sk, skb);
1153		break;
1154	case TCP_TW_RST:
1155		goto no_tcp_socket;
1156	case TCP_TW_SUCCESS:;
1157	}
1158	goto discard_it;
1159}
1160
1161/* VJ's idea. Save the last timestamp seen from this destination
1162 * and hold it for at least the normal timewait interval, to use for duplicate
1163 * segment detection in subsequent connections, before they enter the
1164 * synchronized state.
1165 */
1166
1167int tcp_v4_remember_stamp(struct sock *sk)
1168{
1169	struct inet_sock *inet = inet_sk(sk);
1170	struct tcp_sock *tp = tcp_sk(sk);
1171	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1172	struct inet_peer *peer = NULL;
1173	int release_it = 0;
1174
1175	if (!rt || rt->rt_dst != inet->daddr) {
1176		peer = inet_getpeer(inet->daddr, 1);
1177		release_it = 1;
1178	} else {
1179		if (!rt->peer)
1180			rt_bind_peer(rt, 1);
1181		peer = rt->peer;
1182	}
1183
1184	if (peer) {
1185		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1186		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1187		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1188			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1189			peer->tcp_ts = tp->rx_opt.ts_recent;
1190		}
1191		if (release_it)
1192			inet_putpeer(peer);
1193		return 1;
1194	}
1195
1196	return 0;
1197}
1198
1199int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1200{
1201	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1202
1203	if (peer) {
1204		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1205
1206		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1207		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1208		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1209			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1210			peer->tcp_ts	   = tcptw->tw_ts_recent;
1211		}
1212		inet_putpeer(peer);
1213		return 1;
1214	}
1215
1216	return 0;
1217}
1218
1219struct inet_connection_sock_af_ops ipv4_specific = {
1220	.queue_xmit	=	ip_queue_xmit,
1221	.send_check	=	tcp_v4_send_check,
1222	.rebuild_header	=	inet_sk_rebuild_header,
1223	.conn_request	=	tcp_v4_conn_request,
1224	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
1225	.remember_stamp	=	tcp_v4_remember_stamp,
1226	.net_header_len	=	sizeof(struct iphdr),
1227	.setsockopt	=	ip_setsockopt,
1228	.getsockopt	=	ip_getsockopt,
1229	.addr2sockaddr	=	inet_csk_addr2sockaddr,
1230	.sockaddr_len	=	sizeof(struct sockaddr_in),
1231};
1232
1233/* NOTE: A lot of things are set to zero explicitly by the call to
1234 *       sk_alloc(), so they need not be done here.
1235 */
1236static int tcp_v4_init_sock(struct sock *sk)
1237{
1238	struct inet_connection_sock *icsk = inet_csk(sk);
1239	struct tcp_sock *tp = tcp_sk(sk);
1240
1241	skb_queue_head_init(&tp->out_of_order_queue);
1242	tcp_init_xmit_timers(sk);
1243	tcp_prequeue_init(tp);
1244
1245	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1246	tp->mdev = TCP_TIMEOUT_INIT;
1247
1248	/* So many TCP implementations out there (incorrectly) count the
1249	 * initial SYN frame in their delayed-ACK and congestion control
1250	 * algorithms that we must have the following bandaid to talk
1251	 * efficiently to them.  -DaveM
1252	 */
1253	tp->snd_cwnd = 2;
1254
1255	/* See draft-stevens-tcpca-spec-01 for discussion of the
1256	 * initialization of these values.
1257	 */
1258	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1259	tp->snd_cwnd_clamp = ~0;
1260	tp->mss_cache = 536;
1261
1262	tp->reordering = sysctl_tcp_reordering;
1263	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1264
1265	sk->sk_state = TCP_CLOSE;
1266
1267	sk->sk_write_space = sk_stream_write_space;
1268	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1269
1270	icsk->icsk_af_ops = &ipv4_specific;
1271	icsk->icsk_sync_mss = tcp_sync_mss;
1272
1273	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1274	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1275
1276	atomic_inc(&tcp_sockets_allocated);
1277
1278	return 0;
1279}
1280
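/* Release per-socket TCP state on destruction: timers, congestion control,
 * the write and out-of-order queues, the bound port and the cached sendmsg
 * page.
 */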
1281int tcp_v4_destroy_sock(struct sock *sk)
1282{
1283	struct tcp_sock *tp = tcp_sk(sk);
1284
1285	tcp_clear_xmit_timers(sk);
1286
1287	tcp_cleanup_congestion_control(sk);
1288
1289	/* Clean up the write buffer. */
1290  	sk_stream_writequeue_purge(sk);
1291
1292	/* Cleans up our, hopefully empty, out_of_order_queue. */
1293  	__skb_queue_purge(&tp->out_of_order_queue);
1294
1295	/* Clean the prequeue; it really must be empty */
1296	__skb_queue_purge(&tp->ucopy.prequeue);
1297
1298	/* Clean up a referenced TCP bind bucket. */
1299	if (inet_csk(sk)->icsk_bind_hash)
1300		inet_put_port(&tcp_hashinfo, sk);
1301
1302	/*
1303	 * If sendmsg cached page exists, toss it.
1304	 */
1305	if (sk->sk_sndmsg_page) {
1306		__free_page(sk->sk_sndmsg_page);
1307		sk->sk_sndmsg_page = NULL;
1308	}
1309
1310	atomic_dec(&tcp_sockets_allocated);
1311
1312	return 0;
1313}
1314
1315EXPORT_SYMBOL(tcp_v4_destroy_sock);
1316
1317#ifdef CONFIG_PROC_FS
1318/* Proc filesystem TCP sock list dumping. */
1319
1320static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1321{
1322	return hlist_empty(head) ? NULL :
1323		list_entry(head->first, struct inet_timewait_sock, tw_node);
1324}
1325
1326static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1327{
1328	return tw->tw_node.next ?
1329		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1330}
1331
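/* /proc iteration helper: advance to the next listening socket, descending
 * into each listener's SYN (open request) queue under syn_wait_lock before
 * moving on to the next bucket of the listening hash.
 */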
1332static void *listening_get_next(struct seq_file *seq, void *cur)
1333{
1334	struct inet_connection_sock *icsk;
1335	struct hlist_node *node;
1336	struct sock *sk = cur;
1337	struct tcp_iter_state* st = seq->private;
1338
1339	if (!sk) {
1340		st->bucket = 0;
1341		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1342		goto get_sk;
1343	}
1344
1345	++st->num;
1346
1347	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1348		struct request_sock *req = cur;
1349
1350	       	icsk = inet_csk(st->syn_wait_sk);
1351		req = req->dl_next;
1352		while (1) {
1353			while (req) {
1354				if (req->rsk_ops->family == st->family) {
1355					cur = req;
1356					goto out;
1357				}
1358				req = req->dl_next;
1359			}
1360			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1361				break;
1362get_req:
1363			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1364		}
1365		sk	  = sk_next(st->syn_wait_sk);
1366		st->state = TCP_SEQ_STATE_LISTENING;
1367		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1368	} else {
1369	       	icsk = inet_csk(sk);
1370		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1371		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1372			goto start_req;
1373		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1374		sk = sk_next(sk);
1375	}
1376get_sk:
1377	sk_for_each_from(sk, node) {
1378		if (sk->sk_family == st->family) {
1379			cur = sk;
1380			goto out;
1381		}
1382	       	icsk = inet_csk(sk);
1383		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1384		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1385start_req:
1386			st->uid		= sock_i_uid(sk);
1387			st->syn_wait_sk = sk;
1388			st->state	= TCP_SEQ_STATE_OPENREQ;
1389			st->sbucket	= 0;
1390			goto get_req;
1391		}
1392		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1393	}
1394	if (++st->bucket < INET_LHTABLE_SIZE) {
1395		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1396		goto get_sk;
1397	}
1398	cur = NULL;
1399out:
1400	return cur;
1401}
1402
1403static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1404{
1405	void *rc = listening_get_next(seq, NULL);
1406
1407	while (rc && *pos) {
1408		rc = listening_get_next(seq, rc);
1409		--*pos;
1410	}
1411	return rc;
1412}
1413
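/* Walk the established hash: for each bucket scan full sockets first, then
 * the TIME_WAIT half of the table, holding that bucket's read lock.
 */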
1414static void *established_get_first(struct seq_file *seq)
1415{
1416	struct tcp_iter_state* st = seq->private;
1417	void *rc = NULL;
1418
1419	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1420		struct sock *sk;
1421		struct hlist_node *node;
1422		struct inet_timewait_sock *tw;
1423
1424		/* We can reschedule _before_ having picked the target: */
1425		cond_resched_softirq();
1426
1427		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1428		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1429			if (sk->sk_family != st->family) {
1430				continue;
1431			}
1432			rc = sk;
1433			goto out;
1434		}
1435		st->state = TCP_SEQ_STATE_TIME_WAIT;
1436		inet_twsk_for_each(tw, node,
1437				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1438			if (tw->tw_family != st->family) {
1439				continue;
1440			}
1441			rc = tw;
1442			goto out;
1443		}
1444		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1445		st->state = TCP_SEQ_STATE_ESTABLISHED;
1446	}
1447out:
1448	return rc;
1449}
1450
1451static void *established_get_next(struct seq_file *seq, void *cur)
1452{
1453	struct sock *sk = cur;
1454	struct inet_timewait_sock *tw;
1455	struct hlist_node *node;
1456	struct tcp_iter_state* st = seq->private;
1457
1458	++st->num;
1459
1460	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1461		tw = cur;
1462		tw = tw_next(tw);
1463get_tw:
1464		while (tw && tw->tw_family != st->family) {
1465			tw = tw_next(tw);
1466		}
1467		if (tw) {
1468			cur = tw;
1469			goto out;
1470		}
1471		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1472		st->state = TCP_SEQ_STATE_ESTABLISHED;
1473
1474		/* We can reschedule between buckets: */
1475		cond_resched_softirq();
1476
1477		if (++st->bucket < tcp_hashinfo.ehash_size) {
1478			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1479			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1480		} else {
1481			cur = NULL;
1482			goto out;
1483		}
1484	} else
1485		sk = sk_next(sk);
1486
1487	sk_for_each_from(sk, node) {
1488		if (sk->sk_family == st->family)
1489			goto found;
1490	}
1491
1492	st->state = TCP_SEQ_STATE_TIME_WAIT;
1493	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1494	goto get_tw;
1495found:
1496	cur = sk;
1497out:
1498	return cur;
1499}
1500
1501static void *established_get_idx(struct seq_file *seq, loff_t pos)
1502{
1503	void *rc = established_get_first(seq);
1504
1505	while (rc && pos) {
1506		rc = established_get_next(seq, rc);
1507		--pos;
1508	}
1509	return rc;
1510}
1511
1512static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1513{
1514	void *rc;
1515	struct tcp_iter_state* st = seq->private;
1516
1517	inet_listen_lock(&tcp_hashinfo);
1518	st->state = TCP_SEQ_STATE_LISTENING;
1519	rc	  = listening_get_idx(seq, &pos);
1520
1521	if (!rc) {
1522		inet_listen_unlock(&tcp_hashinfo);
1523		local_bh_disable();
1524		st->state = TCP_SEQ_STATE_ESTABLISHED;
1525		rc	  = established_get_idx(seq, pos);
1526	}
1527
1528	return rc;
1529}
1530
1531static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1532{
1533	struct tcp_iter_state* st = seq->private;
1534	st->state = TCP_SEQ_STATE_LISTENING;
1535	st->num = 0;
1536	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1537}
1538
1539static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1540{
1541	void *rc = NULL;
1542	struct tcp_iter_state* st;
1543
1544	if (v == SEQ_START_TOKEN) {
1545		rc = tcp_get_idx(seq, 0);
1546		goto out;
1547	}
1548	st = seq->private;
1549
1550	switch (st->state) {
1551	case TCP_SEQ_STATE_OPENREQ:
1552	case TCP_SEQ_STATE_LISTENING:
1553		rc = listening_get_next(seq, v);
1554		if (!rc) {
1555			inet_listen_unlock(&tcp_hashinfo);
1556			local_bh_disable();
1557			st->state = TCP_SEQ_STATE_ESTABLISHED;
1558			rc	  = established_get_first(seq);
1559		}
1560		break;
1561	case TCP_SEQ_STATE_ESTABLISHED:
1562	case TCP_SEQ_STATE_TIME_WAIT:
1563		rc = established_get_next(seq, v);
1564		break;
1565	}
1566out:
1567	++*pos;
1568	return rc;
1569}
1570
1571static void tcp_seq_stop(struct seq_file *seq, void *v)
1572{
1573	struct tcp_iter_state* st = seq->private;
1574
1575	switch (st->state) {
1576	case TCP_SEQ_STATE_OPENREQ:
1577		if (v) {
1578			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1579			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1580		}
1581	case TCP_SEQ_STATE_LISTENING:
1582		if (v != SEQ_START_TOKEN)
1583			inet_listen_unlock(&tcp_hashinfo);
1584		break;
1585	case TCP_SEQ_STATE_TIME_WAIT:
1586	case TCP_SEQ_STATE_ESTABLISHED:
1587		if (v)
1588			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1589		local_bh_enable();
1590		break;
1591	}
1592}
1593
1594static int tcp_seq_open(struct inode *inode, struct file *file)
1595{
1596	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1597	struct seq_file *seq;
1598	struct tcp_iter_state *s;
1599	int rc;
1600
1601	if (unlikely(afinfo == NULL))
1602		return -EINVAL;
1603
1604	s = kmalloc(sizeof(*s), GFP_KERNEL);
1605	if (!s)
1606		return -ENOMEM;
1607	memset(s, 0, sizeof(*s));
1608	s->family		= afinfo->family;
1609	s->seq_ops.start	= tcp_seq_start;
1610	s->seq_ops.next		= tcp_seq_next;
1611	s->seq_ops.show		= afinfo->seq_show;
1612	s->seq_ops.stop		= tcp_seq_stop;
1613
1614	rc = seq_open(file, &s->seq_ops);
1615	if (rc)
1616		goto out_kfree;
1617	seq	     = file->private_data;
1618	seq->private = s;
1619out:
1620	return rc;
1621out_kfree:
1622	kfree(s);
1623	goto out;
1624}
1625
1626int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1627{
1628	int rc = 0;
1629	struct proc_dir_entry *p;
1630
1631	if (!afinfo)
1632		return -EINVAL;
1633	afinfo->seq_fops->owner		= afinfo->owner;
1634	afinfo->seq_fops->open		= tcp_seq_open;
1635	afinfo->seq_fops->read		= seq_read;
1636	afinfo->seq_fops->llseek	= seq_lseek;
1637	afinfo->seq_fops->release	= seq_release_private;
1638
1639	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1640	if (p)
1641		p->data = afinfo;
1642	else
1643		rc = -ENOMEM;
1644	return rc;
1645}
1646
1647void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1648{
1649	if (!afinfo)
1650		return;
1651	proc_net_remove(afinfo->name);
1652	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1653}
1654
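/* The three helpers below each format one line of /proc/net/tcp: an open
 * request (SYN_RECV), a full socket, and a TIME_WAIT socket respectively.
 */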
1655static void get_openreq4(struct sock *sk, struct request_sock *req,
1656			 char *tmpbuf, int i, int uid)
1657{
1658	const struct inet_request_sock *ireq = inet_rsk(req);
1659	int ttd = req->expires - jiffies;
1660
1661	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1662		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1663		i,
1664		ireq->loc_addr,
1665		ntohs(inet_sk(sk)->sport),
1666		ireq->rmt_addr,
1667		ntohs(ireq->rmt_port),
1668		TCP_SYN_RECV,
1669		0, 0, /* could print option size, but that is af dependent. */
1670		1,    /* timers active (only the expire timer) */
1671		jiffies_to_clock_t(ttd),
1672		req->retrans,
1673		uid,
1674		0,  /* non standard timer */
1675		0, /* open_requests have no inode */
1676		atomic_read(&sk->sk_refcnt),
1677		req);
1678}
1679
1680static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1681{
1682	int timer_active;
1683	unsigned long timer_expires;
1684	struct tcp_sock *tp = tcp_sk(sp);
1685	const struct inet_connection_sock *icsk = inet_csk(sp);
1686	struct inet_sock *inet = inet_sk(sp);
1687	unsigned int dest = inet->daddr;
1688	unsigned int src = inet->rcv_saddr;
1689	__u16 destp = ntohs(inet->dport);
1690	__u16 srcp = ntohs(inet->sport);
1691
1692	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1693		timer_active	= 1;
1694		timer_expires	= icsk->icsk_timeout;
1695	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1696		timer_active	= 4;
1697		timer_expires	= icsk->icsk_timeout;
1698	} else if (timer_pending(&sp->sk_timer)) {
1699		timer_active	= 2;
1700		timer_expires	= sp->sk_timer.expires;
1701	} else {
1702		timer_active	= 0;
1703		timer_expires = jiffies;
1704	}
1705
1706	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1707			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1708		i, src, srcp, dest, destp, sp->sk_state,
1709		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1710		timer_active,
1711		jiffies_to_clock_t(timer_expires - jiffies),
1712		icsk->icsk_retransmits,
1713		sock_i_uid(sp),
1714		icsk->icsk_probes_out,
1715		sock_i_ino(sp),
1716		atomic_read(&sp->sk_refcnt), sp,
1717		icsk->icsk_rto,
1718		icsk->icsk_ack.ato,
1719		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1720		tp->snd_cwnd,
1721		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1722}
1723
1724static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1725{
1726	unsigned int dest, src;
1727	__u16 destp, srcp;
1728	int ttd = tw->tw_ttd - jiffies;
1729
1730	if (ttd < 0)
1731		ttd = 0;
1732
1733	dest  = tw->tw_daddr;
1734	src   = tw->tw_rcv_saddr;
1735	destp = ntohs(tw->tw_dport);
1736	srcp  = ntohs(tw->tw_sport);
1737
1738	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1739		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1740		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1741		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1742		atomic_read(&tw->tw_refcnt), tw);
1743}
1744
1745#define TMPSZ 150
1746
1747static int tcp4_seq_show(struct seq_file *seq, void *v)
1748{
1749	struct tcp_iter_state* st;
1750	char tmpbuf[TMPSZ + 1];
1751
1752	if (v == SEQ_START_TOKEN) {
1753		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1754			   "  sl  local_address rem_address   st tx_queue "
1755			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1756			   "inode");
1757		goto out;
1758	}
1759	st = seq->private;
1760
1761	switch (st->state) {
1762	case TCP_SEQ_STATE_LISTENING:
1763	case TCP_SEQ_STATE_ESTABLISHED:
1764		get_tcp4_sock(v, tmpbuf, st->num);
1765		break;
1766	case TCP_SEQ_STATE_OPENREQ:
1767		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1768		break;
1769	case TCP_SEQ_STATE_TIME_WAIT:
1770		get_timewait4_sock(v, tmpbuf, st->num);
1771		break;
1772	}
1773	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1774out:
1775	return 0;
1776}
1777
1778static struct file_operations tcp4_seq_fops;
1779static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1780	.owner		= THIS_MODULE,
1781	.name		= "tcp",
1782	.family		= AF_INET,
1783	.seq_show	= tcp4_seq_show,
1784	.seq_fops	= &tcp4_seq_fops,
1785};
1786
1787int __init tcp4_proc_init(void)
1788{
1789	return tcp_proc_register(&tcp4_seq_afinfo);
1790}
1791
1792void tcp4_proc_exit(void)
1793{
1794	tcp_proc_unregister(&tcp4_seq_afinfo);
1795}
1796#endif /* CONFIG_PROC_FS */
1797
1798struct proto tcp_prot = {
1799	.name			= "TCP",
1800	.owner			= THIS_MODULE,
1801	.close			= tcp_close,
1802	.connect		= tcp_v4_connect,
1803	.disconnect		= tcp_disconnect,
1804	.accept			= inet_csk_accept,
1805	.ioctl			= tcp_ioctl,
1806	.init			= tcp_v4_init_sock,
1807	.destroy		= tcp_v4_destroy_sock,
1808	.shutdown		= tcp_shutdown,
1809	.setsockopt		= tcp_setsockopt,
1810	.getsockopt		= tcp_getsockopt,
1811	.sendmsg		= tcp_sendmsg,
1812	.recvmsg		= tcp_recvmsg,
1813	.backlog_rcv		= tcp_v4_do_rcv,
1814	.hash			= tcp_v4_hash,
1815	.unhash			= tcp_unhash,
1816	.get_port		= tcp_v4_get_port,
1817	.enter_memory_pressure	= tcp_enter_memory_pressure,
1818	.sockets_allocated	= &tcp_sockets_allocated,
1819	.orphan_count		= &tcp_orphan_count,
1820	.memory_allocated	= &tcp_memory_allocated,
1821	.memory_pressure	= &tcp_memory_pressure,
1822	.sysctl_mem		= sysctl_tcp_mem,
1823	.sysctl_wmem		= sysctl_tcp_wmem,
1824	.sysctl_rmem		= sysctl_tcp_rmem,
1825	.max_header		= MAX_TCP_HEADER,
1826	.obj_size		= sizeof(struct tcp_sock),
1827	.twsk_prot		= &tcp_timewait_sock_ops,
1828	.rsk_prot		= &tcp_request_sock_ops,
1829};
1830
1831
1832
1833void __init tcp_v4_init(struct net_proto_family *ops)
1834{
1835	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
1836	if (err < 0)
1837		panic("Failed to create the TCP control socket.\n");
1838	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
1839	inet_sk(tcp_socket->sk)->uc_ttl = -1;
1840
1841	/* Unhash it so that IP input processing does not even
1842	 * see it; we do not wish this socket to see incoming
1843	 * packets.
1844	 */
1845	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
1846}
1847
1848EXPORT_SYMBOL(ipv4_specific);
1849EXPORT_SYMBOL(tcp_hashinfo);
1850EXPORT_SYMBOL(tcp_prot);
1851EXPORT_SYMBOL(tcp_unhash);
1852EXPORT_SYMBOL(tcp_v4_conn_request);
1853EXPORT_SYMBOL(tcp_v4_connect);
1854EXPORT_SYMBOL(tcp_v4_do_rcv);
1855EXPORT_SYMBOL(tcp_v4_remember_stamp);
1856EXPORT_SYMBOL(tcp_v4_send_check);
1857EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1858
1859#ifdef CONFIG_PROC_FS
1860EXPORT_SYMBOL(tcp_proc_register);
1861EXPORT_SYMBOL(tcp_proc_unregister);
1862#endif
1863EXPORT_SYMBOL(sysctl_local_port_range);
1864EXPORT_SYMBOL(sysctl_tcp_low_latency);
1865EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
1866
1867