tcp_ipv4.c revision 4103f8cd5c1f260d674a7b426ed221812de54d47
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
47 *					year-long coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
84int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly;
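/*
 * Both knobs are exported through sysctl. A minimal sketch of tuning
 * them from user space, assuming the standard sysctl names under
 * net.ipv4:
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *	# sysctl -w net.ipv4.tcp_low_latency=1
 *
 * tcp_tw_reuse lets tcp_twsk_unique() below reuse a TIME-WAIT port pair
 * for a new outgoing connection when timestamps make that safe;
 * tcp_low_latency makes the receive path skip the prequeue.
 */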
86
87/* Check TCP sequence numbers in ICMP packets. */
88#define ICMP_MIN_LENGTH 8
89
90/* Socket used for sending RSTs */
91static struct socket *tcp_socket __read_mostly;
92
93void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95#ifdef CONFIG_TCP_MD5SIG
96static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97						   __be32 addr);
98static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99				   __be32 saddr, __be32 daddr,
100				   struct tcphdr *th, int protocol,
101				   int tcplen);
102#endif
103
104struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106	.lhash_users = ATOMIC_INIT(0),
107	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108};
109
110static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111{
112	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113				 inet_csk_bind_conflict);
114}
115
116static void tcp_v4_hash(struct sock *sk)
117{
118	inet_hash(&tcp_hashinfo, sk);
119}
120
121void tcp_unhash(struct sock *sk)
122{
123	inet_unhash(&tcp_hashinfo, sk);
124}
125
126static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127{
128	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129					  ip_hdr(skb)->saddr,
130					  tcp_hdr(skb)->dest,
131					  tcp_hdr(skb)->source);
132}
133
134int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135{
136	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137	struct tcp_sock *tp = tcp_sk(sk);
138
139	/* With PAWS, it is safe from the viewpoint
140	   of data integrity. Even without PAWS it is safe provided sequence
141	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
142
143	   Actually, the idea is close to VJ's, except that the timestamp cache
144	   is held not per host but per port pair, and the TW bucket is used as
145	   the state holder.
146
147	   If the TW bucket has already been destroyed we fall back to VJ's
148	   scheme and use the initial timestamp retrieved from the peer table.
149	 */
150	if (tcptw->tw_ts_recent_stamp &&
151	    (twp == NULL || (sysctl_tcp_tw_reuse &&
152			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
153		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154		if (tp->write_seq == 0)
155			tp->write_seq = 1;
156		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
157		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158		sock_hold(sktw);
159		return 1;
160	}
161
162	return 0;
163}
164
165EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167/* This will initiate an outgoing connection. */
168int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169{
170	struct inet_sock *inet = inet_sk(sk);
171	struct tcp_sock *tp = tcp_sk(sk);
172	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173	struct rtable *rt;
174	__be32 daddr, nexthop;
175	int tmp;
176	int err;
177
178	if (addr_len < sizeof(struct sockaddr_in))
179		return -EINVAL;
180
181	if (usin->sin_family != AF_INET)
182		return -EAFNOSUPPORT;
183
184	nexthop = daddr = usin->sin_addr.s_addr;
185	if (inet->opt && inet->opt->srr) {
186		if (!daddr)
187			return -EINVAL;
188		nexthop = inet->opt->faddr;
189	}
190
191	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193			       IPPROTO_TCP,
194			       inet->sport, usin->sin_port, sk, 1);
195	if (tmp < 0)
196		return tmp;
197
198	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
199		ip_rt_put(rt);
200		return -ENETUNREACH;
201	}
202
203	if (!inet->opt || !inet->opt->srr)
204		daddr = rt->rt_dst;
205
206	if (!inet->saddr)
207		inet->saddr = rt->rt_src;
208	inet->rcv_saddr = inet->saddr;
209
210	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
211		/* Reset inherited state */
212		tp->rx_opt.ts_recent	   = 0;
213		tp->rx_opt.ts_recent_stamp = 0;
214		tp->write_seq		   = 0;
215	}
216
217	if (tcp_death_row.sysctl_tw_recycle &&
218	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
219		struct inet_peer *peer = rt_get_peer(rt);
220		/*
221		 * VJ's idea. We save the last timestamp seen from
222		 * the destination in the peer table when entering
223		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
224		 * when trying a new connection.
225		 */
226		if (peer != NULL &&
227		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
228			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
229			tp->rx_opt.ts_recent = peer->tcp_ts;
230		}
231	}
232
233	inet->dport = usin->sin_port;
234	inet->daddr = daddr;
235
236	inet_csk(sk)->icsk_ext_hdr_len = 0;
237	if (inet->opt)
238		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
239
240	tp->rx_opt.mss_clamp = 536;
241
242	/* Socket identity is still unknown (sport may be zero).
243	 * However we set the state to SYN-SENT and, without releasing the
244	 * socket lock, select a source port, enter ourselves into the hash
245	 * tables and complete initialization afterwards.
246	 */
247	tcp_set_state(sk, TCP_SYN_SENT);
248	err = inet_hash_connect(&tcp_death_row, sk);
249	if (err)
250		goto failure;
251
252	err = ip_route_newports(&rt, IPPROTO_TCP,
253				inet->sport, inet->dport, sk);
254	if (err)
255		goto failure;
256
257	/* OK, now commit destination to socket.  */
258	sk->sk_gso_type = SKB_GSO_TCPV4;
259	sk_setup_caps(sk, &rt->u.dst);
260
261	if (!tp->write_seq)
262		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
263							   inet->daddr,
264							   inet->sport,
265							   usin->sin_port);
266
267	inet->id = tp->write_seq ^ jiffies;
268
269	err = tcp_connect(sk);
270	rt = NULL;
271	if (err)
272		goto failure;
273
274	return 0;
275
276failure:
277	/*
278	 * This unhashes the socket and releases the local port,
279	 * if necessary.
280	 */
281	tcp_set_state(sk, TCP_CLOSE);
282	ip_rt_put(rt);
283	sk->sk_route_caps = 0;
284	inet->dport = 0;
285	return err;
286}
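/*
 * For orientation, a minimal user-space sketch of the call that ends up
 * in tcp_v4_connect() above (standard sockets API; error handling
 * omitted, the address is a placeholder):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * connect() on an unconnected TCP socket reaches this function through
 * inet_stream_connect() -> sk->sk_prot->connect.
 */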
287
288/*
289 * This routine does path mtu discovery as defined in RFC1191.
290 */
291static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
292{
293	struct dst_entry *dst;
294	struct inet_sock *inet = inet_sk(sk);
295
296	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
297	 * sent out by Linux are always < 576 bytes, so they should go through
298	 * unfragmented).
299	 */
300	if (sk->sk_state == TCP_LISTEN)
301		return;
302
303	/* We don't check in the dst entry if pmtu discovery is forbidden
304	 * on this route. We just assume that no packet-too-big packets
305	 * are sent back when pmtu discovery is not active.
306	 * There is a small race when the user changes this flag in the
307	 * route, but I think that's acceptable.
308	 */
309	if ((dst = __sk_dst_check(sk, 0)) == NULL)
310		return;
311
312	dst->ops->update_pmtu(dst, mtu);
313
314	/* Something is about to go wrong... Remember the soft error
315	 * in case this connection will not be able to recover.
316	 */
317	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
318		sk->sk_err_soft = EMSGSIZE;
319
320	mtu = dst_mtu(dst);
321
322	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
323	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
324		tcp_sync_mss(sk, mtu);
325
326		/* Resend the TCP packet because it's
327		 * clear that the old packet has been
328		 * dropped. This is the new "fast" path mtu
329		 * discovery.
330		 */
331		tcp_simple_retransmit(sk);
332	} /* else let the usual retransmit timer handle it */
333}
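/*
 * The inet->pmtudisc test above reflects the per-socket PMTU discovery
 * mode. A hedged user-space sketch of turning discovery off for a
 * single socket with the standard IP_MTU_DISCOVER option:
 *
 *	int val = IP_PMTUDISC_DONT;
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * With IP_PMTUDISC_DONT the DF bit is not set, routers may fragment
 * instead of returning ICMP_FRAG_NEEDED, and the MSS shrink above is
 * skipped.
 */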
334
335/*
336 * This routine is called by the ICMP module when it gets some
337 * sort of error condition.  If err < 0 then the socket should
338 * be closed and the error returned to the user.  If err > 0
339 * it's just the icmp type << 8 | icmp code.  After adjustment
340 * header points to the first 8 bytes of the tcp header.  We need
341 * to find the appropriate port.
342 *
343 * The locking strategy used here is very "optimistic". When
344 * someone else accesses the socket the ICMP is just dropped
345 * and for some paths there is no check at all.
346 * A more general error queue to queue errors for later handling
347 * is probably better.
348 *
349 */
350
351void tcp_v4_err(struct sk_buff *skb, u32 info)
352{
353	struct iphdr *iph = (struct iphdr *)skb->data;
354	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
355	struct tcp_sock *tp;
356	struct inet_sock *inet;
357	const int type = icmp_hdr(skb)->type;
358	const int code = icmp_hdr(skb)->code;
359	struct sock *sk;
360	__u32 seq;
361	int err;
362
363	if (skb->len < (iph->ihl << 2) + 8) {
364		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
365		return;
366	}
367
368	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
369			 th->source, inet_iif(skb));
370	if (!sk) {
371		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
372		return;
373	}
374	if (sk->sk_state == TCP_TIME_WAIT) {
375		inet_twsk_put(inet_twsk(sk));
376		return;
377	}
378
379	bh_lock_sock(sk);
380	/* If too many ICMPs get dropped on busy
381	 * servers this needs to be solved differently.
382	 */
383	if (sock_owned_by_user(sk))
384		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
385
386	if (sk->sk_state == TCP_CLOSE)
387		goto out;
388
389	tp = tcp_sk(sk);
390	seq = ntohl(th->seq);
391	if (sk->sk_state != TCP_LISTEN &&
392	    !between(seq, tp->snd_una, tp->snd_nxt)) {
393		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
394		goto out;
395	}
396
397	switch (type) {
398	case ICMP_SOURCE_QUENCH:
399		/* Just silently ignore these. */
400		goto out;
401	case ICMP_PARAMETERPROB:
402		err = EPROTO;
403		break;
404	case ICMP_DEST_UNREACH:
405		if (code > NR_ICMP_UNREACH)
406			goto out;
407
408		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409			if (!sock_owned_by_user(sk))
410				do_pmtu_discovery(sk, iph, info);
411			goto out;
412		}
413
414		err = icmp_err_convert[code].errno;
415		break;
416	case ICMP_TIME_EXCEEDED:
417		err = EHOSTUNREACH;
418		break;
419	default:
420		goto out;
421	}
422
423	switch (sk->sk_state) {
424		struct request_sock *req, **prev;
425	case TCP_LISTEN:
426		if (sock_owned_by_user(sk))
427			goto out;
428
429		req = inet_csk_search_req(sk, &prev, th->dest,
430					  iph->daddr, iph->saddr);
431		if (!req)
432			goto out;
433
434		/* ICMPs are not backlogged, hence we cannot get
435		   an established socket here.
436		 */
437		BUG_TRAP(!req->sk);
438
439		if (seq != tcp_rsk(req)->snt_isn) {
440			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
441			goto out;
442		}
443
444		/*
445		 * Still in SYN_RECV, just remove it silently.
446		 * There is no good way to pass the error to the newly
447		 * created socket, and POSIX does not want network
448		 * errors returned from accept().
449		 */
450		inet_csk_reqsk_queue_drop(sk, req, prev);
451		goto out;
452
453	case TCP_SYN_SENT:
454	case TCP_SYN_RECV:  /* Cannot happen normally.
455			       It can, for example, if SYNs crossed.
456			     */
457		if (!sock_owned_by_user(sk)) {
458			sk->sk_err = err;
459
460			sk->sk_error_report(sk);
461
462			tcp_done(sk);
463		} else {
464			sk->sk_err_soft = err;
465		}
466		goto out;
467	}
468
469	/* If we've already connected we will keep trying
470	 * until we time out, or the user gives up.
471	 *
472	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
473	 * considered hard errors (well, FRAG_FAILED too,
474	 * but it is obsoleted by pmtu discovery).
475	 *
476	 * Note that in the modern internet, where routing is unreliable
477	 * and broken firewalls sit in every dark corner sending random
478	 * errors ordered by their masters, even these two messages have
479	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
480	 *
481	 * Now we are in compliance with RFCs.
482	 *							--ANK (980905)
483	 */
484
485	inet = inet_sk(sk);
486	if (!sock_owned_by_user(sk) && inet->recverr) {
487		sk->sk_err = err;
488		sk->sk_error_report(sk);
489	} else	{ /* Only an error on timeout */
490		sk->sk_err_soft = err;
491	}
492
493out:
494	bh_unlock_sock(sk);
495	sock_put(sk);
496}
497
498/* This routine computes an IPv4 TCP checksum. */
499void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
500{
501	struct inet_sock *inet = inet_sk(sk);
502	struct tcphdr *th = tcp_hdr(skb);
503
504	if (skb->ip_summed == CHECKSUM_PARTIAL) {
505		th->check = ~tcp_v4_check(len, inet->saddr,
506					  inet->daddr, 0);
507		skb->csum_offset = offsetof(struct tcphdr, check);
508	} else {
509		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
510					 csum_partial((char *)th,
511						      th->doff << 2,
512						      skb->csum));
513	}
514}
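/*
 * For reference: the value produced here is the standard TCP checksum,
 * i.e. the 16-bit one's complement of the one's complement sum over the
 * IPv4 pseudo-header (saddr, daddr, zero, protocol, TCP length), the
 * TCP header and the payload. With CHECKSUM_PARTIAL only the
 * pseudo-header part is folded in here and the device is expected to
 * finish the sum, storing the result at csum_offset; otherwise the
 * whole sum is computed in software via csum_partial().
 */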
515
516int tcp_v4_gso_send_check(struct sk_buff *skb)
517{
518	const struct iphdr *iph;
519	struct tcphdr *th;
520
521	if (!pskb_may_pull(skb, sizeof(*th)))
522		return -EINVAL;
523
524	iph = ip_hdr(skb);
525	th = tcp_hdr(skb);
526
527	th->check = 0;
528	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
529	skb->csum_offset = offsetof(struct tcphdr, check);
530	skb->ip_summed = CHECKSUM_PARTIAL;
531	return 0;
532}
533
534/*
535 *	This routine will send an RST to the other tcp.
536 *
537 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
538 *		      for the reset?
539 *	Answer: if a packet caused the RST, it is not for a socket
540 *		existing in our system; if it is matched to a socket,
541 *		it is just a duplicate segment or a bug in the other side's TCP.
542 *		So we build the reply based only on the parameters that
543 *		arrived with the segment.
544 *	Exception: precedence violation. We do not implement it in any case.
545 */
546
547static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
548{
549	struct tcphdr *th = tcp_hdr(skb);
550	struct {
551		struct tcphdr th;
552#ifdef CONFIG_TCP_MD5SIG
553		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
554#endif
555	} rep;
556	struct ip_reply_arg arg;
557#ifdef CONFIG_TCP_MD5SIG
558	struct tcp_md5sig_key *key;
559#endif
560
561	/* Never send a reset in response to a reset. */
562	if (th->rst)
563		return;
564
565	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
566		return;
567
568	/* Swap the send and the receive. */
569	memset(&rep, 0, sizeof(rep));
570	rep.th.dest   = th->source;
571	rep.th.source = th->dest;
572	rep.th.doff   = sizeof(struct tcphdr) / 4;
573	rep.th.rst    = 1;
574
575	if (th->ack) {
576		rep.th.seq = th->ack_seq;
577	} else {
578		rep.th.ack = 1;
579		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
580				       skb->len - (th->doff << 2));
581	}
582
583	memset(&arg, 0, sizeof(arg));
584	arg.iov[0].iov_base = (unsigned char *)&rep;
585	arg.iov[0].iov_len  = sizeof(rep.th);
586
587#ifdef CONFIG_TCP_MD5SIG
588	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
589	if (key) {
590		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
591				   (TCPOPT_NOP << 16) |
592				   (TCPOPT_MD5SIG << 8) |
593				   TCPOLEN_MD5SIG);
594		/* Update length and the length the header thinks exists */
595		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
596		rep.th.doff = arg.iov[0].iov_len / 4;
597
598		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
599					key,
600					ip_hdr(skb)->daddr,
601					ip_hdr(skb)->saddr,
602					&rep.th, IPPROTO_TCP,
603					arg.iov[0].iov_len);
604	}
605#endif
606	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
607				      ip_hdr(skb)->saddr, /* XXX */
608				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
609	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
610
611	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
612
613	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
614	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
615}
616
617/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
618   outside socket context, is certainly ugly. What can I do?
619 */
620
621static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
622			    struct sk_buff *skb, u32 seq, u32 ack,
623			    u32 win, u32 ts)
624{
625	struct tcphdr *th = tcp_hdr(skb);
626	struct {
627		struct tcphdr th;
628		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
629#ifdef CONFIG_TCP_MD5SIG
630			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
631#endif
632			];
633	} rep;
634	struct ip_reply_arg arg;
635#ifdef CONFIG_TCP_MD5SIG
636	struct tcp_md5sig_key *key;
637	struct tcp_md5sig_key tw_key;
638#endif
639
640	memset(&rep.th, 0, sizeof(struct tcphdr));
641	memset(&arg, 0, sizeof(arg));
642
643	arg.iov[0].iov_base = (unsigned char *)&rep;
644	arg.iov[0].iov_len  = sizeof(rep.th);
645	if (ts) {
646		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
647				   (TCPOPT_TIMESTAMP << 8) |
648				   TCPOLEN_TIMESTAMP);
649		rep.opt[1] = htonl(tcp_time_stamp);
650		rep.opt[2] = htonl(ts);
651		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
652	}
653
654	/* Swap the send and the receive. */
655	rep.th.dest    = th->source;
656	rep.th.source  = th->dest;
657	rep.th.doff    = arg.iov[0].iov_len / 4;
658	rep.th.seq     = htonl(seq);
659	rep.th.ack_seq = htonl(ack);
660	rep.th.ack     = 1;
661	rep.th.window  = htons(win);
662
663#ifdef CONFIG_TCP_MD5SIG
664	/*
665	 * The SKB holds an incoming packet, but may not have a valid ->sk
666	 * pointer. This is especially the case when we're dealing with a
667	 * TIME_WAIT ack, because the sk structure is long gone, and only
668	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
669	 * structure, and we use it in preference.  I believe that (twsk ||
670	 * skb->sk) holds true, but we program defensively.
671	 */
672	if (!twsk && skb->sk) {
673		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
674	} else if (twsk && twsk->tw_md5_keylen) {
675		tw_key.key = twsk->tw_md5_key;
676		tw_key.keylen = twsk->tw_md5_keylen;
677		key = &tw_key;
678	} else
679		key = NULL;
680
681	if (key) {
682		int offset = (ts) ? 3 : 0;
683
684		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
685					  (TCPOPT_NOP << 16) |
686					  (TCPOPT_MD5SIG << 8) |
687					  TCPOLEN_MD5SIG);
688		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689		rep.th.doff = arg.iov[0].iov_len/4;
690
691		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
692					key,
693					ip_hdr(skb)->daddr,
694					ip_hdr(skb)->saddr,
695					&rep.th, IPPROTO_TCP,
696					arg.iov[0].iov_len);
697	}
698#endif
699	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
700				      ip_hdr(skb)->saddr, /* XXX */
701				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
702	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703
704	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
705
706	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
707}
708
709static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
710{
711	struct inet_timewait_sock *tw = inet_twsk(sk);
712	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
713
714	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
715			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
716			tcptw->tw_ts_recent);
717
718	inet_twsk_put(tw);
719}
720
721static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
722				  struct request_sock *req)
723{
724	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
725			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
726			req->ts_recent);
727}
728
729/*
730 *	Send a SYN-ACK after having received an ACK.
731 *	This still operates on a request_sock only, not on a big
732 *	socket.
733 */
734static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
735			      struct dst_entry *dst)
736{
737	const struct inet_request_sock *ireq = inet_rsk(req);
738	int err = -1;
739	struct sk_buff * skb;
740
741	/* First, grab a route. */
742	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
743		goto out;
744
745	skb = tcp_make_synack(sk, dst, req);
746
747	if (skb) {
748		struct tcphdr *th = tcp_hdr(skb);
749
750		th->check = tcp_v4_check(skb->len,
751					 ireq->loc_addr,
752					 ireq->rmt_addr,
753					 csum_partial((char *)th, skb->len,
754						      skb->csum));
755
756		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
757					    ireq->rmt_addr,
758					    ireq->opt);
759		err = net_xmit_eval(err);
760	}
761
762out:
763	dst_release(dst);
764	return err;
765}
766
767/*
768 *	IPv4 request_sock destructor.
769 */
770static void tcp_v4_reqsk_destructor(struct request_sock *req)
771{
772	kfree(inet_rsk(req)->opt);
773}
774
775#ifdef CONFIG_SYN_COOKIES
776static void syn_flood_warning(struct sk_buff *skb)
777{
778	static unsigned long warntime;
779
780	if (time_after(jiffies, (warntime + HZ * 60))) {
781		warntime = jiffies;
782		printk(KERN_INFO
783		       "possible SYN flooding on port %d. Sending cookies.\n",
784		       ntohs(tcp_hdr(skb)->dest));
785	}
786}
787#endif
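/*
 * The warning above is only emitted when cookies are about to be sent.
 * Whether that happens at all is governed by sysctl_tcp_syncookies; a
 * minimal sketch, assuming the standard sysctl name:
 *
 *	# sysctl -w net.ipv4.tcp_syncookies=1
 *
 * With the sysctl off, SYNs arriving while the request queue is full
 * are simply dropped in tcp_v4_conn_request() instead.
 */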
788
789/*
790 * Save and compile IPv4 options into the request_sock if needed.
791 */
792static struct ip_options *tcp_v4_save_options(struct sock *sk,
793					      struct sk_buff *skb)
794{
795	struct ip_options *opt = &(IPCB(skb)->opt);
796	struct ip_options *dopt = NULL;
797
798	if (opt && opt->optlen) {
799		int opt_size = optlength(opt);
800		dopt = kmalloc(opt_size, GFP_ATOMIC);
801		if (dopt) {
802			if (ip_options_echo(dopt, skb)) {
803				kfree(dopt);
804				dopt = NULL;
805			}
806		}
807	}
808	return dopt;
809}
810
811#ifdef CONFIG_TCP_MD5SIG
812/*
813 * RFC2385 MD5 checksumming requires a mapping of
814 * IP address->MD5 Key.
815 * We need to maintain these in the sk structure.
816 */
817
818/* Find the Key structure for an address.  */
819static struct tcp_md5sig_key *
820			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
821{
822	struct tcp_sock *tp = tcp_sk(sk);
823	int i;
824
825	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
826		return NULL;
827	for (i = 0; i < tp->md5sig_info->entries4; i++) {
828		if (tp->md5sig_info->keys4[i].addr == addr)
829			return (struct tcp_md5sig_key *)
830						&tp->md5sig_info->keys4[i];
831	}
832	return NULL;
833}
834
835struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
836					 struct sock *addr_sk)
837{
838	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
839}
840
841EXPORT_SYMBOL(tcp_v4_md5_lookup);
842
843static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
844						      struct request_sock *req)
845{
846	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
847}
848
849/* This can be called on a newly created socket, from other files */
850int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
851		      u8 *newkey, u8 newkeylen)
852{
853	/* Add Key to the list */
854	struct tcp4_md5sig_key *key;
855	struct tcp_sock *tp = tcp_sk(sk);
856	struct tcp4_md5sig_key *keys;
857
858	key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
859	if (key) {
860		/* Pre-existing entry - just update that one. */
861		kfree(key->key);
862		key->key = newkey;
863		key->keylen = newkeylen;
864	} else {
865		struct tcp_md5sig_info *md5sig;
866
867		if (!tp->md5sig_info) {
868			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
869						  GFP_ATOMIC);
870			if (!tp->md5sig_info) {
871				kfree(newkey);
872				return -ENOMEM;
873			}
874		}
875		if (tcp_alloc_md5sig_pool() == NULL) {
876			kfree(newkey);
877			return -ENOMEM;
878		}
879		md5sig = tp->md5sig_info;
880
881		if (md5sig->alloced4 == md5sig->entries4) {
882			keys = kmalloc((sizeof(*keys) *
883					(md5sig->entries4 + 1)), GFP_ATOMIC);
884			if (!keys) {
885				kfree(newkey);
886				tcp_free_md5sig_pool();
887				return -ENOMEM;
888			}
889
890			if (md5sig->entries4)
891				memcpy(keys, md5sig->keys4,
892				       sizeof(*keys) * md5sig->entries4);
893
894			/* Free old key list, and reference new one */
895			if (md5sig->keys4)
896				kfree(md5sig->keys4);
897			md5sig->keys4 = keys;
898			md5sig->alloced4++;
899		}
900		md5sig->entries4++;
901		md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
902		md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
903		md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
904	}
905	return 0;
906}
907
908EXPORT_SYMBOL(tcp_v4_md5_do_add);
909
910static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
911			       u8 *newkey, u8 newkeylen)
912{
913	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
914				 newkey, newkeylen);
915}
916
917int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
918{
919	struct tcp_sock *tp = tcp_sk(sk);
920	int i;
921
922	for (i = 0; i < tp->md5sig_info->entries4; i++) {
923		if (tp->md5sig_info->keys4[i].addr == addr) {
924			/* Free the key */
925			kfree(tp->md5sig_info->keys4[i].key);
926			tp->md5sig_info->entries4--;
927
928			if (tp->md5sig_info->entries4 == 0) {
929				kfree(tp->md5sig_info->keys4);
930				tp->md5sig_info->keys4 = NULL;
931				tp->md5sig_info->alloced4 = 0;
932			} else if (tp->md5sig_info->entries4 != i) {
933				/* Shift the remaining entries down over the deleted one */
934				memcpy(&tp->md5sig_info->keys4[i],
935				       &tp->md5sig_info->keys4[i+1],
936				       (tp->md5sig_info->entries4 - i) *
937					sizeof(struct tcp4_md5sig_key));
938			}
939			tcp_free_md5sig_pool();
940			return 0;
941		}
942	}
943	return -ENOENT;
944}
945
946EXPORT_SYMBOL(tcp_v4_md5_do_del);
947
948static void tcp_v4_clear_md5_list(struct sock *sk)
949{
950	struct tcp_sock *tp = tcp_sk(sk);
951
952	/* Free each key, then the array of keys itself,
953	 * and then drop our reference on the
954	 * md5sig crypto pool.
955	 */
956	if (tp->md5sig_info->entries4) {
957		int i;
958		for (i = 0; i < tp->md5sig_info->entries4; i++)
959			kfree(tp->md5sig_info->keys4[i].key);
960		tp->md5sig_info->entries4 = 0;
961		tcp_free_md5sig_pool();
962	}
963	if (tp->md5sig_info->keys4) {
964		kfree(tp->md5sig_info->keys4);
965		tp->md5sig_info->keys4 = NULL;
966		tp->md5sig_info->alloced4  = 0;
967	}
968}
969
970static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
971				 int optlen)
972{
973	struct tcp_md5sig cmd;
974	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
975	u8 *newkey;
976
977	if (optlen < sizeof(cmd))
978		return -EINVAL;
979
980	if (copy_from_user(&cmd, optval, sizeof(cmd)))
981		return -EFAULT;
982
983	if (sin->sin_family != AF_INET)
984		return -EINVAL;
985
986	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
987		if (!tcp_sk(sk)->md5sig_info)
988			return -ENOENT;
989		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
990	}
991
992	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
993		return -EINVAL;
994
995	if (!tcp_sk(sk)->md5sig_info) {
996		struct tcp_sock *tp = tcp_sk(sk);
997		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
998
999		if (!p)
1000			return -EINVAL;
1001
1002		tp->md5sig_info = p;
1003
1004	}
1005
1006	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1007	if (!newkey)
1008		return -ENOMEM;
1009	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1010				 newkey, cmd.tcpm_keylen);
1011}
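/*
 * The parser above is reached through the TCP_MD5SIG socket option. A
 * minimal user-space sketch of installing a key for one peer (struct
 * tcp_md5sig and TCP_MD5SIG come from <linux/tcp.h>; the address and
 * key are placeholders):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing a zero tcpm_keylen deletes the key for that address, as
 * handled at the top of tcp_v4_parse_md5_keys().
 */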
1012
1013static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1014				   __be32 saddr, __be32 daddr,
1015				   struct tcphdr *th, int protocol,
1016				   int tcplen)
1017{
1018	struct scatterlist sg[4];
1019	__u16 data_len;
1020	int block = 0;
1021	__sum16 old_checksum;
1022	struct tcp_md5sig_pool *hp;
1023	struct tcp4_pseudohdr *bp;
1024	struct hash_desc *desc;
1025	int err;
1026	unsigned int nbytes = 0;
1027
1028	/*
1029	 * Okay, so RFC2385 is turned on for this connection,
1030	 * so we need to generate the MD5 hash for the packet now.
1031	 */
1032
1033	hp = tcp_get_md5sig_pool();
1034	if (!hp)
1035		goto clear_hash_noput;
1036
1037	bp = &hp->md5_blk.ip4;
1038	desc = &hp->md5_desc;
1039
1040	/*
1041	 * 1. the TCP pseudo-header (in the order: source IP address,
1042	 * destination IP address, zero-padded protocol number, and
1043	 * segment length)
1044	 */
1045	bp->saddr = saddr;
1046	bp->daddr = daddr;
1047	bp->pad = 0;
1048	bp->protocol = protocol;
1049	bp->len = htons(tcplen);
1050	sg_set_buf(&sg[block++], bp, sizeof(*bp));
1051	nbytes += sizeof(*bp);
1052
1053	/* 2. the TCP header, excluding options, and assuming a
1054	 * checksum of zero.
1055	 */
1056	old_checksum = th->check;
1057	th->check = 0;
1058	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1059	nbytes += sizeof(struct tcphdr);
1060
1061	/* 3. the TCP segment data (if any) */
1062	data_len = tcplen - (th->doff << 2);
1063	if (data_len > 0) {
1064		unsigned char *data = (unsigned char *)th + (th->doff << 2);
1065		sg_set_buf(&sg[block++], data, data_len);
1066		nbytes += data_len;
1067	}
1068
1069	/* 4. an independently-specified key or password, known to both
1070	 * TCPs and presumably connection-specific
1071	 */
1072	sg_set_buf(&sg[block++], key->key, key->keylen);
1073	nbytes += key->keylen;
1074
1075	/* Now store the Hash into the packet */
1076	err = crypto_hash_init(desc);
1077	if (err)
1078		goto clear_hash;
1079	err = crypto_hash_update(desc, sg, nbytes);
1080	if (err)
1081		goto clear_hash;
1082	err = crypto_hash_final(desc, md5_hash);
1083	if (err)
1084		goto clear_hash;
1085
1086	/* Reset header, and free up the crypto */
1087	tcp_put_md5sig_pool();
1088	th->check = old_checksum;
1089
1090out:
1091	return 0;
1092clear_hash:
1093	tcp_put_md5sig_pool();
1094clear_hash_noput:
1095	memset(md5_hash, 0, 16);
1096	goto out;
1097}
1098
1099int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1100			 struct sock *sk,
1101			 struct dst_entry *dst,
1102			 struct request_sock *req,
1103			 struct tcphdr *th, int protocol,
1104			 int tcplen)
1105{
1106	__be32 saddr, daddr;
1107
1108	if (sk) {
1109		saddr = inet_sk(sk)->saddr;
1110		daddr = inet_sk(sk)->daddr;
1111	} else {
1112		struct rtable *rt = (struct rtable *)dst;
1113		BUG_ON(!rt);
1114		saddr = rt->rt_src;
1115		daddr = rt->rt_dst;
1116	}
1117	return tcp_v4_do_calc_md5_hash(md5_hash, key,
1118				       saddr, daddr,
1119				       th, protocol, tcplen);
1120}
1121
1122EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1123
1124static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1125{
1126	/*
1127	 * This gets called for each TCP segment that arrives
1128	 * so we want to be efficient.
1129	 * We have 3 drop cases:
1130	 * o No MD5 hash and one expected.
1131	 * o MD5 hash and we're not expecting one.
1132	 * o MD5 hash and it's wrong.
1133	 */
1134	__u8 *hash_location = NULL;
1135	struct tcp_md5sig_key *hash_expected;
1136	const struct iphdr *iph = ip_hdr(skb);
1137	struct tcphdr *th = tcp_hdr(skb);
1138	int length = (th->doff << 2) - sizeof(struct tcphdr);
1139	int genhash;
1140	unsigned char *ptr;
1141	unsigned char newhash[16];
1142
1143	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1144
1145	/*
1146	 * If the TCP option length is less than the TCP_MD5SIG
1147	 * option length, then we can shortcut.
1148	 */
1149	if (length < TCPOLEN_MD5SIG) {
1150		if (hash_expected)
1151			return 1;
1152		else
1153			return 0;
1154	}
1155
1156	/* Okay, we can't shortcut - we have to grub through the options */
1157	ptr = (unsigned char *)(th + 1);
1158	while (length > 0) {
1159		int opcode = *ptr++;
1160		int opsize;
1161
1162		switch (opcode) {
1163		case TCPOPT_EOL:
1164			goto done_opts;
1165		case TCPOPT_NOP:
1166			length--;
1167			continue;
1168		default:
1169			opsize = *ptr++;
1170			if (opsize < 2)
1171				goto done_opts;
1172			if (opsize > length)
1173				goto done_opts;
1174
1175			if (opcode == TCPOPT_MD5SIG) {
1176				hash_location = ptr;
1177				goto done_opts;
1178			}
1179		}
1180		ptr += opsize-2;
1181		length -= opsize;
1182	}
1183done_opts:
1184	/* We've parsed the options - do we have a hash? */
1185	if (!hash_expected && !hash_location)
1186		return 0;
1187
1188	if (hash_expected && !hash_location) {
1189		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1190			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1191			       NIPQUAD(iph->saddr), ntohs(th->source),
1192			       NIPQUAD(iph->daddr), ntohs(th->dest));
1193		return 1;
1194	}
1195
1196	if (!hash_expected && hash_location) {
1197		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1198			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1199			       NIPQUAD(iph->saddr), ntohs(th->source),
1200			       NIPQUAD(iph->daddr), ntohs(th->dest));
1201		return 1;
1202	}
1203
1204	/* Okay, so this is hash_expected and hash_location -
1205	 * so we need to calculate the checksum.
1206	 */
1207	genhash = tcp_v4_do_calc_md5_hash(newhash,
1208					  hash_expected,
1209					  iph->saddr, iph->daddr,
1210					  th, sk->sk_protocol,
1211					  skb->len);
1212
1213	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214		if (net_ratelimit()) {
1215			printk(KERN_INFO "MD5 Hash failed for "
1216			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1217			       NIPQUAD(iph->saddr), ntohs(th->source),
1218			       NIPQUAD(iph->daddr), ntohs(th->dest),
1219			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1220		}
1221		return 1;
1222	}
1223	return 0;
1224}
1225
1226#endif
1227
1228struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1229	.family		=	PF_INET,
1230	.obj_size	=	sizeof(struct tcp_request_sock),
1231	.rtx_syn_ack	=	tcp_v4_send_synack,
1232	.send_ack	=	tcp_v4_reqsk_send_ack,
1233	.destructor	=	tcp_v4_reqsk_destructor,
1234	.send_reset	=	tcp_v4_send_reset,
1235};
1236
1237#ifdef CONFIG_TCP_MD5SIG
1238static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1240};
1241#endif
1242
1243static struct timewait_sock_ops tcp_timewait_sock_ops = {
1244	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1245	.twsk_unique	= tcp_twsk_unique,
1246	.twsk_destructor= tcp_twsk_destructor,
1247};
1248
1249int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1250{
1251	struct inet_request_sock *ireq;
1252	struct tcp_options_received tmp_opt;
1253	struct request_sock *req;
1254	__be32 saddr = ip_hdr(skb)->saddr;
1255	__be32 daddr = ip_hdr(skb)->daddr;
1256	__u32 isn = TCP_SKB_CB(skb)->when;
1257	struct dst_entry *dst = NULL;
1258#ifdef CONFIG_SYN_COOKIES
1259	int want_cookie = 0;
1260#else
1261#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1262#endif
1263
1264	/* Never answer SYNs sent to broadcast or multicast */
1265	if (((struct rtable *)skb->dst)->rt_flags &
1266	    (RTCF_BROADCAST | RTCF_MULTICAST))
1267		goto drop;
1268
1269	/* TW buckets are converted to open requests without
1270	 * limitation: they conserve resources and the peer is
1271	 * evidently a real one.
1272	 */
1273	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1274#ifdef CONFIG_SYN_COOKIES
1275		if (sysctl_tcp_syncookies) {
1276			want_cookie = 1;
1277		} else
1278#endif
1279		goto drop;
1280	}
1281
1282	/* Accept backlog is full. If we have already queued enough
1283	 * warm entries in the syn queue, drop the request. That is better
1284	 * than clogging the syn queue with openreqs whose timeouts grow
1285	 * exponentially.
1286	 */
1287	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1288		goto drop;
1289
1290	req = reqsk_alloc(&tcp_request_sock_ops);
1291	if (!req)
1292		goto drop;
1293
1294#ifdef CONFIG_TCP_MD5SIG
1295	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1296#endif
1297
1298	tcp_clear_options(&tmp_opt);
1299	tmp_opt.mss_clamp = 536;
1300	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1301
1302	tcp_parse_options(skb, &tmp_opt, 0);
1303
1304	if (want_cookie) {
1305		tcp_clear_options(&tmp_opt);
1306		tmp_opt.saw_tstamp = 0;
1307	}
1308
1309	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1310		/* Some OSes (unknown ones, but I see them on a web server
1311		 * which contains information interesting only for Windows
1312		 * users) do not send their timestamp in the SYN. It is an easy
1313		 * case: we simply do not advertise TS support.
1314		 */
1315		tmp_opt.saw_tstamp = 0;
1316		tmp_opt.tstamp_ok  = 0;
1317	}
1318	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1319
1320	tcp_openreq_init(req, &tmp_opt, skb);
1321
1322	if (security_inet_conn_request(sk, skb, req))
1323		goto drop_and_free;
1324
1325	ireq = inet_rsk(req);
1326	ireq->loc_addr = daddr;
1327	ireq->rmt_addr = saddr;
1328	ireq->opt = tcp_v4_save_options(sk, skb);
1329	if (!want_cookie)
1330		TCP_ECN_create_request(req, tcp_hdr(skb));
1331
1332	if (want_cookie) {
1333#ifdef CONFIG_SYN_COOKIES
1334		syn_flood_warning(skb);
1335#endif
1336		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1337	} else if (!isn) {
1338		struct inet_peer *peer = NULL;
1339
1340		/* VJ's idea. We save the last timestamp seen
1341		 * from the destination in the peer table when entering
1342		 * TIME-WAIT state, and check against it before
1343		 * accepting a new connection request.
1344		 *
1345		 * If "isn" is not zero, this request hit an alive
1346		 * timewait bucket, so all the necessary checks
1347		 * are made in the function processing timewait state.
1348		 */
1349		if (tmp_opt.saw_tstamp &&
1350		    tcp_death_row.sysctl_tw_recycle &&
1351		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1352		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1353		    peer->v4daddr == saddr) {
1354			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1355			    (s32)(peer->tcp_ts - req->ts_recent) >
1356							TCP_PAWS_WINDOW) {
1357				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1358				dst_release(dst);
1359				goto drop_and_free;
1360			}
1361		}
1362		/* Kill the following clause, if you dislike this way. */
1363		else if (!sysctl_tcp_syncookies &&
1364			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1365			  (sysctl_max_syn_backlog >> 2)) &&
1366			 (!peer || !peer->tcp_ts_stamp) &&
1367			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1368			/* Without syncookies the last quarter of the
1369			 * backlog is reserved for destinations
1370			 * proven to be alive.
1371			 * It means that we continue to communicate
1372			 * with destinations already remembered
1373			 * at the moment of the synflood.
1374			 */
1375			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1376				       "request from %u.%u.%u.%u/%u\n",
1377				       NIPQUAD(saddr),
1378				       ntohs(tcp_hdr(skb)->source));
1379			dst_release(dst);
1380			goto drop_and_free;
1381		}
1382
1383		isn = tcp_v4_init_sequence(skb);
1384	}
1385	tcp_rsk(req)->snt_isn = isn;
1386
1387	if (tcp_v4_send_synack(sk, req, dst))
1388		goto drop_and_free;
1389
1390	if (want_cookie) {
1391		reqsk_free(req);
1392	} else {
1393		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1394	}
1395	return 0;
1396
1397drop_and_free:
1398	reqsk_free(req);
1399drop:
1400	return 0;
1401}
1402
1403
1404/*
1405 * The three way handshake has completed - we got a valid synack -
1406 * now create the new socket.
1407 */
1408struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409				  struct request_sock *req,
1410				  struct dst_entry *dst)
1411{
1412	struct inet_request_sock *ireq;
1413	struct inet_sock *newinet;
1414	struct tcp_sock *newtp;
1415	struct sock *newsk;
1416#ifdef CONFIG_TCP_MD5SIG
1417	struct tcp_md5sig_key *key;
1418#endif
1419
1420	if (sk_acceptq_is_full(sk))
1421		goto exit_overflow;
1422
1423	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1424		goto exit;
1425
1426	newsk = tcp_create_openreq_child(sk, req, skb);
1427	if (!newsk)
1428		goto exit;
1429
1430	newsk->sk_gso_type = SKB_GSO_TCPV4;
1431	sk_setup_caps(newsk, dst);
1432
1433	newtp		      = tcp_sk(newsk);
1434	newinet		      = inet_sk(newsk);
1435	ireq		      = inet_rsk(req);
1436	newinet->daddr	      = ireq->rmt_addr;
1437	newinet->rcv_saddr    = ireq->loc_addr;
1438	newinet->saddr	      = ireq->loc_addr;
1439	newinet->opt	      = ireq->opt;
1440	ireq->opt	      = NULL;
1441	newinet->mc_index     = inet_iif(skb);
1442	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1443	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444	if (newinet->opt)
1445		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1446	newinet->id = newtp->write_seq ^ jiffies;
1447
1448	tcp_mtup_init(newsk);
1449	tcp_sync_mss(newsk, dst_mtu(dst));
1450	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451	tcp_initialize_rcv_mss(newsk);
1452
1453#ifdef CONFIG_TCP_MD5SIG
1454	/* Copy over the MD5 key from the original socket */
1455	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1456		/*
1457		 * We're using one, so create a matching key
1458		 * on the newsk structure. If we fail to get
1459		 * memory, then we end up not copying the key
1460		 * across. Shucks.
1461		 */
1462		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463		if (newkey != NULL)
1464			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1465					  newkey, key->keylen);
1466	}
1467#endif
1468
1469	__inet_hash(&tcp_hashinfo, newsk, 0);
1470	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
1471
1472	return newsk;
1473
1474exit_overflow:
1475	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1476exit:
1477	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1478	dst_release(dst);
1479	return NULL;
1480}
1481
1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483{
1484	struct tcphdr *th = tcp_hdr(skb);
1485	const struct iphdr *iph = ip_hdr(skb);
1486	struct sock *nsk;
1487	struct request_sock **prev;
1488	/* Find possible connection requests. */
1489	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1490						       iph->saddr, iph->daddr);
1491	if (req)
1492		return tcp_check_req(sk, skb, req, prev);
1493
1494	nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1495				      iph->daddr, th->dest, inet_iif(skb));
1496
1497	if (nsk) {
1498		if (nsk->sk_state != TCP_TIME_WAIT) {
1499			bh_lock_sock(nsk);
1500			return nsk;
1501		}
1502		inet_twsk_put(inet_twsk(nsk));
1503		return NULL;
1504	}
1505
1506#ifdef CONFIG_SYN_COOKIES
1507	if (!th->rst && !th->syn && th->ack)
1508		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1509#endif
1510	return sk;
1511}
1512
1513static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1514{
1515	const struct iphdr *iph = ip_hdr(skb);
1516
1517	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1518		if (!tcp_v4_check(skb->len, iph->saddr,
1519				  iph->daddr, skb->csum)) {
1520			skb->ip_summed = CHECKSUM_UNNECESSARY;
1521			return 0;
1522		}
1523	}
1524
1525	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1526				       skb->len, IPPROTO_TCP, 0);
1527
1528	if (skb->len <= 76) {
1529		return __skb_checksum_complete(skb);
1530	}
1531	return 0;
1532}
1533
1534
1535/* The socket must have its spinlock held when we get
1536 * here.
1537 *
1538 * We have a potential double-lock case here, so even when
1539 * doing backlog processing we use the BH locking scheme.
1540 * This is because we cannot sleep with the original spinlock
1541 * held.
1542 */
1543int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1544{
1545	struct sock *rsk;
1546#ifdef CONFIG_TCP_MD5SIG
1547	/*
1548	 * We really want to reject the packet as early as possible
1549	 * if:
1550	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1551	 *  o There is an MD5 option and we're not expecting one
1552	 */
1553	if (tcp_v4_inbound_md5_hash(sk, skb))
1554		goto discard;
1555#endif
1556
1557	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558		TCP_CHECK_TIMER(sk);
1559		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1560			rsk = sk;
1561			goto reset;
1562		}
1563		TCP_CHECK_TIMER(sk);
1564		return 0;
1565	}
1566
1567	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1568		goto csum_err;
1569
1570	if (sk->sk_state == TCP_LISTEN) {
1571		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1572		if (!nsk)
1573			goto discard;
1574
1575		if (nsk != sk) {
1576			if (tcp_child_process(sk, nsk, skb)) {
1577				rsk = nsk;
1578				goto reset;
1579			}
1580			return 0;
1581		}
1582	}
1583
1584	TCP_CHECK_TIMER(sk);
1585	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1586		rsk = sk;
1587		goto reset;
1588	}
1589	TCP_CHECK_TIMER(sk);
1590	return 0;
1591
1592reset:
1593	tcp_v4_send_reset(rsk, skb);
1594discard:
1595	kfree_skb(skb);
1596	/* Be careful here. If this function gets more complicated and
1597	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1598	 * might be destroyed here. This current version compiles correctly,
1599	 * but you have been warned.
1600	 */
1601	return 0;
1602
1603csum_err:
1604	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1605	goto discard;
1606}
1607
1608/*
1609 *	From tcp_input.c
1610 */
1611
1612int tcp_v4_rcv(struct sk_buff *skb)
1613{
1614	const struct iphdr *iph;
1615	struct tcphdr *th;
1616	struct sock *sk;
1617	int ret;
1618
1619	if (skb->pkt_type != PACKET_HOST)
1620		goto discard_it;
1621
1622	/* Count it even if it's bad */
1623	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1624
1625	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1626		goto discard_it;
1627
1628	th = tcp_hdr(skb);
1629
1630	if (th->doff < sizeof(struct tcphdr) / 4)
1631		goto bad_packet;
1632	if (!pskb_may_pull(skb, th->doff * 4))
1633		goto discard_it;
1634
1635	/* An explanation is required here, I think.
1636	 * Packet length and doff are validated by header prediction,
1637	 * provided the case of th->doff == 0 is eliminated.
1638	 * So, we defer the checks. */
1639	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1640	     tcp_v4_checksum_init(skb)))
1641		goto bad_packet;
1642
1643	th = tcp_hdr(skb);
1644	iph = ip_hdr(skb);
1645	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1646	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1647				    skb->len - th->doff * 4);
1648	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1649	TCP_SKB_CB(skb)->when	 = 0;
1650	TCP_SKB_CB(skb)->flags	 = iph->tos;
1651	TCP_SKB_CB(skb)->sacked	 = 0;
1652
1653	sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1654			   iph->daddr, th->dest, inet_iif(skb));
1655	if (!sk)
1656		goto no_tcp_socket;
1657
1658process:
1659	if (sk->sk_state == TCP_TIME_WAIT)
1660		goto do_time_wait;
1661
1662	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1663		goto discard_and_relse;
1664	nf_reset(skb);
1665
1666	if (sk_filter(sk, skb))
1667		goto discard_and_relse;
1668
1669	skb->dev = NULL;
1670
1671	bh_lock_sock_nested(sk);
1672	ret = 0;
1673	if (!sock_owned_by_user(sk)) {
1674#ifdef CONFIG_NET_DMA
1675		struct tcp_sock *tp = tcp_sk(sk);
1676		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1677			tp->ucopy.dma_chan = get_softnet_dma();
1678		if (tp->ucopy.dma_chan)
1679			ret = tcp_v4_do_rcv(sk, skb);
1680		else
1681#endif
1682		{
1683			if (!tcp_prequeue(sk, skb))
1684			ret = tcp_v4_do_rcv(sk, skb);
1685		}
1686	} else
1687		sk_add_backlog(sk, skb);
1688	bh_unlock_sock(sk);
1689
1690	sock_put(sk);
1691
1692	return ret;
1693
1694no_tcp_socket:
1695	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1696		goto discard_it;
1697
1698	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1699bad_packet:
1700		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1701	} else {
1702		tcp_v4_send_reset(NULL, skb);
1703	}
1704
1705discard_it:
1706	/* Discard frame. */
1707	kfree_skb(skb);
1708	return 0;
1709
1710discard_and_relse:
1711	sock_put(sk);
1712	goto discard_it;
1713
1714do_time_wait:
1715	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1716		inet_twsk_put(inet_twsk(sk));
1717		goto discard_it;
1718	}
1719
1720	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1721		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1722		inet_twsk_put(inet_twsk(sk));
1723		goto discard_it;
1724	}
1725	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1726	case TCP_TW_SYN: {
1727		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1728							iph->daddr, th->dest,
1729							inet_iif(skb));
1730		if (sk2) {
1731			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1732			inet_twsk_put(inet_twsk(sk));
1733			sk = sk2;
1734			goto process;
1735		}
1736		/* Fall through to ACK */
1737	}
1738	case TCP_TW_ACK:
1739		tcp_v4_timewait_ack(sk, skb);
1740		break;
1741	case TCP_TW_RST:
1742		goto no_tcp_socket;
1743	case TCP_TW_SUCCESS:;
1744	}
1745	goto discard_it;
1746}
1747
1748/* VJ's idea. Save last timestamp seen from this destination
1749 * and hold it at least for normal timewait interval to use for duplicate
1750 * segment detection in subsequent connections, before they enter synchronized
1751 * state.
1752 */
1753
1754int tcp_v4_remember_stamp(struct sock *sk)
1755{
1756	struct inet_sock *inet = inet_sk(sk);
1757	struct tcp_sock *tp = tcp_sk(sk);
1758	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1759	struct inet_peer *peer = NULL;
1760	int release_it = 0;
1761
1762	if (!rt || rt->rt_dst != inet->daddr) {
1763		peer = inet_getpeer(inet->daddr, 1);
1764		release_it = 1;
1765	} else {
1766		if (!rt->peer)
1767			rt_bind_peer(rt, 1);
1768		peer = rt->peer;
1769	}
1770
1771	if (peer) {
1772		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1773		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1774		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1775			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1776			peer->tcp_ts = tp->rx_opt.ts_recent;
1777		}
1778		if (release_it)
1779			inet_putpeer(peer);
1780		return 1;
1781	}
1782
1783	return 0;
1784}
1785
1786int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1787{
1788	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1789
1790	if (peer) {
1791		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1792
1793		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1794		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1795		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1796			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1797			peer->tcp_ts	   = tcptw->tw_ts_recent;
1798		}
1799		inet_putpeer(peer);
1800		return 1;
1801	}
1802
1803	return 0;
1804}
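/*
 * This per-destination timestamp cache is what tw_recycle consults:
 * tcp_v4_connect() and tcp_v4_conn_request() compare a new connection's
 * timestamps against the values remembered here. A hedged sketch of
 * enabling it, assuming the standard sysctl name:
 *
 *	# sysctl -w net.ipv4.tcp_tw_recycle=1
 *
 * Note that several hosts behind a NAT sharing one address may then be
 * rejected, since their differing timestamps are checked against a
 * single per-address entry.
 */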
1805
1806struct inet_connection_sock_af_ops ipv4_specific = {
1807	.queue_xmit	   = ip_queue_xmit,
1808	.send_check	   = tcp_v4_send_check,
1809	.rebuild_header	   = inet_sk_rebuild_header,
1810	.conn_request	   = tcp_v4_conn_request,
1811	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1812	.remember_stamp	   = tcp_v4_remember_stamp,
1813	.net_header_len	   = sizeof(struct iphdr),
1814	.setsockopt	   = ip_setsockopt,
1815	.getsockopt	   = ip_getsockopt,
1816	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1817	.sockaddr_len	   = sizeof(struct sockaddr_in),
1818#ifdef CONFIG_COMPAT
1819	.compat_setsockopt = compat_ip_setsockopt,
1820	.compat_getsockopt = compat_ip_getsockopt,
1821#endif
1822};
1823
1824#ifdef CONFIG_TCP_MD5SIG
1825static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1826	.md5_lookup		= tcp_v4_md5_lookup,
1827	.calc_md5_hash		= tcp_v4_calc_md5_hash,
1828	.md5_add		= tcp_v4_md5_add_func,
1829	.md5_parse		= tcp_v4_parse_md5_keys,
1830};
1831#endif
1832
1833/* NOTE: A lot of things set to zero explicitly by call to
1834 *       sk_alloc() so need not be done here.
1835 */
1836static int tcp_v4_init_sock(struct sock *sk)
1837{
1838	struct inet_connection_sock *icsk = inet_csk(sk);
1839	struct tcp_sock *tp = tcp_sk(sk);
1840
1841	skb_queue_head_init(&tp->out_of_order_queue);
1842	tcp_init_xmit_timers(sk);
1843	tcp_prequeue_init(tp);
1844
1845	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1846	tp->mdev = TCP_TIMEOUT_INIT;
1847
1848	/* So many TCP implementations out there (incorrectly) count the
1849	 * initial SYN frame in their delayed-ACK and congestion control
1850	 * algorithms that we must have the following bandaid to talk
1851	 * efficiently to them.  -DaveM
1852	 */
1853	tp->snd_cwnd = 2;
1854
1855	/* See draft-stevens-tcpca-spec-01 for discussion of the
1856	 * initialization of these values.
1857	 */
1858	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1859	tp->snd_cwnd_clamp = ~0;
1860	tp->mss_cache = 536;
1861
1862	tp->reordering = sysctl_tcp_reordering;
1863	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1864
1865	sk->sk_state = TCP_CLOSE;
1866
1867	sk->sk_write_space = sk_stream_write_space;
1868	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1869
1870	icsk->icsk_af_ops = &ipv4_specific;
1871	icsk->icsk_sync_mss = tcp_sync_mss;
1872#ifdef CONFIG_TCP_MD5SIG
1873	tp->af_specific = &tcp_sock_ipv4_specific;
1874#endif
1875
1876	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1877	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1878
1879	atomic_inc(&tcp_sockets_allocated);
1880
1881	return 0;
1882}
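/*
 * icsk_ca_ops starts out as tcp_init_congestion_ops and is replaced by
 * the configured algorithm once the connection reaches established
 * state. A hedged sketch of selecting one explicitly, per socket, with
 * the standard TCP_CONGESTION option (the name must match an algorithm
 * registered via tcp_register_congestion_control()):
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", 4);
 *
 * The system-wide default comes from net.ipv4.tcp_congestion_control.
 */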
1883
1884int tcp_v4_destroy_sock(struct sock *sk)
1885{
1886	struct tcp_sock *tp = tcp_sk(sk);
1887
1888	tcp_clear_xmit_timers(sk);
1889
1890	tcp_cleanup_congestion_control(sk);
1891
1892	/* Clean up the write buffer. */
1893	tcp_write_queue_purge(sk);
1894
1895	/* Cleans up our, hopefully empty, out_of_order_queue. */
1896	__skb_queue_purge(&tp->out_of_order_queue);
1897
1898#ifdef CONFIG_TCP_MD5SIG
1899	/* Clean up the MD5 key list, if any */
1900	if (tp->md5sig_info) {
1901		tcp_v4_clear_md5_list(sk);
1902		kfree(tp->md5sig_info);
1903		tp->md5sig_info = NULL;
1904	}
1905#endif
1906
1907#ifdef CONFIG_NET_DMA
1908	/* Cleans up our sk_async_wait_queue */
1909	__skb_queue_purge(&sk->sk_async_wait_queue);
1910#endif
1911
1912	/* Clean prequeue, it must be empty really */
1913	__skb_queue_purge(&tp->ucopy.prequeue);
1914
1915	/* Clean up a referenced TCP bind bucket. */
1916	if (inet_csk(sk)->icsk_bind_hash)
1917		inet_put_port(&tcp_hashinfo, sk);
1918
1919	/*
1920	 * If sendmsg cached page exists, toss it.
1921	 */
1922	if (sk->sk_sndmsg_page) {
1923		__free_page(sk->sk_sndmsg_page);
1924		sk->sk_sndmsg_page = NULL;
1925	}
1926
1927	atomic_dec(&tcp_sockets_allocated);
1928
1929	return 0;
1930}
1931
1932EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933
1934#ifdef CONFIG_PROC_FS
1935/* Proc filesystem TCP sock list dumping. */
1936
1937static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1938{
1939	return hlist_empty(head) ? NULL :
1940		list_entry(head->first, struct inet_timewait_sock, tw_node);
1941}
1942
1943static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1944{
1945	return tw->tw_node.next ?
1946		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1947}
1948
1949static void *listening_get_next(struct seq_file *seq, void *cur)
1950{
1951	struct inet_connection_sock *icsk;
1952	struct hlist_node *node;
1953	struct sock *sk = cur;
1954	struct tcp_iter_state* st = seq->private;
1955
1956	if (!sk) {
1957		st->bucket = 0;
1958		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1959		goto get_sk;
1960	}
1961
1962	++st->num;
1963
1964	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1965		struct request_sock *req = cur;
1966
1967		icsk = inet_csk(st->syn_wait_sk);
1968		req = req->dl_next;
1969		while (1) {
1970			while (req) {
1971				if (req->rsk_ops->family == st->family) {
1972					cur = req;
1973					goto out;
1974				}
1975				req = req->dl_next;
1976			}
1977			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1978				break;
1979get_req:
1980			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1981		}
1982		sk	  = sk_next(st->syn_wait_sk);
1983		st->state = TCP_SEQ_STATE_LISTENING;
1984		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1985	} else {
1986		icsk = inet_csk(sk);
1987		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1988		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1989			goto start_req;
1990		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1991		sk = sk_next(sk);
1992	}
1993get_sk:
1994	sk_for_each_from(sk, node) {
1995		if (sk->sk_family == st->family) {
1996			cur = sk;
1997			goto out;
1998		}
1999		icsk = inet_csk(sk);
2000		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2001		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2002start_req:
2003			st->uid		= sock_i_uid(sk);
2004			st->syn_wait_sk = sk;
2005			st->state	= TCP_SEQ_STATE_OPENREQ;
2006			st->sbucket	= 0;
2007			goto get_req;
2008		}
2009		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2010	}
2011	if (++st->bucket < INET_LHTABLE_SIZE) {
2012		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2013		goto get_sk;
2014	}
2015	cur = NULL;
2016out:
2017	return cur;
2018}
2019
2020static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2021{
2022	void *rc = listening_get_next(seq, NULL);
2023
2024	while (rc && *pos) {
2025		rc = listening_get_next(seq, rc);
2026		--*pos;
2027	}
2028	return rc;
2029}
2030
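/*
 * The established walk covers two chains per ehash bucket: first the
 * regular sockets on .chain, then the TIME_WAIT sockets on .twchain
 * (reported in the TCP_SEQ_STATE_TIME_WAIT state).  The bucket's read
 * lock is held while that bucket is being walked and is dropped only
 * when moving on to the next bucket, or in tcp_seq_stop().
 */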
2031static void *established_get_first(struct seq_file *seq)
2032{
2033	struct tcp_iter_state *st = seq->private;
2034	void *rc = NULL;
2035
2036	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2037		struct sock *sk;
2038		struct hlist_node *node;
2039		struct inet_timewait_sock *tw;
2040
2041		/* We can reschedule _before_ having picked the target: */
2042		cond_resched_softirq();
2043
2044		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2045		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2046			if (sk->sk_family != st->family) {
2047				continue;
2048			}
2049			rc = sk;
2050			goto out;
2051		}
2052		st->state = TCP_SEQ_STATE_TIME_WAIT;
2053		inet_twsk_for_each(tw, node,
2054				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2055			if (tw->tw_family != st->family) {
2056				continue;
2057			}
2058			rc = tw;
2059			goto out;
2060		}
2061		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2062		st->state = TCP_SEQ_STATE_ESTABLISHED;
2063	}
2064out:
2065	return rc;
2066}
2067
2068static void *established_get_next(struct seq_file *seq, void *cur)
2069{
2070	struct sock *sk = cur;
2071	struct inet_timewait_sock *tw;
2072	struct hlist_node *node;
2073	struct tcp_iter_state *st = seq->private;
2074
2075	++st->num;
2076
2077	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2078		tw = cur;
2079		tw = tw_next(tw);
2080get_tw:
2081		while (tw && tw->tw_family != st->family) {
2082			tw = tw_next(tw);
2083		}
2084		if (tw) {
2085			cur = tw;
2086			goto out;
2087		}
2088		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2089		st->state = TCP_SEQ_STATE_ESTABLISHED;
2090
2091		/* We can reschedule between buckets: */
2092		cond_resched_softirq();
2093
2094		if (++st->bucket < tcp_hashinfo.ehash_size) {
2095			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2096			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2097		} else {
2098			cur = NULL;
2099			goto out;
2100		}
2101	} else
2102		sk = sk_next(sk);
2103
2104	sk_for_each_from(sk, node) {
2105		if (sk->sk_family == st->family)
2106			goto found;
2107	}
2108
2109	st->state = TCP_SEQ_STATE_TIME_WAIT;
2110	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2111	goto get_tw;
2112found:
2113	cur = sk;
2114out:
2115	return cur;
2116}
2117
2118static void *established_get_idx(struct seq_file *seq, loff_t pos)
2119{
2120	void *rc = established_get_first(seq);
2121
2122	while (rc && pos) {
2123		rc = established_get_next(seq, rc);
2124		--pos;
2125	}
2126	return rc;
2127}
2128
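/*
 * tcp_get_idx() positions the iterator at entry 'pos' across both walks:
 * listening sockets first (under the listen lock), then established and
 * TIME_WAIT sockets (with BHs disabled and the current ehash bucket's
 * lock held).  Whichever lock protects the returned entry remains held
 * until tcp_seq_stop(), or until tcp_seq_next() crosses over into the
 * established phase.
 */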
2129static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2130{
2131	void *rc;
2132	struct tcp_iter_state *st = seq->private;
2133
2134	inet_listen_lock(&tcp_hashinfo);
2135	st->state = TCP_SEQ_STATE_LISTENING;
2136	rc	  = listening_get_idx(seq, &pos);
2137
2138	if (!rc) {
2139		inet_listen_unlock(&tcp_hashinfo);
2140		local_bh_disable();
2141		st->state = TCP_SEQ_STATE_ESTABLISHED;
2142		rc	  = established_get_idx(seq, pos);
2143	}
2144
2145	return rc;
2146}
2147
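/*
 * seq_file iterator hooks.  ->start returns SEQ_START_TOKEN on the very
 * first call so that the show routine can emit the header line; otherwise
 * it repositions at *pos - 1, the token itself having consumed position 0.
 * ->next advances one entry and bumps *pos; ->stop releases whatever lock
 * the current iterator state still holds.
 */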
2148static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2149{
2150	struct tcp_iter_state *st = seq->private;
2151	st->state = TCP_SEQ_STATE_LISTENING;
2152	st->num = 0;
2153	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154}
2155
2156static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157{
2158	void *rc = NULL;
2159	struct tcp_iter_state *st;
2160
2161	if (v == SEQ_START_TOKEN) {
2162		rc = tcp_get_idx(seq, 0);
2163		goto out;
2164	}
2165	st = seq->private;
2166
2167	switch (st->state) {
2168	case TCP_SEQ_STATE_OPENREQ:
2169	case TCP_SEQ_STATE_LISTENING:
2170		rc = listening_get_next(seq, v);
2171		if (!rc) {
2172			inet_listen_unlock(&tcp_hashinfo);
2173			local_bh_disable();
2174			st->state = TCP_SEQ_STATE_ESTABLISHED;
2175			rc	  = established_get_first(seq);
2176		}
2177		break;
2178	case TCP_SEQ_STATE_ESTABLISHED:
2179	case TCP_SEQ_STATE_TIME_WAIT:
2180		rc = established_get_next(seq, v);
2181		break;
2182	}
2183out:
2184	++*pos;
2185	return rc;
2186}
2187
2188static void tcp_seq_stop(struct seq_file *seq, void *v)
2189{
2190	struct tcp_iter_state *st = seq->private;
2191
2192	switch (st->state) {
2193	case TCP_SEQ_STATE_OPENREQ:
2194		if (v) {
2195			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2196			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2197		}
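		/* fall through */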
2198	case TCP_SEQ_STATE_LISTENING:
2199		if (v != SEQ_START_TOKEN)
2200			inet_listen_unlock(&tcp_hashinfo);
2201		break;
2202	case TCP_SEQ_STATE_TIME_WAIT:
2203	case TCP_SEQ_STATE_ESTABLISHED:
2204		if (v)
2205			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2206		local_bh_enable();
2207		break;
2208	}
2209}
2210
2211static int tcp_seq_open(struct inode *inode, struct file *file)
2212{
2213	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2214	struct seq_file *seq;
2215	struct tcp_iter_state *s;
2216	int rc;
2217
2218	if (unlikely(afinfo == NULL))
2219		return -EINVAL;
2220
2221	s = kzalloc(sizeof(*s), GFP_KERNEL);
2222	if (!s)
2223		return -ENOMEM;
2224	s->family		= afinfo->family;
2225	s->seq_ops.start	= tcp_seq_start;
2226	s->seq_ops.next		= tcp_seq_next;
2227	s->seq_ops.show		= afinfo->seq_show;
2228	s->seq_ops.stop		= tcp_seq_stop;
2229
2230	rc = seq_open(file, &s->seq_ops);
2231	if (rc)
2232		goto out_kfree;
2233	seq	     = file->private_data;
2234	seq->private = s;
2235out:
2236	return rc;
2237out_kfree:
2238	kfree(s);
2239	goto out;
2240}
2241
2242int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2243{
2244	int rc = 0;
2245	struct proc_dir_entry *p;
2246
2247	if (!afinfo)
2248		return -EINVAL;
2249	afinfo->seq_fops->owner		= afinfo->owner;
2250	afinfo->seq_fops->open		= tcp_seq_open;
2251	afinfo->seq_fops->read		= seq_read;
2252	afinfo->seq_fops->llseek	= seq_lseek;
2253	afinfo->seq_fops->release	= seq_release_private;
2254
2255	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2256	if (p)
2257		p->data = afinfo;
2258	else
2259		rc = -ENOMEM;
2260	return rc;
2261}
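
/*
 * Typical usage of tcp_proc_register(): fill in a struct tcp_seq_afinfo
 * with a family, a show routine and a (zeroed) file_operations to take
 * over, then register it; tcp4_seq_afinfo further down is the in-tree
 * IPv4 example.  The sketch below is illustrative only: "tcpX" and
 * tcpX_seq_show() are made-up names, not symbols defined anywhere.
 */
#if 0
static struct file_operations tcpX_seq_fops;
static struct tcp_seq_afinfo tcpX_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcpX",		/* appears as /proc/net/tcpX */
	.family		= AF_INET,
	.seq_show	= tcpX_seq_show,	/* hypothetical show routine */
	.seq_fops	= &tcpX_seq_fops,
};

static int __init tcpX_proc_init(void)
{
	return tcp_proc_register(&tcpX_seq_afinfo);
}
#endif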
2262
2263void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2264{
2265	if (!afinfo)
2266		return;
2267	proc_net_remove(afinfo->name);
2268	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2269}
2270
2271static void get_openreq4(struct sock *sk, struct request_sock *req,
2272			 char *tmpbuf, int i, int uid)
2273{
2274	const struct inet_request_sock *ireq = inet_rsk(req);
2275	int ttd = req->expires - jiffies;
2276
2277	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2278		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2279		i,
2280		ireq->loc_addr,
2281		ntohs(inet_sk(sk)->sport),
2282		ireq->rmt_addr,
2283		ntohs(ireq->rmt_port),
2284		TCP_SYN_RECV,
2285		0, 0, /* could print option size, but that is af dependent. */
2286		1,    /* timers active (only the expire timer) */
2287		jiffies_to_clock_t(ttd),
2288		req->retrans,
2289		uid,
2290		0,  /* non standard timer */
2291		0, /* open_requests have no inode */
2292		atomic_read(&sk->sk_refcnt),
2293		req);
2294}
2295
2296static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2297{
2298	int timer_active;
2299	unsigned long timer_expires;
2300	struct tcp_sock *tp = tcp_sk(sk);
2301	const struct inet_connection_sock *icsk = inet_csk(sk);
2302	struct inet_sock *inet = inet_sk(sk);
2303	__be32 dest = inet->daddr;
2304	__be32 src = inet->rcv_saddr;
2305	__u16 destp = ntohs(inet->dport);
2306	__u16 srcp = ntohs(inet->sport);
2307
2308	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2309		timer_active	= 1;
2310		timer_expires	= icsk->icsk_timeout;
2311	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2312		timer_active	= 4;
2313		timer_expires	= icsk->icsk_timeout;
2314	} else if (timer_pending(&sk->sk_timer)) {
2315		timer_active	= 2;
2316		timer_expires	= sk->sk_timer.expires;
2317	} else {
2318		timer_active	= 0;
2319		timer_expires = jiffies;
2320	}
2321
2322	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2323			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
2324		i, src, srcp, dest, destp, sk->sk_state,
2325		tp->write_seq - tp->snd_una,
2326		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2327					     (tp->rcv_nxt - tp->copied_seq),
2328		timer_active,
2329		jiffies_to_clock_t(timer_expires - jiffies),
2330		icsk->icsk_retransmits,
2331		sock_i_uid(sk),
2332		icsk->icsk_probes_out,
2333		sock_i_ino(sk),
2334		atomic_read(&sk->sk_refcnt), sk,
2335		icsk->icsk_rto,
2336		icsk->icsk_ack.ato,
2337		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2338		tp->snd_cwnd,
2339		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2340}
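
/*
 * The %08X address fields above are the raw __be32 values, so on a
 * little-endian host 127.0.0.1 shows up as 0100007F; the ports go through
 * ntohs() and read naturally (0016 == 22).  A user-space sketch that parses
 * the leading fields of a /proc/net/tcp line (assuming it runs on the host
 * that produced the file, so the byte order matches) might look like the
 * fragment below; it is illustrative only and not part of this file.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>

static void show_tcp4_line(const char *line)
{
	unsigned int slot, state, sport, dport;
	unsigned int saddr, daddr;
	struct in_addr sin, din;

	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &slot, &saddr, &sport, &daddr, &dport, &state) != 6)
		return;		/* e.g. the header line */

	/* the parsed words carry the kernel's raw byte layout again */
	sin.s_addr = saddr;
	din.s_addr = daddr;
	printf("%s:%u -> ", inet_ntoa(sin), sport);
	printf("%s:%u (state %02X)\n", inet_ntoa(din), dport, state);
}
#endif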
2341
2342static void get_timewait4_sock(struct inet_timewait_sock *tw,
2343			       char *tmpbuf, int i)
2344{
2345	__be32 dest, src;
2346	__u16 destp, srcp;
2347	int ttd = tw->tw_ttd - jiffies;
2348
2349	if (ttd < 0)
2350		ttd = 0;
2351
2352	dest  = tw->tw_daddr;
2353	src   = tw->tw_rcv_saddr;
2354	destp = ntohs(tw->tw_dport);
2355	srcp  = ntohs(tw->tw_sport);
2356
2357	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2358		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2359		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2360		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2361		atomic_read(&tw->tw_refcnt), tw);
2362}
2363
2364#define TMPSZ 150
2365
2366static int tcp4_seq_show(struct seq_file *seq, void *v)
2367{
2368	struct tcp_iter_state *st;
2369	char tmpbuf[TMPSZ + 1];
2370
2371	if (v == SEQ_START_TOKEN) {
2372		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2373			   "  sl  local_address rem_address   st tx_queue "
2374			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2375			   "inode");
2376		goto out;
2377	}
2378	st = seq->private;
2379
2380	switch (st->state) {
2381	case TCP_SEQ_STATE_LISTENING:
2382	case TCP_SEQ_STATE_ESTABLISHED:
2383		get_tcp4_sock(v, tmpbuf, st->num);
2384		break;
2385	case TCP_SEQ_STATE_OPENREQ:
2386		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2387		break;
2388	case TCP_SEQ_STATE_TIME_WAIT:
2389		get_timewait4_sock(v, tmpbuf, st->num);
2390		break;
2391	}
2392	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2393out:
2394	return 0;
2395}
2396
2397static struct file_operations tcp4_seq_fops;
2398static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2399	.owner		= THIS_MODULE,
2400	.name		= "tcp",
2401	.family		= AF_INET,
2402	.seq_show	= tcp4_seq_show,
2403	.seq_fops	= &tcp4_seq_fops,
2404};
2405
2406int __init tcp4_proc_init(void)
2407{
2408	return tcp_proc_register(&tcp4_seq_afinfo);
2409}
2410
2411void tcp4_proc_exit(void)
2412{
2413	tcp_proc_unregister(&tcp4_seq_afinfo);
2414}
2415#endif /* CONFIG_PROC_FS */
2416
2417struct proto tcp_prot = {
2418	.name			= "TCP",
2419	.owner			= THIS_MODULE,
2420	.close			= tcp_close,
2421	.connect		= tcp_v4_connect,
2422	.disconnect		= tcp_disconnect,
2423	.accept			= inet_csk_accept,
2424	.ioctl			= tcp_ioctl,
2425	.init			= tcp_v4_init_sock,
2426	.destroy		= tcp_v4_destroy_sock,
2427	.shutdown		= tcp_shutdown,
2428	.setsockopt		= tcp_setsockopt,
2429	.getsockopt		= tcp_getsockopt,
2430	.sendmsg		= tcp_sendmsg,
2431	.recvmsg		= tcp_recvmsg,
2432	.backlog_rcv		= tcp_v4_do_rcv,
2433	.hash			= tcp_v4_hash,
2434	.unhash			= tcp_unhash,
2435	.get_port		= tcp_v4_get_port,
2436	.enter_memory_pressure	= tcp_enter_memory_pressure,
2437	.sockets_allocated	= &tcp_sockets_allocated,
2438	.orphan_count		= &tcp_orphan_count,
2439	.memory_allocated	= &tcp_memory_allocated,
2440	.memory_pressure	= &tcp_memory_pressure,
2441	.sysctl_mem		= sysctl_tcp_mem,
2442	.sysctl_wmem		= sysctl_tcp_wmem,
2443	.sysctl_rmem		= sysctl_tcp_rmem,
2444	.max_header		= MAX_TCP_HEADER,
2445	.obj_size		= sizeof(struct tcp_sock),
2446	.twsk_prot		= &tcp_timewait_sock_ops,
2447	.rsk_prot		= &tcp_request_sock_ops,
2448#ifdef CONFIG_COMPAT
2449	.compat_setsockopt	= compat_tcp_setsockopt,
2450	.compat_getsockopt	= compat_tcp_getsockopt,
2451#endif
2452};
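
/*
 * tcp_prot itself is not registered from this file: af_inet.c lists it in
 * its inetsw_array so that an AF_INET/SOCK_STREAM socket() call is bound to
 * these methods.  The sketch below shows the general shape of such an
 * entry; the field values are illustrative and the authoritative
 * initializer lives in af_inet.c.
 */
#if 0
	{
		.type     = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot     = &tcp_prot,		/* this struct proto */
		.ops      = &inet_stream_ops,	/* BSD-socket level ops */
		.flags    = INET_PROTOSW_PERMANENT,
	},
#endif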
2453
2454void __init tcp_v4_init(struct net_proto_family *ops)
2455{
2456	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2457				     IPPROTO_TCP) < 0)
2458		panic("Failed to create the TCP control socket.\n");
2459}
2460
2461EXPORT_SYMBOL(ipv4_specific);
2462EXPORT_SYMBOL(tcp_hashinfo);
2463EXPORT_SYMBOL(tcp_prot);
2464EXPORT_SYMBOL(tcp_unhash);
2465EXPORT_SYMBOL(tcp_v4_conn_request);
2466EXPORT_SYMBOL(tcp_v4_connect);
2467EXPORT_SYMBOL(tcp_v4_do_rcv);
2468EXPORT_SYMBOL(tcp_v4_remember_stamp);
2469EXPORT_SYMBOL(tcp_v4_send_check);
2470EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2471
2472#ifdef CONFIG_PROC_FS
2473EXPORT_SYMBOL(tcp_proc_register);
2474EXPORT_SYMBOL(tcp_proc_unregister);
2475#endif
2476EXPORT_SYMBOL(sysctl_local_port_range);
2477EXPORT_SYMBOL(sysctl_tcp_low_latency);
2478
2479