tcp_ipv4.c revision f40c8174d3c21bf178283f3ef3aa8c7bf238fdec
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 *		IPv4 specific functions
11 *
12 *
13 *		code split from:
14 *		linux/ipv4/tcp.c
15 *		linux/ipv4/tcp_input.c
16 *		linux/ipv4/tcp_output.c
17 *
18 *		See tcp.c for author information
19 *
20 *	This program is free software; you can redistribute it and/or
21 *      modify it under the terms of the GNU General Public License
22 *      as published by the Free Software Foundation; either version
23 *      2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 *		David S. Miller	:	New socket lookup architecture.
29 *					This code is dedicated to John Dyson.
30 *		David S. Miller :	Change semantics of established hash,
31 *					half is devoted to TIME_WAIT sockets
32 *					and the rest go in the other half.
33 *		Andi Kleen :		Add support for syncookies and fixed
34 *					some bugs: ip options weren't passed to
35 *					the TCP layer, missed a check for an
36 *					ACK bit.
37 *		Andi Kleen :		Implemented fast path mtu discovery.
38 *	     				Fixed many serious bugs in the
39 *					request_sock handling and moved
40 *					most of it into the af independent code.
41 *					Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 *		Mike McLagan	:	Routing by source
44 *	Juan Jose Ciarlante:		ip_dynaddr bits
45 *		Andi Kleen:		various fixes.
46 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
47 *					year-long coma.
48 *	Andi Kleen		:	Fix new listen.
49 *	Andi Kleen		:	Fix accept error reporting.
50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52 *					a single port at the same time.
53 */
54
55
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87
88/* Check TCP sequence numbers in ICMP packets. */
89#define ICMP_MIN_LENGTH 8
90
91/* Socket used for sending RSTs */
92static struct socket *tcp_socket __read_mostly;
93
94void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
95
96#ifdef CONFIG_TCP_MD5SIG
97static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
98						   __be32 addr);
99static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
100				   __be32 saddr, __be32 daddr,
101				   struct tcphdr *th, int protocol,
102				   unsigned int tcplen);
103#endif
104
105struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
106	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
107	.lhash_users = ATOMIC_INIT(0),
108	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109};
110
111static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
112{
113	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
114					  ip_hdr(skb)->saddr,
115					  tcp_hdr(skb)->dest,
116					  tcp_hdr(skb)->source);
117}
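/*
 * Passive-open ISN selection: the sequence number is derived from the
 * incoming segment's 4-tuple (addresses and ports) by
 * secure_tcp_sequence_number(), giving each connection its own
 * hard-to-predict starting point.
 */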
118
119int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120{
121	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
122	struct tcp_sock *tp = tcp_sk(sk);
123
124	/* With PAWS, it is safe from the viewpoint
125	   of data integrity. Even without PAWS it is safe provided sequence
126	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
127
128	   Actually, the idea is close to VJ's: only the timestamp cache is
129	   held not per host but per port pair, and the TW bucket is used as
130	   the state holder.
131
132	   If the TW bucket has already been destroyed we fall back to VJ's
133	   scheme and use the initial timestamp retrieved from the peer table.
134	 */
135	if (tcptw->tw_ts_recent_stamp &&
136	    (twp == NULL || (sysctl_tcp_tw_reuse &&
137			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
138		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
139		if (tp->write_seq == 0)
140			tp->write_seq = 1;
141		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
142		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
143		sock_hold(sktw);
144		return 1;
145	}
146
147	return 0;
148}
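/*
 * Returns 1 when the caller may reuse the port pair occupied by the
 * TIME-WAIT socket sktw: write_seq is advanced past the old
 * connection's sequence space and the cached timestamps are inherited,
 * so PAWS keeps old duplicates from being accepted.
 */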
149
150EXPORT_SYMBOL_GPL(tcp_twsk_unique);
151
152/* This will initiate an outgoing connection. */
153int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
154{
155	struct inet_sock *inet = inet_sk(sk);
156	struct tcp_sock *tp = tcp_sk(sk);
157	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
158	struct rtable *rt;
159	__be32 daddr, nexthop;
160	int tmp;
161	int err;
162
163	if (addr_len < sizeof(struct sockaddr_in))
164		return -EINVAL;
165
166	if (usin->sin_family != AF_INET)
167		return -EAFNOSUPPORT;
168
169	nexthop = daddr = usin->sin_addr.s_addr;
170	if (inet->opt && inet->opt->srr) {
171		if (!daddr)
172			return -EINVAL;
173		nexthop = inet->opt->faddr;
174	}
175
176	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
177			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
178			       IPPROTO_TCP,
179			       inet->sport, usin->sin_port, sk, 1);
180	if (tmp < 0) {
181		if (tmp == -ENETUNREACH)
182			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
183		return tmp;
184	}
185
186	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187		ip_rt_put(rt);
188		return -ENETUNREACH;
189	}
190
191	if (!inet->opt || !inet->opt->srr)
192		daddr = rt->rt_dst;
193
194	if (!inet->saddr)
195		inet->saddr = rt->rt_src;
196	inet->rcv_saddr = inet->saddr;
197
198	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199		/* Reset inherited state */
200		tp->rx_opt.ts_recent	   = 0;
201		tp->rx_opt.ts_recent_stamp = 0;
202		tp->write_seq		   = 0;
203	}
204
205	if (tcp_death_row.sysctl_tw_recycle &&
206	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207		struct inet_peer *peer = rt_get_peer(rt);
208		/*
209		 * VJ's idea. We save the last timestamp seen from
210		 * the destination in the peer table when entering
211		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
212		 * when trying a new connection.
213		 */
214		if (peer != NULL &&
215		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
216			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
217			tp->rx_opt.ts_recent = peer->tcp_ts;
218		}
219	}
220
221	inet->dport = usin->sin_port;
222	inet->daddr = daddr;
223
224	inet_csk(sk)->icsk_ext_hdr_len = 0;
225	if (inet->opt)
226		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
227
228	tp->rx_opt.mss_clamp = 536;
229
230	/* Socket identity is still unknown (sport may be zero).
231	 * However we set the state to SYN-SENT and, without releasing the
232	 * socket lock, select a source port, enter ourselves into the hash
233	 * tables and complete initialization after this.
234	 */
235	tcp_set_state(sk, TCP_SYN_SENT);
236	err = inet_hash_connect(&tcp_death_row, sk);
237	if (err)
238		goto failure;
239
240	err = ip_route_newports(&rt, IPPROTO_TCP,
241				inet->sport, inet->dport, sk);
242	if (err)
243		goto failure;
244
245	/* OK, now commit destination to socket.  */
246	sk->sk_gso_type = SKB_GSO_TCPV4;
247	sk_setup_caps(sk, &rt->u.dst);
248
249	if (!tp->write_seq)
250		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
251							   inet->daddr,
252							   inet->sport,
253							   usin->sin_port);
254
255	inet->id = tp->write_seq ^ jiffies;
256
257	err = tcp_connect(sk);
258	rt = NULL;
259	if (err)
260		goto failure;
261
262	return 0;
263
264failure:
265	/*
266	 * This unhashes the socket and releases the local port,
267	 * if necessary.
268	 */
269	tcp_set_state(sk, TCP_CLOSE);
270	ip_rt_put(rt);
271	sk->sk_route_caps = 0;
272	inet->dport = 0;
273	return err;
274}
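/*
 * Note on the ordering above: the socket is moved to SYN-SENT and
 * hashed (inet_hash_connect() also picks the ephemeral source port)
 * before the initial sequence number is chosen and tcp_connect()
 * builds and transmits the SYN.
 */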
275
276/*
277 * This routine does path mtu discovery as defined in RFC1191.
278 */
279static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
280{
281	struct dst_entry *dst;
282	struct inet_sock *inet = inet_sk(sk);
283
284	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
285	 * sent out by Linux are always < 576 bytes so they should go through
286	 * unfragmented).
287	 */
288	if (sk->sk_state == TCP_LISTEN)
289		return;
290
291	/* We don't check in the dst entry if pmtu discovery is forbidden
292	 * on this route. We just assume that no packet-too-big packets
293	 * are sent back when pmtu discovery is not active.
294	 * There is a small race when the user changes this flag in the
295	 * route, but I think that's acceptable.
296	 */
297	if ((dst = __sk_dst_check(sk, 0)) == NULL)
298		return;
299
300	dst->ops->update_pmtu(dst, mtu);
301
302	/* Something is about to go wrong... Remember the soft error
303	 * in case this connection is not able to recover.
304	 */
305	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
306		sk->sk_err_soft = EMSGSIZE;
307
308	mtu = dst_mtu(dst);
309
310	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
311	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
312		tcp_sync_mss(sk, mtu);
313
314		/* Resend the TCP packet because it's
315		 * clear that the old packet has been
316		 * dropped. This is the new "fast" path mtu
317		 * discovery.
318		 */
319		tcp_simple_retransmit(sk);
320	} /* else let the usual retransmit timer handle it */
321}
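/*
 * This is the "fast path" MTU discovery mentioned in the changelog:
 * instead of waiting for the retransmit timer, the cached route MTU is
 * updated, the MSS is re-synced, and the dropped segment is resent
 * immediately via tcp_simple_retransmit().
 */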
322
323/*
324 * This routine is called by the ICMP module when it gets some
325 * sort of error condition.  If err < 0 then the socket should
326 * be closed and the error returned to the user.  If err > 0
327 * it's just the icmp type << 8 | icmp code.  After adjustment,
328 * the header points to the first 8 bytes of the tcp header.  We need
329 * to find the appropriate port.
330 *
331 * The locking strategy used here is very "optimistic". When
332 * someone else accesses the socket the ICMP is just dropped
333 * and for some paths there is no check at all.
334 * A more general error queue to queue errors for later handling
335 * is probably better.
336 *
337 */
338
339void tcp_v4_err(struct sk_buff *skb, u32 info)
340{
341	struct iphdr *iph = (struct iphdr *)skb->data;
342	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
343	struct tcp_sock *tp;
344	struct inet_sock *inet;
345	const int type = icmp_hdr(skb)->type;
346	const int code = icmp_hdr(skb)->code;
347	struct sock *sk;
348	__u32 seq;
349	int err;
350
351	if (skb->len < (iph->ihl << 2) + 8) {
352		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
353		return;
354	}
355
356	sk = inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->daddr, th->dest,
357			iph->saddr, th->source, inet_iif(skb));
358	if (!sk) {
359		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
360		return;
361	}
362	if (sk->sk_state == TCP_TIME_WAIT) {
363		inet_twsk_put(inet_twsk(sk));
364		return;
365	}
366
367	bh_lock_sock(sk);
368	/* If too many ICMPs get dropped on busy
369	 * servers this needs to be solved differently.
370	 */
371	if (sock_owned_by_user(sk))
372		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
373
374	if (sk->sk_state == TCP_CLOSE)
375		goto out;
376
377	tp = tcp_sk(sk);
378	seq = ntohl(th->seq);
379	if (sk->sk_state != TCP_LISTEN &&
380	    !between(seq, tp->snd_una, tp->snd_nxt)) {
381		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
382		goto out;
383	}
384
385	switch (type) {
386	case ICMP_SOURCE_QUENCH:
387		/* Just silently ignore these. */
388		goto out;
389	case ICMP_PARAMETERPROB:
390		err = EPROTO;
391		break;
392	case ICMP_DEST_UNREACH:
393		if (code > NR_ICMP_UNREACH)
394			goto out;
395
396		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
397			if (!sock_owned_by_user(sk))
398				do_pmtu_discovery(sk, iph, info);
399			goto out;
400		}
401
402		err = icmp_err_convert[code].errno;
403		break;
404	case ICMP_TIME_EXCEEDED:
405		err = EHOSTUNREACH;
406		break;
407	default:
408		goto out;
409	}
410
411	switch (sk->sk_state) {
412		struct request_sock *req, **prev;
413	case TCP_LISTEN:
414		if (sock_owned_by_user(sk))
415			goto out;
416
417		req = inet_csk_search_req(sk, &prev, th->dest,
418					  iph->daddr, iph->saddr);
419		if (!req)
420			goto out;
421
422		/* ICMPs are not backlogged, hence we cannot get
423		   an established socket here.
424		 */
425		BUG_TRAP(!req->sk);
426
427		if (seq != tcp_rsk(req)->snt_isn) {
428			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
429			goto out;
430		}
431
432		/*
433		 * Still in SYN_RECV, just remove it silently.
434		 * There is no good way to pass the error to the newly
435		 * created socket, and POSIX does not want network
436		 * errors returned from accept().
437		 */
438		inet_csk_reqsk_queue_drop(sk, req, prev);
439		goto out;
440
441	case TCP_SYN_SENT:
442	case TCP_SYN_RECV:  /* Cannot happen.
443			       It can, for example, if SYNs crossed.
444			     */
445		if (!sock_owned_by_user(sk)) {
446			sk->sk_err = err;
447
448			sk->sk_error_report(sk);
449
450			tcp_done(sk);
451		} else {
452			sk->sk_err_soft = err;
453		}
454		goto out;
455	}
456
457	/* If we've already connected we will keep trying
458	 * until we time out, or the user gives up.
459	 *
460	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
461	 * considered hard errors (well, FRAG_FAILED too,
462	 * but it is obsoleted by pmtu discovery).
463	 *
464	 * Note that in the modern internet, where routing is unreliable
465	 * and broken firewalls sit in every dark corner sending random
466	 * errors ordered by their masters, even these two messages finally
467	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
468	 *
469	 * Now we are in compliance with RFCs.
470	 *							--ANK (980905)
471	 */
472
473	inet = inet_sk(sk);
474	if (!sock_owned_by_user(sk) && inet->recverr) {
475		sk->sk_err = err;
476		sk->sk_error_report(sk);
477	} else	{ /* Only an error on timeout */
478		sk->sk_err_soft = err;
479	}
480
481out:
482	bh_unlock_sock(sk);
483	sock_put(sk);
484}
485
486/* This routine computes an IPv4 TCP checksum. */
487void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
488{
489	struct inet_sock *inet = inet_sk(sk);
490	struct tcphdr *th = tcp_hdr(skb);
491
492	if (skb->ip_summed == CHECKSUM_PARTIAL) {
493		th->check = ~tcp_v4_check(len, inet->saddr,
494					  inet->daddr, 0);
495		skb->csum_start = skb_transport_header(skb) - skb->head;
496		skb->csum_offset = offsetof(struct tcphdr, check);
497	} else {
498		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
499					 csum_partial((char *)th,
500						      th->doff << 2,
501						      skb->csum));
502	}
503}
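/*
 * Two cases above: with CHECKSUM_PARTIAL only the pseudo-header sum is
 * filled in and csum_start/csum_offset tell the device (or a software
 * fallback) where to complete the checksum; otherwise the full TCP
 * checksum is computed here over the header and skb->csum.
 */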
504
505int tcp_v4_gso_send_check(struct sk_buff *skb)
506{
507	const struct iphdr *iph;
508	struct tcphdr *th;
509
510	if (!pskb_may_pull(skb, sizeof(*th)))
511		return -EINVAL;
512
513	iph = ip_hdr(skb);
514	th = tcp_hdr(skb);
515
516	th->check = 0;
517	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
518	skb->csum_start = skb_transport_header(skb) - skb->head;
519	skb->csum_offset = offsetof(struct tcphdr, check);
520	skb->ip_summed = CHECKSUM_PARTIAL;
521	return 0;
522}
523
524/*
525 *	This routine will send an RST to the other tcp.
526 *
527 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
528 *		      for the reset.
529 *	Answer: if a packet caused an RST, it is not for a socket
530 *		existing in our system; if it is matched to a socket,
531 *		it is just a duplicate segment or a bug in the other side's TCP.
532 *		So we build the reply based only on the parameters that
533 *		arrived with the segment.
534 *	Exception: precedence violation. We do not implement it in any case.
535 */
536
537static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
538{
539	struct tcphdr *th = tcp_hdr(skb);
540	struct {
541		struct tcphdr th;
542#ifdef CONFIG_TCP_MD5SIG
543		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
544#endif
545	} rep;
546	struct ip_reply_arg arg;
547#ifdef CONFIG_TCP_MD5SIG
548	struct tcp_md5sig_key *key;
549#endif
550
551	/* Never send a reset in response to a reset. */
552	if (th->rst)
553		return;
554
555	if (skb->rtable->rt_type != RTN_LOCAL)
556		return;
557
558	/* Swap the send and the receive. */
559	memset(&rep, 0, sizeof(rep));
560	rep.th.dest   = th->source;
561	rep.th.source = th->dest;
562	rep.th.doff   = sizeof(struct tcphdr) / 4;
563	rep.th.rst    = 1;
564
565	if (th->ack) {
566		rep.th.seq = th->ack_seq;
567	} else {
568		rep.th.ack = 1;
569		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
570				       skb->len - (th->doff << 2));
571	}
572
573	memset(&arg, 0, sizeof(arg));
574	arg.iov[0].iov_base = (unsigned char *)&rep;
575	arg.iov[0].iov_len  = sizeof(rep.th);
576
577#ifdef CONFIG_TCP_MD5SIG
578	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
579	if (key) {
580		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
581				   (TCPOPT_NOP << 16) |
582				   (TCPOPT_MD5SIG << 8) |
583				   TCPOLEN_MD5SIG);
584		/* Update length and the length the header thinks exists */
585		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
586		rep.th.doff = arg.iov[0].iov_len / 4;
587
588		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
589					key,
590					ip_hdr(skb)->daddr,
591					ip_hdr(skb)->saddr,
592					&rep.th, IPPROTO_TCP,
593					arg.iov[0].iov_len);
594	}
595#endif
596	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
597				      ip_hdr(skb)->saddr, /* XXX */
598				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
599	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
600
601	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
602
603	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
604	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
605}
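/*
 * RST generation above follows RFC 793: if the offending segment
 * carried an ACK, the reset uses that ack_seq as its sequence number;
 * otherwise the reset keeps sequence number 0 and ACKs the segment
 * (ack = seq + data length + SYN/FIN flags) so the other end can
 * match it.
 */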
606
607/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
608   outside socket context, is certainly ugly. What can I do?
609 */
610
611static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
612			    struct sk_buff *skb, u32 seq, u32 ack,
613			    u32 win, u32 ts)
614{
615	struct tcphdr *th = tcp_hdr(skb);
616	struct {
617		struct tcphdr th;
618		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
619#ifdef CONFIG_TCP_MD5SIG
620			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
621#endif
622			];
623	} rep;
624	struct ip_reply_arg arg;
625#ifdef CONFIG_TCP_MD5SIG
626	struct tcp_md5sig_key *key;
627	struct tcp_md5sig_key tw_key;
628#endif
629
630	memset(&rep.th, 0, sizeof(struct tcphdr));
631	memset(&arg, 0, sizeof(arg));
632
633	arg.iov[0].iov_base = (unsigned char *)&rep;
634	arg.iov[0].iov_len  = sizeof(rep.th);
635	if (ts) {
636		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
637				   (TCPOPT_TIMESTAMP << 8) |
638				   TCPOLEN_TIMESTAMP);
639		rep.opt[1] = htonl(tcp_time_stamp);
640		rep.opt[2] = htonl(ts);
641		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
642	}
643
644	/* Swap the send and the receive. */
645	rep.th.dest    = th->source;
646	rep.th.source  = th->dest;
647	rep.th.doff    = arg.iov[0].iov_len / 4;
648	rep.th.seq     = htonl(seq);
649	rep.th.ack_seq = htonl(ack);
650	rep.th.ack     = 1;
651	rep.th.window  = htons(win);
652
653#ifdef CONFIG_TCP_MD5SIG
654	/*
655	 * The SKB holds an incoming packet, but may not have a valid ->sk
656	 * pointer. This is especially the case when we're dealing with a
657	 * TIME_WAIT ack, because the sk structure is long gone, and only
658	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
659	 * structure, and we use it in preference.  I believe that (twsk ||
660	 * skb->sk) holds true, but we program defensively.
661	 */
662	if (!twsk && skb->sk) {
663		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
664	} else if (twsk && twsk->tw_md5_keylen) {
665		tw_key.key = twsk->tw_md5_key;
666		tw_key.keylen = twsk->tw_md5_keylen;
667		key = &tw_key;
668	} else
669		key = NULL;
670
671	if (key) {
672		int offset = (ts) ? 3 : 0;
673
674		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
675					  (TCPOPT_NOP << 16) |
676					  (TCPOPT_MD5SIG << 8) |
677					  TCPOLEN_MD5SIG);
678		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
679		rep.th.doff = arg.iov[0].iov_len/4;
680
681		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
682					key,
683					ip_hdr(skb)->daddr,
684					ip_hdr(skb)->saddr,
685					&rep.th, IPPROTO_TCP,
686					arg.iov[0].iov_len);
687	}
688#endif
689	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
690				      ip_hdr(skb)->saddr, /* XXX */
691				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
692	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
693	if (twsk)
694		arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
695
696	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
697
698	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
699}
700
701static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
702{
703	struct inet_timewait_sock *tw = inet_twsk(sk);
704	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
705
706	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
707			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
708			tcptw->tw_ts_recent);
709
710	inet_twsk_put(tw);
711}
712
713static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
714				  struct request_sock *req)
715{
716	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
717			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
718			req->ts_recent);
719}
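/*
 * The two wrappers above reuse tcp_v4_send_ack(): the TIME-WAIT case
 * replies with the bucket's cached snd_nxt/rcv_nxt and scaled window,
 * while the SYN-RECV case acknowledges the peer's ISN using the values
 * stored in the request_sock.
 */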
720
721/*
722 *	Send a SYN-ACK after having received a SYN.
723 *	This still operates on a request_sock only, not on a big
724 *	socket.
725 */
726static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
727				struct dst_entry *dst)
728{
729	const struct inet_request_sock *ireq = inet_rsk(req);
730	int err = -1;
731	struct sk_buff * skb;
732
733	/* First, grab a route. */
734	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
735		return -1;
736
737	skb = tcp_make_synack(sk, dst, req);
738
739	if (skb) {
740		struct tcphdr *th = tcp_hdr(skb);
741
742		th->check = tcp_v4_check(skb->len,
743					 ireq->loc_addr,
744					 ireq->rmt_addr,
745					 csum_partial((char *)th, skb->len,
746						      skb->csum));
747
748		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
749					    ireq->rmt_addr,
750					    ireq->opt);
751		err = net_xmit_eval(err);
752	}
753
754	dst_release(dst);
755	return err;
756}
757
758static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
759{
760	return __tcp_v4_send_synack(sk, req, NULL);
761}
762
763/*
764 *	IPv4 request_sock destructor.
765 */
766static void tcp_v4_reqsk_destructor(struct request_sock *req)
767{
768	kfree(inet_rsk(req)->opt);
769}
770
771#ifdef CONFIG_SYN_COOKIES
772static void syn_flood_warning(struct sk_buff *skb)
773{
774	static unsigned long warntime;
775
776	if (time_after(jiffies, (warntime + HZ * 60))) {
777		warntime = jiffies;
778		printk(KERN_INFO
779		       "possible SYN flooding on port %d. Sending cookies.\n",
780		       ntohs(tcp_hdr(skb)->dest));
781	}
782}
783#endif
784
785/*
786 * Save and compile IPv4 options into the request_sock if needed.
787 */
788static struct ip_options *tcp_v4_save_options(struct sock *sk,
789					      struct sk_buff *skb)
790{
791	struct ip_options *opt = &(IPCB(skb)->opt);
792	struct ip_options *dopt = NULL;
793
794	if (opt && opt->optlen) {
795		int opt_size = optlength(opt);
796		dopt = kmalloc(opt_size, GFP_ATOMIC);
797		if (dopt) {
798			if (ip_options_echo(dopt, skb)) {
799				kfree(dopt);
800				dopt = NULL;
801			}
802		}
803	}
804	return dopt;
805}
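/*
 * The listener copies the incoming SYN's IP options via
 * ip_options_echo() so the SYN-ACK (and later the child socket) can
 * carry the appropriate reply options; if the copy fails the request
 * simply proceeds without IP options.
 */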
806
807#ifdef CONFIG_TCP_MD5SIG
808/*
809 * RFC2385 MD5 checksumming requires a mapping of
810 * IP address->MD5 Key.
811 * We need to maintain these in the sk structure.
812 */
813
814/* Find the Key structure for an address.  */
815static struct tcp_md5sig_key *
816			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
817{
818	struct tcp_sock *tp = tcp_sk(sk);
819	int i;
820
821	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
822		return NULL;
823	for (i = 0; i < tp->md5sig_info->entries4; i++) {
824		if (tp->md5sig_info->keys4[i].addr == addr)
825			return &tp->md5sig_info->keys4[i].base;
826	}
827	return NULL;
828}
829
830struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
831					 struct sock *addr_sk)
832{
833	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
834}
835
836EXPORT_SYMBOL(tcp_v4_md5_lookup);
837
838static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
839						      struct request_sock *req)
840{
841	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
842}
843
844/* This can be called on a newly created socket, from other files */
845int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
846		      u8 *newkey, u8 newkeylen)
847{
848	/* Add Key to the list */
849	struct tcp_md5sig_key *key;
850	struct tcp_sock *tp = tcp_sk(sk);
851	struct tcp4_md5sig_key *keys;
852
853	key = tcp_v4_md5_do_lookup(sk, addr);
854	if (key) {
855		/* Pre-existing entry - just update that one. */
856		kfree(key->key);
857		key->key = newkey;
858		key->keylen = newkeylen;
859	} else {
860		struct tcp_md5sig_info *md5sig;
861
862		if (!tp->md5sig_info) {
863			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
864						  GFP_ATOMIC);
865			if (!tp->md5sig_info) {
866				kfree(newkey);
867				return -ENOMEM;
868			}
869			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
870		}
871		if (tcp_alloc_md5sig_pool() == NULL) {
872			kfree(newkey);
873			return -ENOMEM;
874		}
875		md5sig = tp->md5sig_info;
876
877		if (md5sig->alloced4 == md5sig->entries4) {
878			keys = kmalloc((sizeof(*keys) *
879					(md5sig->entries4 + 1)), GFP_ATOMIC);
880			if (!keys) {
881				kfree(newkey);
882				tcp_free_md5sig_pool();
883				return -ENOMEM;
884			}
885
886			if (md5sig->entries4)
887				memcpy(keys, md5sig->keys4,
888				       sizeof(*keys) * md5sig->entries4);
889
890			/* Free old key list, and reference new one */
891			kfree(md5sig->keys4);
892			md5sig->keys4 = keys;
893			md5sig->alloced4++;
894		}
895		md5sig->entries4++;
896		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
897		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
898		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
899	}
900	return 0;
901}
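/*
 * Key storage note: keys4 is a flat array grown by one slot at a time
 * (alloced4 tracks capacity, entries4 the fill level). newkey is
 * consumed by this function: it is either stored or kfree()d on error,
 * so callers must not free it themselves.
 */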
902
903EXPORT_SYMBOL(tcp_v4_md5_do_add);
904
905static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
906			       u8 *newkey, u8 newkeylen)
907{
908	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
909				 newkey, newkeylen);
910}
911
912int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
913{
914	struct tcp_sock *tp = tcp_sk(sk);
915	int i;
916
917	for (i = 0; i < tp->md5sig_info->entries4; i++) {
918		if (tp->md5sig_info->keys4[i].addr == addr) {
919			/* Free the key */
920			kfree(tp->md5sig_info->keys4[i].base.key);
921			tp->md5sig_info->entries4--;
922
923			if (tp->md5sig_info->entries4 == 0) {
924				kfree(tp->md5sig_info->keys4);
925				tp->md5sig_info->keys4 = NULL;
926				tp->md5sig_info->alloced4 = 0;
927			} else if (tp->md5sig_info->entries4 != i) {
928				/* Need to do some manipulation */
929				memmove(&tp->md5sig_info->keys4[i],
930					&tp->md5sig_info->keys4[i+1],
931					(tp->md5sig_info->entries4 - i) *
932					 sizeof(struct tcp4_md5sig_key));
933			}
934			tcp_free_md5sig_pool();
935			return 0;
936		}
937	}
938	return -ENOENT;
939}
940
941EXPORT_SYMBOL(tcp_v4_md5_do_del);
942
943static void tcp_v4_clear_md5_list(struct sock *sk)
944{
945	struct tcp_sock *tp = tcp_sk(sk);
946
947	/* Free each key, then the set of keys,
948	 * the crypto element, and then decrement our
949	 * hold on the last-resort crypto.
950	 */
951	if (tp->md5sig_info->entries4) {
952		int i;
953		for (i = 0; i < tp->md5sig_info->entries4; i++)
954			kfree(tp->md5sig_info->keys4[i].base.key);
955		tp->md5sig_info->entries4 = 0;
956		tcp_free_md5sig_pool();
957	}
958	if (tp->md5sig_info->keys4) {
959		kfree(tp->md5sig_info->keys4);
960		tp->md5sig_info->keys4 = NULL;
961		tp->md5sig_info->alloced4  = 0;
962	}
963}
964
965static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
966				 int optlen)
967{
968	struct tcp_md5sig cmd;
969	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
970	u8 *newkey;
971
972	if (optlen < sizeof(cmd))
973		return -EINVAL;
974
975	if (copy_from_user(&cmd, optval, sizeof(cmd)))
976		return -EFAULT;
977
978	if (sin->sin_family != AF_INET)
979		return -EINVAL;
980
981	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
982		if (!tcp_sk(sk)->md5sig_info)
983			return -ENOENT;
984		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
985	}
986
987	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
988		return -EINVAL;
989
990	if (!tcp_sk(sk)->md5sig_info) {
991		struct tcp_sock *tp = tcp_sk(sk);
992		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
993
994		if (!p)
995			return -EINVAL;
996
997		tp->md5sig_info = p;
998		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
999	}
1000
1001	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1002	if (!newkey)
1003		return -ENOMEM;
1004	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1005				 newkey, cmd.tcpm_keylen);
1006}
1007
1008static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1009				   __be32 saddr, __be32 daddr,
1010				   struct tcphdr *th, int protocol,
1011				   unsigned int tcplen)
1012{
1013	struct scatterlist sg[4];
1014	__u16 data_len;
1015	int block = 0;
1016	__sum16 old_checksum;
1017	struct tcp_md5sig_pool *hp;
1018	struct tcp4_pseudohdr *bp;
1019	struct hash_desc *desc;
1020	int err;
1021	unsigned int nbytes = 0;
1022
1023	/*
1024	 * Okay, so RFC2385 is turned on for this connection,
1025	 * so we need to generate the MD5 hash for the packet now.
1026	 */
1027
1028	hp = tcp_get_md5sig_pool();
1029	if (!hp)
1030		goto clear_hash_noput;
1031
1032	bp = &hp->md5_blk.ip4;
1033	desc = &hp->md5_desc;
1034
1035	/*
1036	 * 1. the TCP pseudo-header (in the order: source IP address,
1037	 * destination IP address, zero-padded protocol number, and
1038	 * segment length)
1039	 */
1040	bp->saddr = saddr;
1041	bp->daddr = daddr;
1042	bp->pad = 0;
1043	bp->protocol = protocol;
1044	bp->len = htons(tcplen);
1045
1046	sg_init_table(sg, 4);
1047
1048	sg_set_buf(&sg[block++], bp, sizeof(*bp));
1049	nbytes += sizeof(*bp);
1050
1051	/* 2. the TCP header, excluding options, and assuming a
1052	 * checksum of zero
1053	 */
1054	old_checksum = th->check;
1055	th->check = 0;
1056	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1057	nbytes += sizeof(struct tcphdr);
1058
1059	/* 3. the TCP segment data (if any) */
1060	data_len = tcplen - (th->doff << 2);
1061	if (data_len > 0) {
1062		unsigned char *data = (unsigned char *)th + (th->doff << 2);
1063		sg_set_buf(&sg[block++], data, data_len);
1064		nbytes += data_len;
1065	}
1066
1067	/* 4. an independently-specified key or password, known to both
1068	 * TCPs and presumably connection-specific
1069	 */
1070	sg_set_buf(&sg[block++], key->key, key->keylen);
1071	nbytes += key->keylen;
1072
1073	sg_mark_end(&sg[block - 1]);
1074
1075	/* Now store the Hash into the packet */
1076	err = crypto_hash_init(desc);
1077	if (err)
1078		goto clear_hash;
1079	err = crypto_hash_update(desc, sg, nbytes);
1080	if (err)
1081		goto clear_hash;
1082	err = crypto_hash_final(desc, md5_hash);
1083	if (err)
1084		goto clear_hash;
1085
1086	/* Reset header, and free up the crypto */
1087	tcp_put_md5sig_pool();
1088	th->check = old_checksum;
1089
1090out:
1091	return 0;
1092clear_hash:
1093	tcp_put_md5sig_pool();
1094clear_hash_noput:
1095	memset(md5_hash, 0, 16);
1096	goto out;
1097}
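/*
 * The four scatterlist entries above match the RFC 2385 digest input:
 * pseudo-header, TCP header with a zeroed checksum field, payload (if
 * any), then the key. On any failure the 16-byte output is zeroed, so
 * a later comparison against a real digest will not match.
 */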
1098
1099int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1100			 struct sock *sk,
1101			 struct dst_entry *dst,
1102			 struct request_sock *req,
1103			 struct tcphdr *th, int protocol,
1104			 unsigned int tcplen)
1105{
1106	__be32 saddr, daddr;
1107
1108	if (sk) {
1109		saddr = inet_sk(sk)->saddr;
1110		daddr = inet_sk(sk)->daddr;
1111	} else {
1112		struct rtable *rt = (struct rtable *)dst;
1113		BUG_ON(!rt);
1114		saddr = rt->rt_src;
1115		daddr = rt->rt_dst;
1116	}
1117	return tcp_v4_do_calc_md5_hash(md5_hash, key,
1118				       saddr, daddr,
1119				       th, protocol, tcplen);
1120}
1121
1122EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1123
1124static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1125{
1126	/*
1127	 * This gets called for each TCP segment that arrives
1128	 * so we want to be efficient.
1129	 * We have 3 drop cases:
1130	 * o No MD5 hash and one expected.
1131	 * o MD5 hash and we're not expecting one.
1132	 * o MD5 hash and it's wrong.
1133	 */
1134	__u8 *hash_location = NULL;
1135	struct tcp_md5sig_key *hash_expected;
1136	const struct iphdr *iph = ip_hdr(skb);
1137	struct tcphdr *th = tcp_hdr(skb);
1138	int length = (th->doff << 2) - sizeof(struct tcphdr);
1139	int genhash;
1140	unsigned char *ptr;
1141	unsigned char newhash[16];
1142
1143	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1144
1145	/*
1146	 * If the TCP option length is less than the TCP_MD5SIG
1147	 * option length, then we can shortcut
1148	 */
1149	if (length < TCPOLEN_MD5SIG) {
1150		if (hash_expected)
1151			return 1;
1152		else
1153			return 0;
1154	}
1155
1156	/* Okay, we can't shortcut - we have to grub through the options */
1157	ptr = (unsigned char *)(th + 1);
1158	while (length > 0) {
1159		int opcode = *ptr++;
1160		int opsize;
1161
1162		switch (opcode) {
1163		case TCPOPT_EOL:
1164			goto done_opts;
1165		case TCPOPT_NOP:
1166			length--;
1167			continue;
1168		default:
1169			opsize = *ptr++;
1170			if (opsize < 2)
1171				goto done_opts;
1172			if (opsize > length)
1173				goto done_opts;
1174
1175			if (opcode == TCPOPT_MD5SIG) {
1176				hash_location = ptr;
1177				goto done_opts;
1178			}
1179		}
1180		ptr += opsize-2;
1181		length -= opsize;
1182	}
1183done_opts:
1184	/* We've parsed the options - do we have a hash? */
1185	if (!hash_expected && !hash_location)
1186		return 0;
1187
1188	if (hash_expected && !hash_location) {
1189		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1190			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1191			       NIPQUAD(iph->saddr), ntohs(th->source),
1192			       NIPQUAD(iph->daddr), ntohs(th->dest));
1193		return 1;
1194	}
1195
1196	if (!hash_expected && hash_location) {
1197		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1198			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1199			       NIPQUAD(iph->saddr), ntohs(th->source),
1200			       NIPQUAD(iph->daddr), ntohs(th->dest));
1201		return 1;
1202	}
1203
1204	/* Okay, so we have both hash_expected and hash_location -
1205	 * so we need to calculate the hash and compare.
1206	 */
1207	genhash = tcp_v4_do_calc_md5_hash(newhash,
1208					  hash_expected,
1209					  iph->saddr, iph->daddr,
1210					  th, sk->sk_protocol,
1211					  skb->len);
1212
1213	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214		if (net_ratelimit()) {
1215			printk(KERN_INFO "MD5 Hash failed for "
1216			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1217			       NIPQUAD(iph->saddr), ntohs(th->source),
1218			       NIPQUAD(iph->daddr), ntohs(th->dest),
1219			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1220		}
1221		return 1;
1222	}
1223	return 0;
1224}
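/*
 * Return value convention: 1 means "drop this segment" (missing,
 * unexpected, or mismatching MD5 signature), 0 means it may be
 * processed further.
 */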
1225
1226#endif
1227
1228struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1229	.family		=	PF_INET,
1230	.obj_size	=	sizeof(struct tcp_request_sock),
1231	.rtx_syn_ack	=	tcp_v4_send_synack,
1232	.send_ack	=	tcp_v4_reqsk_send_ack,
1233	.destructor	=	tcp_v4_reqsk_destructor,
1234	.send_reset	=	tcp_v4_send_reset,
1235};
1236
1237#ifdef CONFIG_TCP_MD5SIG
1238static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1240};
1241#endif
1242
1243static struct timewait_sock_ops tcp_timewait_sock_ops = {
1244	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1245	.twsk_unique	= tcp_twsk_unique,
1246	.twsk_destructor= tcp_twsk_destructor,
1247};
1248
1249int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1250{
1251	struct inet_request_sock *ireq;
1252	struct tcp_options_received tmp_opt;
1253	struct request_sock *req;
1254	__be32 saddr = ip_hdr(skb)->saddr;
1255	__be32 daddr = ip_hdr(skb)->daddr;
1256	__u32 isn = TCP_SKB_CB(skb)->when;
1257	struct dst_entry *dst = NULL;
1258#ifdef CONFIG_SYN_COOKIES
1259	int want_cookie = 0;
1260#else
1261#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1262#endif
1263
1264	/* Never answer SYNs sent to broadcast or multicast */
1265	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1266		goto drop;
1267
1268	/* TW buckets are converted to open requests without
1269	 * limitation: they conserve resources and the peer is
1270	 * evidently a real one.
1271	 */
1272	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1273#ifdef CONFIG_SYN_COOKIES
1274		if (sysctl_tcp_syncookies) {
1275			want_cookie = 1;
1276		} else
1277#endif
1278		goto drop;
1279	}
1280
1281	/* Accept backlog is full. If we have already queued enough
1282	 * warm entries in the syn queue, drop the request. It is better than
1283	 * clogging the syn queue with openreqs with exponentially increasing
1284	 * timeouts.
1285	 */
1286	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1287		goto drop;
1288
1289	req = reqsk_alloc(&tcp_request_sock_ops);
1290	if (!req)
1291		goto drop;
1292
1293#ifdef CONFIG_TCP_MD5SIG
1294	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1295#endif
1296
1297	tcp_clear_options(&tmp_opt);
1298	tmp_opt.mss_clamp = 536;
1299	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1300
1301	tcp_parse_options(skb, &tmp_opt, 0);
1302
1303	if (want_cookie) {
1304		tcp_clear_options(&tmp_opt);
1305		tmp_opt.saw_tstamp = 0;
1306	}
1307
1308	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1309		/* Some OSes (unknown ones, but I see them on a web server which
1310		 * contains information interesting only for windows
1311		 * users) do not send their stamp in the SYN. It is an easy case.
1312		 * We simply do not advertise TS support.
1313		 */
1314		tmp_opt.saw_tstamp = 0;
1315		tmp_opt.tstamp_ok  = 0;
1316	}
1317	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1318
1319	tcp_openreq_init(req, &tmp_opt, skb);
1320
1321	if (security_inet_conn_request(sk, skb, req))
1322		goto drop_and_free;
1323
1324	ireq = inet_rsk(req);
1325	ireq->loc_addr = daddr;
1326	ireq->rmt_addr = saddr;
1327	ireq->opt = tcp_v4_save_options(sk, skb);
1328	if (!want_cookie)
1329		TCP_ECN_create_request(req, tcp_hdr(skb));
1330
1331	if (want_cookie) {
1332#ifdef CONFIG_SYN_COOKIES
1333		syn_flood_warning(skb);
1334#endif
1335		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1336	} else if (!isn) {
1337		struct inet_peer *peer = NULL;
1338
1339		/* VJ's idea. We save the last timestamp seen
1340		 * from the destination in the peer table when entering
1341		 * TIME-WAIT state, and check against it before
1342		 * accepting a new connection request.
1343		 *
1344		 * If "isn" is not zero, this request hit a live
1345		 * timewait bucket, so all the necessary checks
1346		 * are made in the function processing timewait state.
1347		 */
1348		if (tmp_opt.saw_tstamp &&
1349		    tcp_death_row.sysctl_tw_recycle &&
1350		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1351		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352		    peer->v4daddr == saddr) {
1353			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1354			    (s32)(peer->tcp_ts - req->ts_recent) >
1355							TCP_PAWS_WINDOW) {
1356				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1357				goto drop_and_release;
1358			}
1359		}
1360		/* Kill the following clause, if you dislike this way. */
1361		else if (!sysctl_tcp_syncookies &&
1362			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1363			  (sysctl_max_syn_backlog >> 2)) &&
1364			 (!peer || !peer->tcp_ts_stamp) &&
1365			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1366			/* Without syncookies the last quarter of the
1367			 * backlog is reserved for destinations
1368			 * proven to be alive.
1369			 * It means that we continue to communicate
1370			 * with destinations already remembered
1371			 * by the moment of the synflood.
1372			 */
1373			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1374				       "request from %u.%u.%u.%u/%u\n",
1375				       NIPQUAD(saddr),
1376				       ntohs(tcp_hdr(skb)->source));
1377			goto drop_and_release;
1378		}
1379
1380		isn = tcp_v4_init_sequence(skb);
1381	}
1382	tcp_rsk(req)->snt_isn = isn;
1383
1384	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1385		goto drop_and_free;
1386
1387	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1388	return 0;
1389
1390drop_and_release:
1391	dst_release(dst);
1392drop_and_free:
1393	reqsk_free(req);
1394drop:
1395	return 0;
1396}
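/*
 * Note that tcp_v4_conn_request() returns 0 on every path, including
 * the drop labels: listener-side failures are silent, and the SYN is
 * either answered with a SYN-ACK (carrying a syncookie ISN when
 * want_cookie is set) or quietly discarded.
 */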
1397
1398
1399/*
1400 * The three way handshake has completed - we got a valid synack -
1401 * now create the new socket.
1402 */
1403struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1404				  struct request_sock *req,
1405				  struct dst_entry *dst)
1406{
1407	struct inet_request_sock *ireq;
1408	struct inet_sock *newinet;
1409	struct tcp_sock *newtp;
1410	struct sock *newsk;
1411#ifdef CONFIG_TCP_MD5SIG
1412	struct tcp_md5sig_key *key;
1413#endif
1414
1415	if (sk_acceptq_is_full(sk))
1416		goto exit_overflow;
1417
1418	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1419		goto exit;
1420
1421	newsk = tcp_create_openreq_child(sk, req, skb);
1422	if (!newsk)
1423		goto exit;
1424
1425	newsk->sk_gso_type = SKB_GSO_TCPV4;
1426	sk_setup_caps(newsk, dst);
1427
1428	newtp		      = tcp_sk(newsk);
1429	newinet		      = inet_sk(newsk);
1430	ireq		      = inet_rsk(req);
1431	newinet->daddr	      = ireq->rmt_addr;
1432	newinet->rcv_saddr    = ireq->loc_addr;
1433	newinet->saddr	      = ireq->loc_addr;
1434	newinet->opt	      = ireq->opt;
1435	ireq->opt	      = NULL;
1436	newinet->mc_index     = inet_iif(skb);
1437	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1438	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1439	if (newinet->opt)
1440		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1441	newinet->id = newtp->write_seq ^ jiffies;
1442
1443	tcp_mtup_init(newsk);
1444	tcp_sync_mss(newsk, dst_mtu(dst));
1445	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1446	tcp_initialize_rcv_mss(newsk);
1447
1448#ifdef CONFIG_TCP_MD5SIG
1449	/* Copy over the MD5 key from the original socket */
1450	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1451		/*
1452		 * We're using one, so create a matching key
1453		 * on the newsk structure. If we fail to get
1454		 * memory, then we end up not copying the key
1455		 * across. Shucks.
1456		 */
1457		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1458		if (newkey != NULL)
1459			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1460					  newkey, key->keylen);
1461	}
1462#endif
1463
1464	__inet_hash_nolisten(newsk);
1465	__inet_inherit_port(sk, newsk);
1466
1467	return newsk;
1468
1469exit_overflow:
1470	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1471exit:
1472	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1473	dst_release(dst);
1474	return NULL;
1475}
1476
1477static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1478{
1479	struct tcphdr *th = tcp_hdr(skb);
1480	const struct iphdr *iph = ip_hdr(skb);
1481	struct sock *nsk;
1482	struct request_sock **prev;
1483	/* Find possible connection requests. */
1484	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1485						       iph->saddr, iph->daddr);
1486	if (req)
1487		return tcp_check_req(sk, skb, req, prev);
1488
1489	nsk = inet_lookup_established(sk->sk_net, &tcp_hashinfo, iph->saddr,
1490			th->source, iph->daddr, th->dest, inet_iif(skb));
1491
1492	if (nsk) {
1493		if (nsk->sk_state != TCP_TIME_WAIT) {
1494			bh_lock_sock(nsk);
1495			return nsk;
1496		}
1497		inet_twsk_put(inet_twsk(nsk));
1498		return NULL;
1499	}
1500
1501#ifdef CONFIG_SYN_COOKIES
1502	if (!th->rst && !th->syn && th->ack)
1503		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1504#endif
1505	return sk;
1506}
1507
1508static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1509{
1510	const struct iphdr *iph = ip_hdr(skb);
1511
1512	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1513		if (!tcp_v4_check(skb->len, iph->saddr,
1514				  iph->daddr, skb->csum)) {
1515			skb->ip_summed = CHECKSUM_UNNECESSARY;
1516			return 0;
1517		}
1518	}
1519
1520	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1521				       skb->len, IPPROTO_TCP, 0);
1522
1523	if (skb->len <= 76) {
1524		return __skb_checksum_complete(skb);
1525	}
1526	return 0;
1527}
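/*
 * Checksum strategy: if the device already produced a full checksum
 * (CHECKSUM_COMPLETE) it is verified here; otherwise only the
 * pseudo-header seed is stored in skb->csum, short packets
 * (<= 76 bytes) are verified immediately, and larger ones are checked
 * later, e.g. when the data is copied to user space.
 */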
1528
1529
1530/* The socket must have its spinlock held when we get
1531 * here.
1532 *
1533 * We have a potential double-lock case here, so even when
1534 * doing backlog processing we use the BH locking scheme.
1535 * This is because we cannot sleep with the original spinlock
1536 * held.
1537 */
1538int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1539{
1540	struct sock *rsk;
1541#ifdef CONFIG_TCP_MD5SIG
1542	/*
1543	 * We really want to reject the packet as early as possible
1544	 * if:
1545	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1546	 *  o There is an MD5 option and we're not expecting one
1547	 */
1548	if (tcp_v4_inbound_md5_hash(sk, skb))
1549		goto discard;
1550#endif
1551
1552	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553		TCP_CHECK_TIMER(sk);
1554		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1555			rsk = sk;
1556			goto reset;
1557		}
1558		TCP_CHECK_TIMER(sk);
1559		return 0;
1560	}
1561
1562	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1563		goto csum_err;
1564
1565	if (sk->sk_state == TCP_LISTEN) {
1566		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1567		if (!nsk)
1568			goto discard;
1569
1570		if (nsk != sk) {
1571			if (tcp_child_process(sk, nsk, skb)) {
1572				rsk = nsk;
1573				goto reset;
1574			}
1575			return 0;
1576		}
1577	}
1578
1579	TCP_CHECK_TIMER(sk);
1580	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1581		rsk = sk;
1582		goto reset;
1583	}
1584	TCP_CHECK_TIMER(sk);
1585	return 0;
1586
1587reset:
1588	tcp_v4_send_reset(rsk, skb);
1589discard:
1590	kfree_skb(skb);
1591	/* Be careful here. If this function gets more complicated and
1592	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1593	 * might be destroyed here. This current version compiles correctly,
1594	 * but you have been warned.
1595	 */
1596	return 0;
1597
1598csum_err:
1599	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1600	goto discard;
1601}
1602
1603/*
1604 *	From tcp_input.c
1605 */
1606
1607int tcp_v4_rcv(struct sk_buff *skb)
1608{
1609	const struct iphdr *iph;
1610	struct tcphdr *th;
1611	struct sock *sk;
1612	int ret;
1613
1614	if (skb->pkt_type != PACKET_HOST)
1615		goto discard_it;
1616
1617	/* Count it even if it's bad */
1618	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1619
1620	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1621		goto discard_it;
1622
1623	th = tcp_hdr(skb);
1624
1625	if (th->doff < sizeof(struct tcphdr) / 4)
1626		goto bad_packet;
1627	if (!pskb_may_pull(skb, th->doff * 4))
1628		goto discard_it;
1629
1630	/* An explanation is required here, I think.
1631	 * Packet length and doff are validated by header prediction,
1632	 * provided the case of th->doff==0 is eliminated.
1633	 * So, we defer the checks. */
1634	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1635		goto bad_packet;
1636
1637	th = tcp_hdr(skb);
1638	iph = ip_hdr(skb);
1639	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1640	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1641				    skb->len - th->doff * 4);
1642	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1643	TCP_SKB_CB(skb)->when	 = 0;
1644	TCP_SKB_CB(skb)->flags	 = iph->tos;
1645	TCP_SKB_CB(skb)->sacked	 = 0;
1646
1647	sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
1648			th->source, iph->daddr, th->dest, inet_iif(skb));
1649	if (!sk)
1650		goto no_tcp_socket;
1651
1652process:
1653	if (sk->sk_state == TCP_TIME_WAIT)
1654		goto do_time_wait;
1655
1656	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1657		goto discard_and_relse;
1658	nf_reset(skb);
1659
1660	if (sk_filter(sk, skb))
1661		goto discard_and_relse;
1662
1663	skb->dev = NULL;
1664
1665	bh_lock_sock_nested(sk);
1666	ret = 0;
1667	if (!sock_owned_by_user(sk)) {
1668#ifdef CONFIG_NET_DMA
1669		struct tcp_sock *tp = tcp_sk(sk);
1670		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1671			tp->ucopy.dma_chan = get_softnet_dma();
1672		if (tp->ucopy.dma_chan)
1673			ret = tcp_v4_do_rcv(sk, skb);
1674		else
1675#endif
1676		{
1677			if (!tcp_prequeue(sk, skb))
1678				ret = tcp_v4_do_rcv(sk, skb);
1679		}
1680	} else
1681		sk_add_backlog(sk, skb);
1682	bh_unlock_sock(sk);
1683
1684	sock_put(sk);
1685
1686	return ret;
1687
1688no_tcp_socket:
1689	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1690		goto discard_it;
1691
1692	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1693bad_packet:
1694		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1695	} else {
1696		tcp_v4_send_reset(NULL, skb);
1697	}
1698
1699discard_it:
1700	/* Discard frame. */
1701	kfree_skb(skb);
1702	return 0;
1703
1704discard_and_relse:
1705	sock_put(sk);
1706	goto discard_it;
1707
1708do_time_wait:
1709	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1710		inet_twsk_put(inet_twsk(sk));
1711		goto discard_it;
1712	}
1713
1714	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1715		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1716		inet_twsk_put(inet_twsk(sk));
1717		goto discard_it;
1718	}
1719	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1720	case TCP_TW_SYN: {
1721		struct sock *sk2 = inet_lookup_listener(skb->dev->nd_net,
1722							&tcp_hashinfo,
1723							iph->daddr, th->dest,
1724							inet_iif(skb));
1725		if (sk2) {
1726			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1727			inet_twsk_put(inet_twsk(sk));
1728			sk = sk2;
1729			goto process;
1730		}
1731		/* Fall through to ACK */
1732	}
1733	case TCP_TW_ACK:
1734		tcp_v4_timewait_ack(sk, skb);
1735		break;
1736	case TCP_TW_RST:
1737		goto no_tcp_socket;
1738	case TCP_TW_SUCCESS:;
1739	}
1740	goto discard_it;
1741}
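/*
 * Receive dispatch summary: segments for a user-locked socket are
 * queued on the backlog; otherwise they go through the prequeue (or
 * the NET_DMA path when configured) or straight into tcp_v4_do_rcv().
 * TIME-WAIT sockets are handled separately above, including
 * re-dispatching a new SYN to a matching listener.
 */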
1742
1743/* VJ's idea. Save the last timestamp seen from this destination
1744 * and hold it at least for the normal timewait interval, to use for duplicate
1745 * segment detection in subsequent connections, before they enter synchronized
1746 * state.
1747 */
1748
1749int tcp_v4_remember_stamp(struct sock *sk)
1750{
1751	struct inet_sock *inet = inet_sk(sk);
1752	struct tcp_sock *tp = tcp_sk(sk);
1753	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1754	struct inet_peer *peer = NULL;
1755	int release_it = 0;
1756
1757	if (!rt || rt->rt_dst != inet->daddr) {
1758		peer = inet_getpeer(inet->daddr, 1);
1759		release_it = 1;
1760	} else {
1761		if (!rt->peer)
1762			rt_bind_peer(rt, 1);
1763		peer = rt->peer;
1764	}
1765
1766	if (peer) {
1767		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1768		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1769		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1770			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1771			peer->tcp_ts = tp->rx_opt.ts_recent;
1772		}
1773		if (release_it)
1774			inet_putpeer(peer);
1775		return 1;
1776	}
1777
1778	return 0;
1779}
1780
1781int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1782{
1783	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1784
1785	if (peer) {
1786		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1787
1788		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1789		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1790		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1791			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1792			peer->tcp_ts	   = tcptw->tw_ts_recent;
1793		}
1794		inet_putpeer(peer);
1795		return 1;
1796	}
1797
1798	return 0;
1799}
1800
1801struct inet_connection_sock_af_ops ipv4_specific = {
1802	.queue_xmit	   = ip_queue_xmit,
1803	.send_check	   = tcp_v4_send_check,
1804	.rebuild_header	   = inet_sk_rebuild_header,
1805	.conn_request	   = tcp_v4_conn_request,
1806	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1807	.remember_stamp	   = tcp_v4_remember_stamp,
1808	.net_header_len	   = sizeof(struct iphdr),
1809	.setsockopt	   = ip_setsockopt,
1810	.getsockopt	   = ip_getsockopt,
1811	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1812	.sockaddr_len	   = sizeof(struct sockaddr_in),
1813	.bind_conflict	   = inet_csk_bind_conflict,
1814#ifdef CONFIG_COMPAT
1815	.compat_setsockopt = compat_ip_setsockopt,
1816	.compat_getsockopt = compat_ip_getsockopt,
1817#endif
1818};
1819
1820#ifdef CONFIG_TCP_MD5SIG
1821static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1822	.md5_lookup		= tcp_v4_md5_lookup,
1823	.calc_md5_hash		= tcp_v4_calc_md5_hash,
1824	.md5_add		= tcp_v4_md5_add_func,
1825	.md5_parse		= tcp_v4_parse_md5_keys,
1826};
1827#endif
1828
1829/* NOTE: A lot of things are set to zero explicitly by the call to
1830 *       sk_alloc(), so they need not be done here.
1831 */
1832static int tcp_v4_init_sock(struct sock *sk)
1833{
1834	struct inet_connection_sock *icsk = inet_csk(sk);
1835	struct tcp_sock *tp = tcp_sk(sk);
1836
1837	skb_queue_head_init(&tp->out_of_order_queue);
1838	tcp_init_xmit_timers(sk);
1839	tcp_prequeue_init(tp);
1840
1841	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1842	tp->mdev = TCP_TIMEOUT_INIT;
1843
1844	/* So many TCP implementations out there (incorrectly) count the
1845	 * initial SYN frame in their delayed-ACK and congestion control
1846	 * algorithms that we must have the following bandaid to talk
1847	 * efficiently to them.  -DaveM
1848	 */
1849	tp->snd_cwnd = 2;
1850
1851	/* See draft-stevens-tcpca-spec-01 for discussion of the
1852	 * initialization of these values.
1853	 */
1854	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1855	tp->snd_cwnd_clamp = ~0;
1856	tp->mss_cache = 536;
1857
1858	tp->reordering = sysctl_tcp_reordering;
1859	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1860
1861	sk->sk_state = TCP_CLOSE;
1862
1863	sk->sk_write_space = sk_stream_write_space;
1864	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1865
1866	icsk->icsk_af_ops = &ipv4_specific;
1867	icsk->icsk_sync_mss = tcp_sync_mss;
1868#ifdef CONFIG_TCP_MD5SIG
1869	tp->af_specific = &tcp_sock_ipv4_specific;
1870#endif
1871
1872	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1873	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1874
1875	atomic_inc(&tcp_sockets_allocated);
1876
1877	return 0;
1878}
1879
1880int tcp_v4_destroy_sock(struct sock *sk)
1881{
1882	struct tcp_sock *tp = tcp_sk(sk);
1883
1884	tcp_clear_xmit_timers(sk);
1885
1886	tcp_cleanup_congestion_control(sk);
1887
1888	/* Clean up the write buffer. */
1889	tcp_write_queue_purge(sk);
1890
1891	/* Cleans up our, hopefully empty, out_of_order_queue. */
1892	__skb_queue_purge(&tp->out_of_order_queue);
1893
1894#ifdef CONFIG_TCP_MD5SIG
1895	/* Clean up the MD5 key list, if any */
1896	if (tp->md5sig_info) {
1897		tcp_v4_clear_md5_list(sk);
1898		kfree(tp->md5sig_info);
1899		tp->md5sig_info = NULL;
1900	}
1901#endif
1902
1903#ifdef CONFIG_NET_DMA
1904	/* Cleans up our sk_async_wait_queue */
1905	__skb_queue_purge(&sk->sk_async_wait_queue);
1906#endif
1907
1908	/* Clean the prequeue; it really must be empty */
1909	__skb_queue_purge(&tp->ucopy.prequeue);
1910
1911	/* Clean up a referenced TCP bind bucket. */
1912	if (inet_csk(sk)->icsk_bind_hash)
1913		inet_put_port(sk);
1914
1915	/*
1916	 * If sendmsg cached page exists, toss it.
1917	 */
1918	if (sk->sk_sndmsg_page) {
1919		__free_page(sk->sk_sndmsg_page);
1920		sk->sk_sndmsg_page = NULL;
1921	}
1922
1923	atomic_dec(&tcp_sockets_allocated);
1924
1925	return 0;
1926}
1927
1928EXPORT_SYMBOL(tcp_v4_destroy_sock);
1929
1930#ifdef CONFIG_PROC_FS
1931/* Proc filesystem TCP sock list dumping. */
1932
1933static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1934{
1935	return hlist_empty(head) ? NULL :
1936		list_entry(head->first, struct inet_timewait_sock, tw_node);
1937}
1938
1939static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1940{
1941	return tw->tw_node.next ?
1942		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1943}
1944
1945static void *listening_get_next(struct seq_file *seq, void *cur)
1946{
1947	struct inet_connection_sock *icsk;
1948	struct hlist_node *node;
1949	struct sock *sk = cur;
1950	struct tcp_iter_state* st = seq->private;
1951	struct net *net = st->net;
1952
1953	if (!sk) {
1954		st->bucket = 0;
1955		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1956		goto get_sk;
1957	}
1958
1959	++st->num;
1960
1961	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1962		struct request_sock *req = cur;
1963
1964		icsk = inet_csk(st->syn_wait_sk);
1965		req = req->dl_next;
1966		while (1) {
1967			while (req) {
1968				/* Net already matched on the listener; req->sk is NULL here. */
1969				if (req->rsk_ops->family == st->family) {
1970					cur = req;
1971					goto out;
1972				}
1973				req = req->dl_next;
1974			}
1975			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1976				break;
1977get_req:
1978			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1979		}
1980		sk	  = sk_next(st->syn_wait_sk);
1981		st->state = TCP_SEQ_STATE_LISTENING;
1982		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1983	} else {
1984		icsk = inet_csk(sk);
1985		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1986		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1987			goto start_req;
1988		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1989		sk = sk_next(sk);
1990	}
1991get_sk:
1992	sk_for_each_from(sk, node) {
1993		if (sk->sk_family == st->family && sk->sk_net == net) {
1994			cur = sk;
1995			goto out;
1996		}
1997		icsk = inet_csk(sk);
1998		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1999		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2000start_req:
2001			st->uid		= sock_i_uid(sk);
2002			st->syn_wait_sk = sk;
2003			st->state	= TCP_SEQ_STATE_OPENREQ;
2004			st->sbucket	= 0;
2005			goto get_req;
2006		}
2007		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008	}
2009	if (++st->bucket < INET_LHTABLE_SIZE) {
2010		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2011		goto get_sk;
2012	}
2013	cur = NULL;
2014out:
2015	return cur;
2016}
2017
2018static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2019{
2020	void *rc = listening_get_next(seq, NULL);
2021
2022	while (rc && *pos) {
2023		rc = listening_get_next(seq, rc);
2024		--*pos;
2025	}
2026	return rc;
2027}
2028
2029static void *established_get_first(struct seq_file *seq)
2030{
2031	struct tcp_iter_state* st = seq->private;
2032	struct net *net = st->net;
2033	void *rc = NULL;
2034
2035	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2036		struct sock *sk;
2037		struct hlist_node *node;
2038		struct inet_timewait_sock *tw;
2039		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2040
2041		read_lock_bh(lock);
2042		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2043			if (sk->sk_family != st->family ||
2044			    sk->sk_net != net) {
2045				continue;
2046			}
2047			rc = sk;
2048			goto out;
2049		}
2050		st->state = TCP_SEQ_STATE_TIME_WAIT;
2051		inet_twsk_for_each(tw, node,
2052				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2053			if (tw->tw_family != st->family ||
2054			    tw->tw_net != net) {
2055				continue;
2056			}
2057			rc = tw;
2058			goto out;
2059		}
2060		read_unlock_bh(lock);
2061		st->state = TCP_SEQ_STATE_ESTABLISHED;
2062	}
2063out:
2064	return rc;
2065}
2066
2067static void *established_get_next(struct seq_file *seq, void *cur)
2068{
2069	struct sock *sk = cur;
2070	struct inet_timewait_sock *tw;
2071	struct hlist_node *node;
2072	struct tcp_iter_state* st = seq->private;
2073	struct net *net = st->net;
2074
2075	++st->num;
2076
2077	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2078		tw = cur;
2079		tw = tw_next(tw);
2080get_tw:
2081		while (tw && (tw->tw_family != st->family || tw->tw_net != net)) {
2082			tw = tw_next(tw);
2083		}
2084		if (tw) {
2085			cur = tw;
2086			goto out;
2087		}
2088		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2089		st->state = TCP_SEQ_STATE_ESTABLISHED;
2090
2091		if (++st->bucket < tcp_hashinfo.ehash_size) {
2092			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2093			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2094		} else {
2095			cur = NULL;
2096			goto out;
2097		}
2098	} else
2099		sk = sk_next(sk);
2100
2101	sk_for_each_from(sk, node) {
2102		if (sk->sk_family == st->family && sk->sk_net == net)
2103			goto found;
2104	}
2105
2106	st->state = TCP_SEQ_STATE_TIME_WAIT;
2107	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2108	goto get_tw;
2109found:
2110	cur = sk;
2111out:
2112	return cur;
2113}
2114
2115static void *established_get_idx(struct seq_file *seq, loff_t pos)
2116{
2117	void *rc = established_get_first(seq);
2118
2119	while (rc && pos) {
2120		rc = established_get_next(seq, rc);
2121		--pos;
2122	}
2123	return rc;
2124}
2125
2126static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2127{
2128	void *rc;
2129	struct tcp_iter_state* st = seq->private;
2130
2131	inet_listen_lock(&tcp_hashinfo);
2132	st->state = TCP_SEQ_STATE_LISTENING;
2133	rc	  = listening_get_idx(seq, &pos);
2134
2135	if (!rc) {
2136		inet_listen_unlock(&tcp_hashinfo);
2137		st->state = TCP_SEQ_STATE_ESTABLISHED;
2138		rc	  = established_get_idx(seq, pos);
2139	}
2140
2141	return rc;
2142}
2143
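/* seq_file callbacks: position 0 is SEQ_START_TOKEN (the header line printed
 * by tcp4_seq_show), so a non-zero *pos is shifted down by one before being
 * turned into a socket via tcp_get_idx().
 */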
2144static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2145{
2146	struct tcp_iter_state* st = seq->private;
2147	st->state = TCP_SEQ_STATE_LISTENING;
2148	st->num = 0;
2149	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2150}
2151
2152static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2153{
2154	void *rc = NULL;
2155	struct tcp_iter_state* st;
2156
2157	if (v == SEQ_START_TOKEN) {
2158		rc = tcp_get_idx(seq, 0);
2159		goto out;
2160	}
2161	st = seq->private;
2162
2163	switch (st->state) {
2164	case TCP_SEQ_STATE_OPENREQ:
2165	case TCP_SEQ_STATE_LISTENING:
2166		rc = listening_get_next(seq, v);
2167		if (!rc) {
2168			inet_listen_unlock(&tcp_hashinfo);
2169			st->state = TCP_SEQ_STATE_ESTABLISHED;
2170			rc	  = established_get_first(seq);
2171		}
2172		break;
2173	case TCP_SEQ_STATE_ESTABLISHED:
2174	case TCP_SEQ_STATE_TIME_WAIT:
2175		rc = established_get_next(seq, v);
2176		break;
2177	}
2178out:
2179	++*pos;
2180	return rc;
2181}
2182
2183static void tcp_seq_stop(struct seq_file *seq, void *v)
2184{
2185	struct tcp_iter_state* st = seq->private;
2186
2187	switch (st->state) {
2188	case TCP_SEQ_STATE_OPENREQ:
2189		if (v) {
2190			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2191			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2192		}
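		/* fall through: the listening hash lock must be released too */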
2193	case TCP_SEQ_STATE_LISTENING:
2194		if (v != SEQ_START_TOKEN)
2195			inet_listen_unlock(&tcp_hashinfo);
2196		break;
2197	case TCP_SEQ_STATE_TIME_WAIT:
2198	case TCP_SEQ_STATE_ESTABLISHED:
2199		if (v)
2200			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2201		break;
2202	}
2203}
2204
2205static int tcp_seq_open(struct inode *inode, struct file *file)
2206{
2207	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2208	struct seq_file *seq;
2209	struct tcp_iter_state *s;
2210	struct net *net;
2211	int rc;
2212
2213	if (unlikely(afinfo == NULL))
2214		return -EINVAL;
2215
2216	s = kzalloc(sizeof(*s), GFP_KERNEL);
2217	if (!s)
2218		return -ENOMEM;
2219
2220	rc = -ENXIO;
2221	net = get_proc_net(inode);
2222	if (!net)
2223		goto out_kfree;
2224
2225	s->family		= afinfo->family;
2226	s->seq_ops.start	= tcp_seq_start;
2227	s->seq_ops.next		= tcp_seq_next;
2228	s->seq_ops.show		= afinfo->seq_show;
2229	s->seq_ops.stop		= tcp_seq_stop;
2230	s->net                  = net;
2231
2232	rc = seq_open(file, &s->seq_ops);
2233	if (rc)
2234		goto out_put_net;
2235	seq = file->private_data;
2236	seq->private = s;
2237out:
2238	return rc;
2239out_put_net:
2240	put_net(net);
2241out_kfree:
2242	kfree(s);
2243	goto out;
2244}
2245
2246static int tcp_seq_release(struct inode *inode, struct file *file)
2247{
2248	struct seq_file *seq = file->private_data;
2249	struct tcp_iter_state *s = seq->private;
2250
2251	put_net(s->net);
2252	seq_release_private(inode, file);
2253	return 0;
2254}
2255
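/* tcp_proc_register()/tcp_proc_unregister() are exported and also used by
 * the IPv6 code (tcp_ipv6.c), which passes its own struct tcp_seq_afinfo;
 * the IPv4 user is tcp4_seq_afinfo below.
 */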
2256int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2257{
2258	int rc = 0;
2259	struct proc_dir_entry *p;
2260
2261	if (!afinfo)
2262		return -EINVAL;
2263	afinfo->seq_fops->owner		= afinfo->owner;
2264	afinfo->seq_fops->open		= tcp_seq_open;
2265	afinfo->seq_fops->read		= seq_read;
2266	afinfo->seq_fops->llseek	= seq_lseek;
2267	afinfo->seq_fops->release	= tcp_seq_release;
2268
2269	p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2270	if (p)
2271		p->data = afinfo;
2272	else
2273		rc = -ENOMEM;
2274	return rc;
2275}
2276
2277void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2278{
2279	if (!afinfo)
2280		return;
2281	proc_net_remove(&init_net, afinfo->name);
2282	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2283}
2284
2285static void get_openreq4(struct sock *sk, struct request_sock *req,
2286			 char *tmpbuf, int i, int uid)
2287{
2288	const struct inet_request_sock *ireq = inet_rsk(req);
2289	int ttd = req->expires - jiffies;
2290
2291	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2292		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2293		i,
2294		ireq->loc_addr,
2295		ntohs(inet_sk(sk)->sport),
2296		ireq->rmt_addr,
2297		ntohs(ireq->rmt_port),
2298		TCP_SYN_RECV,
2299		0, 0, /* could print option size, but that is af dependent. */
2300		1,    /* timers active (only the expire timer) */
2301		jiffies_to_clock_t(ttd),
2302		req->retrans,
2303		uid,
2304		0,  /* non standard timer */
2305		0, /* open_requests have no inode */
2306		atomic_read(&sk->sk_refcnt),
2307		req);
2308}
2309
2310static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2311{
2312	int timer_active;
2313	unsigned long timer_expires;
2314	struct tcp_sock *tp = tcp_sk(sk);
2315	const struct inet_connection_sock *icsk = inet_csk(sk);
2316	struct inet_sock *inet = inet_sk(sk);
2317	__be32 dest = inet->daddr;
2318	__be32 src = inet->rcv_saddr;
2319	__u16 destp = ntohs(inet->dport);
2320	__u16 srcp = ntohs(inet->sport);
2321
2322	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2323		timer_active	= 1;
2324		timer_expires	= icsk->icsk_timeout;
2325	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2326		timer_active	= 4;
2327		timer_expires	= icsk->icsk_timeout;
2328	} else if (timer_pending(&sk->sk_timer)) {
2329		timer_active	= 2;
2330		timer_expires	= sk->sk_timer.expires;
2331	} else {
2332		timer_active	= 0;
2333		timer_expires = jiffies;
2334	}
2335
2336	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2337			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
2338		i, src, srcp, dest, destp, sk->sk_state,
2339		tp->write_seq - tp->snd_una,
2340		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2341					     (tp->rcv_nxt - tp->copied_seq),
2342		timer_active,
2343		jiffies_to_clock_t(timer_expires - jiffies),
2344		icsk->icsk_retransmits,
2345		sock_i_uid(sk),
2346		icsk->icsk_probes_out,
2347		sock_i_ino(sk),
2348		atomic_read(&sk->sk_refcnt), sk,
2349		icsk->icsk_rto,
2350		icsk->icsk_ack.ato,
2351		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2352		tp->snd_cwnd,
2353		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2354}
2355
2356static void get_timewait4_sock(struct inet_timewait_sock *tw,
2357			       char *tmpbuf, int i)
2358{
2359	__be32 dest, src;
2360	__u16 destp, srcp;
2361	int ttd = tw->tw_ttd - jiffies;
2362
2363	if (ttd < 0)
2364		ttd = 0;
2365
2366	dest  = tw->tw_daddr;
2367	src   = tw->tw_rcv_saddr;
2368	destp = ntohs(tw->tw_dport);
2369	srcp  = ntohs(tw->tw_sport);
2370
2371	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2372		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2373		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2374		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2375		atomic_read(&tw->tw_refcnt), tw);
2376}
2377
2378#define TMPSZ 150
2379
2380static int tcp4_seq_show(struct seq_file *seq, void *v)
2381{
2382	struct tcp_iter_state* st;
2383	char tmpbuf[TMPSZ + 1];
2384
2385	if (v == SEQ_START_TOKEN) {
2386		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2387			   "  sl  local_address rem_address   st tx_queue "
2388			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2389			   "inode");
2390		goto out;
2391	}
2392	st = seq->private;
2393
2394	switch (st->state) {
2395	case TCP_SEQ_STATE_LISTENING:
2396	case TCP_SEQ_STATE_ESTABLISHED:
2397		get_tcp4_sock(v, tmpbuf, st->num);
2398		break;
2399	case TCP_SEQ_STATE_OPENREQ:
2400		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2401		break;
2402	case TCP_SEQ_STATE_TIME_WAIT:
2403		get_timewait4_sock(v, tmpbuf, st->num);
2404		break;
2405	}
2406	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2407out:
2408	return 0;
2409}
2410
2411static struct file_operations tcp4_seq_fops;
2412static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2413	.owner		= THIS_MODULE,
2414	.name		= "tcp",
2415	.family		= AF_INET,
2416	.seq_show	= tcp4_seq_show,
2417	.seq_fops	= &tcp4_seq_fops,
2418};
2419
2420int __init tcp4_proc_init(void)
2421{
2422	return tcp_proc_register(&tcp4_seq_afinfo);
2423}
2424
2425void tcp4_proc_exit(void)
2426{
2427	tcp_proc_unregister(&tcp4_seq_afinfo);
2428}
2429#endif /* CONFIG_PROC_FS */
2430
2431DEFINE_PROTO_INUSE(tcp)
2432
2433struct proto tcp_prot = {
2434	.name			= "TCP",
2435	.owner			= THIS_MODULE,
2436	.close			= tcp_close,
2437	.connect		= tcp_v4_connect,
2438	.disconnect		= tcp_disconnect,
2439	.accept			= inet_csk_accept,
2440	.ioctl			= tcp_ioctl,
2441	.init			= tcp_v4_init_sock,
2442	.destroy		= tcp_v4_destroy_sock,
2443	.shutdown		= tcp_shutdown,
2444	.setsockopt		= tcp_setsockopt,
2445	.getsockopt		= tcp_getsockopt,
2446	.recvmsg		= tcp_recvmsg,
2447	.backlog_rcv		= tcp_v4_do_rcv,
2448	.hash			= inet_hash,
2449	.unhash			= inet_unhash,
2450	.get_port		= inet_csk_get_port,
2451	.enter_memory_pressure	= tcp_enter_memory_pressure,
2452	.sockets_allocated	= &tcp_sockets_allocated,
2453	.orphan_count		= &tcp_orphan_count,
2454	.memory_allocated	= &tcp_memory_allocated,
2455	.memory_pressure	= &tcp_memory_pressure,
2456	.sysctl_mem		= sysctl_tcp_mem,
2457	.sysctl_wmem		= sysctl_tcp_wmem,
2458	.sysctl_rmem		= sysctl_tcp_rmem,
2459	.max_header		= MAX_TCP_HEADER,
2460	.obj_size		= sizeof(struct tcp_sock),
2461	.twsk_prot		= &tcp_timewait_sock_ops,
2462	.rsk_prot		= &tcp_request_sock_ops,
2463	.hashinfo		= &tcp_hashinfo,
2464#ifdef CONFIG_COMPAT
2465	.compat_setsockopt	= compat_tcp_setsockopt,
2466	.compat_getsockopt	= compat_tcp_getsockopt,
2467#endif
2468	REF_PROTO_INUSE(tcp)
2469};
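
/* tcp_prot is hooked up by the generic IPv4 setup code: af_inet.c registers
 * it with proto_register() and wires it to SOCK_STREAM through the inetsw
 * table during inet_init().  Nothing in this file instantiates it directly.
 */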
2470
2471void __init tcp_v4_init(void)
2472{
2473	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2474				     IPPROTO_TCP) < 0)
2475		panic("Failed to create the TCP control socket.\n");
2476}
2477
2478EXPORT_SYMBOL(ipv4_specific);
2479EXPORT_SYMBOL(tcp_hashinfo);
2480EXPORT_SYMBOL(tcp_prot);
2481EXPORT_SYMBOL(tcp_v4_conn_request);
2482EXPORT_SYMBOL(tcp_v4_connect);
2483EXPORT_SYMBOL(tcp_v4_do_rcv);
2484EXPORT_SYMBOL(tcp_v4_remember_stamp);
2485EXPORT_SYMBOL(tcp_v4_send_check);
2486EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2487
2488#ifdef CONFIG_PROC_FS
2489EXPORT_SYMBOL(tcp_proc_register);
2490EXPORT_SYMBOL(tcp_proc_unregister);
2491#endif
2492EXPORT_SYMBOL(sysctl_tcp_low_latency);
2493
2494