tcp_ipv4.c revision 0b040829952d84bf2a62526f0e24b624e0699447
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/types.h>
55#include <linux/fcntl.h>
56#include <linux/module.h>
57#include <linux/random.h>
58#include <linux/cache.h>
59#include <linux/jhash.h>
60#include <linux/init.h>
61#include <linux/times.h>
62
63#include <net/net_namespace.h>
64#include <net/icmp.h>
65#include <net/inet_hashtables.h>
66#include <net/tcp.h>
67#include <net/transp_v6.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/timewait_sock.h>
71#include <net/xfrm.h>
72#include <net/netdma.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79
80#include <linux/crypto.h>
81#include <linux/scatterlist.h>
82
83int sysctl_tcp_tw_reuse __read_mostly;
84int sysctl_tcp_low_latency __read_mostly;
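/*
 * Editor's note (assumption): these knobs are normally exposed through the
 * net.ipv4 sysctl tree, e.g. something like
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * which lets tcp_twsk_unique() below reuse TIME-WAIT sockets for new
 * outgoing connections when the timestamps make it safe.
 */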
85
86/* Check TCP sequence numbers in ICMP packets. */
87#define ICMP_MIN_LENGTH 8
88
89void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90
91#ifdef CONFIG_TCP_MD5SIG
92static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93						   __be32 addr);
94static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
95				   __be32 saddr, __be32 daddr,
96				   struct tcphdr *th, int protocol,
97				   unsigned int tcplen);
98#endif
99
100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102	.lhash_users = ATOMIC_INIT(0),
103	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104};
105
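/*
 * Editor's summary: the initial sequence number for an incoming connection
 * is derived from the segment's 4-tuple via the kernel's secure ISN
 * generator, so connections differing only in addresses/ports get
 * unrelated sequence spaces.
 */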
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, it is safe from the viewpoint
120	   of data integrity. Even without PAWS it is safe provided sequence
121	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
122
123	   Actually, the idea is close to VJ's; only the timestamp cache is
124	   held not per host but per port pair, and the TW bucket is used as
125	   the state holder.
126
127	   If the TW bucket has already been destroyed we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144
145EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
147/* This will initiate an outgoing connection. */
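/*
 * Editor's summary: route to the destination (honouring any source route
 * option), pick the source address and port, move to SYN-SENT, hash the
 * socket, choose the initial sequence number and finally call
 * tcp_connect() to emit the SYN.
 */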
148int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149{
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153	struct rtable *rt;
154	__be32 daddr, nexthop;
155	int tmp;
156	int err;
157
158	if (addr_len < sizeof(struct sockaddr_in))
159		return -EINVAL;
160
161	if (usin->sin_family != AF_INET)
162		return -EAFNOSUPPORT;
163
164	nexthop = daddr = usin->sin_addr.s_addr;
165	if (inet->opt && inet->opt->srr) {
166		if (!daddr)
167			return -EINVAL;
168		nexthop = inet->opt->faddr;
169	}
170
171	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173			       IPPROTO_TCP,
174			       inet->sport, usin->sin_port, sk, 1);
175	if (tmp < 0) {
176		if (tmp == -ENETUNREACH)
177			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
178		return tmp;
179	}
180
181	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182		ip_rt_put(rt);
183		return -ENETUNREACH;
184	}
185
186	if (!inet->opt || !inet->opt->srr)
187		daddr = rt->rt_dst;
188
189	if (!inet->saddr)
190		inet->saddr = rt->rt_src;
191	inet->rcv_saddr = inet->saddr;
192
193	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194		/* Reset inherited state */
195		tp->rx_opt.ts_recent	   = 0;
196		tp->rx_opt.ts_recent_stamp = 0;
197		tp->write_seq		   = 0;
198	}
199
200	if (tcp_death_row.sysctl_tw_recycle &&
201	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202		struct inet_peer *peer = rt_get_peer(rt);
203		/*
204		 * VJ's idea. We save last timestamp seen from
205		 * the destination in peer table, when entering state
206		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
207		 * when trying new connection.
208		 */
209		if (peer != NULL &&
210		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212			tp->rx_opt.ts_recent = peer->tcp_ts;
213		}
214	}
215
216	inet->dport = usin->sin_port;
217	inet->daddr = daddr;
218
219	inet_csk(sk)->icsk_ext_hdr_len = 0;
220	if (inet->opt)
221		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
223	tp->rx_opt.mss_clamp = 536;
224
225	/* Socket identity is still unknown (sport may be zero).
226	 * However, we set the state to SYN-SENT and, without releasing the
227	 * socket lock, select a source port, enter ourselves into the hash
228	 * tables and complete initialization after this.
229	 */
230	tcp_set_state(sk, TCP_SYN_SENT);
231	err = inet_hash_connect(&tcp_death_row, sk);
232	if (err)
233		goto failure;
234
235	err = ip_route_newports(&rt, IPPROTO_TCP,
236				inet->sport, inet->dport, sk);
237	if (err)
238		goto failure;
239
240	/* OK, now commit destination to socket.  */
241	sk->sk_gso_type = SKB_GSO_TCPV4;
242	sk_setup_caps(sk, &rt->u.dst);
243
244	if (!tp->write_seq)
245		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246							   inet->daddr,
247							   inet->sport,
248							   usin->sin_port);
249
250	inet->id = tp->write_seq ^ jiffies;
251
252	err = tcp_connect(sk);
253	rt = NULL;
254	if (err)
255		goto failure;
256
257	return 0;
258
259failure:
260	/*
261	 * This unhashes the socket and releases the local port,
262	 * if necessary.
263	 */
264	tcp_set_state(sk, TCP_CLOSE);
265	ip_rt_put(rt);
266	sk->sk_route_caps = 0;
267	inet->dport = 0;
268	return err;
269}
270
271/*
272 * This routine does path mtu discovery as defined in RFC1191.
273 */
274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275{
276	struct dst_entry *dst;
277	struct inet_sock *inet = inet_sk(sk);
278
279	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280	 * sent out by Linux are always < 576 bytes so they should go through
281	 * unfragmented).
282	 */
283	if (sk->sk_state == TCP_LISTEN)
284		return;
285
286	/* We don't check in the dst entry whether PMTU discovery is forbidden
287	 * on this route. We just assume that no packet-too-big messages
288	 * are sent back when PMTU discovery is not active.
289	 * There is a small race when the user changes this flag in the
290	 * route, but I think that's acceptable.
291	 */
292	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293		return;
294
295	dst->ops->update_pmtu(dst, mtu);
296
297	/* Something is about to go wrong... Remember the soft error
298	 * in case this connection is not able to recover.
299	 */
300	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301		sk->sk_err_soft = EMSGSIZE;
302
303	mtu = dst_mtu(dst);
304
305	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307		tcp_sync_mss(sk, mtu);
308
309		/* Resend the TCP packet because it's
310		 * clear that the old packet has been
311		 * dropped. This is the new "fast" path mtu
312		 * discovery.
313		 */
314		tcp_simple_retransmit(sk);
315	} /* else let the usual retransmit timer handle it */
316}
317
318/*
319 * This routine is called by the ICMP module when it gets some
320 * sort of error condition.  If err < 0 then the socket should
321 * be closed and the error returned to the user.  If err > 0
322 * it's just the ICMP type << 8 | ICMP code.  After adjustment, the
323 * header points to the first 8 bytes of the TCP header.  We need
324 * to find the appropriate port.
325 *
326 * The locking strategy used here is very "optimistic". When
327 * someone else accesses the socket the ICMP is just dropped
328 * and for some paths there is no check at all.
329 * A more general error queue to queue errors for later handling
330 * is probably better.
331 *
332 */
333
334void tcp_v4_err(struct sk_buff *skb, u32 info)
335{
336	struct iphdr *iph = (struct iphdr *)skb->data;
337	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338	struct tcp_sock *tp;
339	struct inet_sock *inet;
340	const int type = icmp_hdr(skb)->type;
341	const int code = icmp_hdr(skb)->code;
342	struct sock *sk;
343	__u32 seq;
344	int err;
345
346	if (skb->len < (iph->ihl << 2) + 8) {
347		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
348		return;
349	}
350
351	sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
352			iph->saddr, th->source, inet_iif(skb));
353	if (!sk) {
354		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
355		return;
356	}
357	if (sk->sk_state == TCP_TIME_WAIT) {
358		inet_twsk_put(inet_twsk(sk));
359		return;
360	}
361
362	bh_lock_sock(sk);
363	/* If too many ICMPs get dropped on busy
364	 * servers this needs to be solved differently.
365	 */
366	if (sock_owned_by_user(sk))
367		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
368
369	if (sk->sk_state == TCP_CLOSE)
370		goto out;
371
372	tp = tcp_sk(sk);
373	seq = ntohl(th->seq);
374	if (sk->sk_state != TCP_LISTEN &&
375	    !between(seq, tp->snd_una, tp->snd_nxt)) {
376		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
377		goto out;
378	}
379
380	switch (type) {
381	case ICMP_SOURCE_QUENCH:
382		/* Just silently ignore these. */
383		goto out;
384	case ICMP_PARAMETERPROB:
385		err = EPROTO;
386		break;
387	case ICMP_DEST_UNREACH:
388		if (code > NR_ICMP_UNREACH)
389			goto out;
390
391		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
392			if (!sock_owned_by_user(sk))
393				do_pmtu_discovery(sk, iph, info);
394			goto out;
395		}
396
397		err = icmp_err_convert[code].errno;
398		break;
399	case ICMP_TIME_EXCEEDED:
400		err = EHOSTUNREACH;
401		break;
402	default:
403		goto out;
404	}
405
406	switch (sk->sk_state) {
407		struct request_sock *req, **prev;
408	case TCP_LISTEN:
409		if (sock_owned_by_user(sk))
410			goto out;
411
412		req = inet_csk_search_req(sk, &prev, th->dest,
413					  iph->daddr, iph->saddr);
414		if (!req)
415			goto out;
416
417		/* ICMPs are not backlogged, hence we cannot get
418		   an established socket here.
419		 */
420		BUG_TRAP(!req->sk);
421
422		if (seq != tcp_rsk(req)->snt_isn) {
423			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
424			goto out;
425		}
426
427		/*
428		 * Still in SYN_RECV, just remove it silently.
429		 * There is no good way to pass the error to the newly
430		 * created socket, and POSIX does not want network
431		 * errors returned from accept().
432		 */
433		inet_csk_reqsk_queue_drop(sk, req, prev);
434		goto out;
435
436	case TCP_SYN_SENT:
437	case TCP_SYN_RECV:  /* Cannot happen.
438			       It can happen, e.g., if SYNs crossed.
439			     */
440		if (!sock_owned_by_user(sk)) {
441			sk->sk_err = err;
442
443			sk->sk_error_report(sk);
444
445			tcp_done(sk);
446		} else {
447			sk->sk_err_soft = err;
448		}
449		goto out;
450	}
451
452	/* If we've already connected we will keep trying
453	 * until we time out, or the user gives up.
454	 *
455 * RFC 1122 4.2.3.9 allows us to consider as hard errors
456	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
457	 * but it is obsoleted by pmtu discovery).
458	 *
459 * Note that in the modern Internet, where routing is unreliable
460 * and broken firewalls sit in every dark corner sending random
461 * errors ordered by their masters, even these two messages finally lose
462 * their original sense (even Linux sends invalid PORT_UNREACHs).
463	 *
464	 * Now we are in compliance with RFCs.
465	 *							--ANK (980905)
466	 */
467
468	inet = inet_sk(sk);
469	if (!sock_owned_by_user(sk) && inet->recverr) {
470		sk->sk_err = err;
471		sk->sk_error_report(sk);
472	} else	{ /* Only an error on timeout */
473		sk->sk_err_soft = err;
474	}
475
476out:
477	bh_unlock_sock(sk);
478	sock_put(sk);
479}
480
481/* This routine computes an IPv4 TCP checksum. */
482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
483{
484	struct inet_sock *inet = inet_sk(sk);
485	struct tcphdr *th = tcp_hdr(skb);
486
487	if (skb->ip_summed == CHECKSUM_PARTIAL) {
488		th->check = ~tcp_v4_check(len, inet->saddr,
489					  inet->daddr, 0);
490		skb->csum_start = skb_transport_header(skb) - skb->head;
491		skb->csum_offset = offsetof(struct tcphdr, check);
492	} else {
493		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
494					 csum_partial((char *)th,
495						      th->doff << 2,
496						      skb->csum));
497	}
498}
499
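/*
 * Editor's note: for GSO/checksum offload the stack only seeds th->check
 * with the pseudo-header sum here; the device (or the software GSO path)
 * finishes the ones'-complement sum over the region described by
 * csum_start/csum_offset. This mirrors the CHECKSUM_PARTIAL branch in
 * tcp_v4_send_check() above.
 */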
500int tcp_v4_gso_send_check(struct sk_buff *skb)
501{
502	const struct iphdr *iph;
503	struct tcphdr *th;
504
505	if (!pskb_may_pull(skb, sizeof(*th)))
506		return -EINVAL;
507
508	iph = ip_hdr(skb);
509	th = tcp_hdr(skb);
510
511	th->check = 0;
512	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
513	skb->csum_start = skb_transport_header(skb) - skb->head;
514	skb->csum_offset = offsetof(struct tcphdr, check);
515	skb->ip_summed = CHECKSUM_PARTIAL;
516	return 0;
517}
518
519/*
520 *	This routine will send an RST to the other tcp.
521 *
522 *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
523 *		      for the reset?
524 *	Answer: if a packet caused the RST, it is not for a socket
525 *		existing in our system; if it matched a socket,
526 *		it is just a duplicate segment or a bug in the other side's TCP.
527 *		So we build the reply based only on the parameters
528 *		that arrived with the segment.
529 *	Exception: precedence violation. We do not implement it in any case.
530 */
531
532static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
533{
534	struct tcphdr *th = tcp_hdr(skb);
535	struct {
536		struct tcphdr th;
537#ifdef CONFIG_TCP_MD5SIG
538		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
539#endif
540	} rep;
541	struct ip_reply_arg arg;
542#ifdef CONFIG_TCP_MD5SIG
543	struct tcp_md5sig_key *key;
544#endif
545
546	/* Never send a reset in response to a reset. */
547	if (th->rst)
548		return;
549
550	if (skb->rtable->rt_type != RTN_LOCAL)
551		return;
552
553	/* Swap the send and the receive. */
554	memset(&rep, 0, sizeof(rep));
555	rep.th.dest   = th->source;
556	rep.th.source = th->dest;
557	rep.th.doff   = sizeof(struct tcphdr) / 4;
558	rep.th.rst    = 1;
559
560	if (th->ack) {
561		rep.th.seq = th->ack_seq;
562	} else {
563		rep.th.ack = 1;
564		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
565				       skb->len - (th->doff << 2));
566	}
567
568	memset(&arg, 0, sizeof(arg));
569	arg.iov[0].iov_base = (unsigned char *)&rep;
570	arg.iov[0].iov_len  = sizeof(rep.th);
571
572#ifdef CONFIG_TCP_MD5SIG
573	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
574	if (key) {
575		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
576				   (TCPOPT_NOP << 16) |
577				   (TCPOPT_MD5SIG << 8) |
578				   TCPOLEN_MD5SIG);
579		/* Update length and the length the header thinks exists */
580		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
581		rep.th.doff = arg.iov[0].iov_len / 4;
582
583		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
584					key,
585					ip_hdr(skb)->daddr,
586					ip_hdr(skb)->saddr,
587					&rep.th, IPPROTO_TCP,
588					arg.iov[0].iov_len);
589	}
590#endif
591	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
592				      ip_hdr(skb)->saddr, /* XXX */
593				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
594	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
595
596	ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
597		      &arg, arg.iov[0].iov_len);
598
599	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
600	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
601}
602
603/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
604   outside socket context, is certainly ugly. What can I do?
605 */
606
607static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
608			    struct sk_buff *skb, u32 seq, u32 ack,
609			    u32 win, u32 ts)
610{
611	struct tcphdr *th = tcp_hdr(skb);
612	struct {
613		struct tcphdr th;
614		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615#ifdef CONFIG_TCP_MD5SIG
616			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617#endif
618			];
619	} rep;
620	struct ip_reply_arg arg;
621#ifdef CONFIG_TCP_MD5SIG
622	struct tcp_md5sig_key *key;
623	struct tcp_md5sig_key tw_key;
624#endif
625
626	memset(&rep.th, 0, sizeof(struct tcphdr));
627	memset(&arg, 0, sizeof(arg));
628
629	arg.iov[0].iov_base = (unsigned char *)&rep;
630	arg.iov[0].iov_len  = sizeof(rep.th);
631	if (ts) {
632		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
633				   (TCPOPT_TIMESTAMP << 8) |
634				   TCPOLEN_TIMESTAMP);
635		rep.opt[1] = htonl(tcp_time_stamp);
636		rep.opt[2] = htonl(ts);
637		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
638	}
639
640	/* Swap the send and the receive. */
641	rep.th.dest    = th->source;
642	rep.th.source  = th->dest;
643	rep.th.doff    = arg.iov[0].iov_len / 4;
644	rep.th.seq     = htonl(seq);
645	rep.th.ack_seq = htonl(ack);
646	rep.th.ack     = 1;
647	rep.th.window  = htons(win);
648
649#ifdef CONFIG_TCP_MD5SIG
650	/*
651	 * The SKB holds an incoming packet, but may not have a valid ->sk
652	 * pointer. This is especially the case when we're dealing with a
653	 * TIME_WAIT ack, because the sk structure is long gone, and only
654	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
655	 * structure, and we use it in preference.  I believe that (twsk ||
656	 * skb->sk) holds true, but we program defensively.
657	 */
658	if (!twsk && skb->sk) {
659		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
660	} else if (twsk && twsk->tw_md5_keylen) {
661		tw_key.key = twsk->tw_md5_key;
662		tw_key.keylen = twsk->tw_md5_keylen;
663		key = &tw_key;
664	} else
665		key = NULL;
666
667	if (key) {
668		int offset = (ts) ? 3 : 0;
669
670		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
671					  (TCPOPT_NOP << 16) |
672					  (TCPOPT_MD5SIG << 8) |
673					  TCPOLEN_MD5SIG);
674		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
675		rep.th.doff = arg.iov[0].iov_len/4;
676
677		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
678					key,
679					ip_hdr(skb)->daddr,
680					ip_hdr(skb)->saddr,
681					&rep.th, IPPROTO_TCP,
682					arg.iov[0].iov_len);
683	}
684#endif
685	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
686				      ip_hdr(skb)->saddr, /* XXX */
687				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
688	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
689	if (twsk)
690		arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
691
692	ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
693		      &arg, arg.iov[0].iov_len);
694
695	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
696}
697
698static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
699{
700	struct inet_timewait_sock *tw = inet_twsk(sk);
701	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
702
703	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
704			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
705			tcptw->tw_ts_recent);
706
707	inet_twsk_put(tw);
708}
709
710static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
711				  struct request_sock *req)
712{
713	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
714			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
715			req->ts_recent);
716}
717
718/*
719 *	Send a SYN-ACK after having received a SYN.
720 *	This still operates on a request_sock only, not on a big
721 *	socket.
722 */
723static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
724				struct dst_entry *dst)
725{
726	const struct inet_request_sock *ireq = inet_rsk(req);
727	int err = -1;
728	struct sk_buff * skb;
729
730	/* First, grab a route. */
731	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
732		return -1;
733
734	skb = tcp_make_synack(sk, dst, req);
735
736	if (skb) {
737		struct tcphdr *th = tcp_hdr(skb);
738
739		th->check = tcp_v4_check(skb->len,
740					 ireq->loc_addr,
741					 ireq->rmt_addr,
742					 csum_partial((char *)th, skb->len,
743						      skb->csum));
744
745		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
746					    ireq->rmt_addr,
747					    ireq->opt);
748		err = net_xmit_eval(err);
749	}
750
751	dst_release(dst);
752	return err;
753}
754
755static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
756{
757	return __tcp_v4_send_synack(sk, req, NULL);
758}
759
760/*
761 *	IPv4 request_sock destructor.
762 */
763static void tcp_v4_reqsk_destructor(struct request_sock *req)
764{
765	kfree(inet_rsk(req)->opt);
766}
767
768#ifdef CONFIG_SYN_COOKIES
769static void syn_flood_warning(struct sk_buff *skb)
770{
771	static unsigned long warntime;
772
773	if (time_after(jiffies, (warntime + HZ * 60))) {
774		warntime = jiffies;
775		printk(KERN_INFO
776		       "possible SYN flooding on port %d. Sending cookies.\n",
777		       ntohs(tcp_hdr(skb)->dest));
778	}
779}
780#endif
781
782/*
783 * Save and compile IPv4 options into the request_sock if needed.
784 */
785static struct ip_options *tcp_v4_save_options(struct sock *sk,
786					      struct sk_buff *skb)
787{
788	struct ip_options *opt = &(IPCB(skb)->opt);
789	struct ip_options *dopt = NULL;
790
791	if (opt && opt->optlen) {
792		int opt_size = optlength(opt);
793		dopt = kmalloc(opt_size, GFP_ATOMIC);
794		if (dopt) {
795			if (ip_options_echo(dopt, skb)) {
796				kfree(dopt);
797				dopt = NULL;
798			}
799		}
800	}
801	return dopt;
802}
803
804#ifdef CONFIG_TCP_MD5SIG
805/*
806 * RFC2385 MD5 checksumming requires a mapping of
807 * IP address->MD5 Key.
808 * We need to maintain these in the sk structure.
809 */
810
811/* Find the Key structure for an address.  */
812static struct tcp_md5sig_key *
813			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
814{
815	struct tcp_sock *tp = tcp_sk(sk);
816	int i;
817
818	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
819		return NULL;
820	for (i = 0; i < tp->md5sig_info->entries4; i++) {
821		if (tp->md5sig_info->keys4[i].addr == addr)
822			return &tp->md5sig_info->keys4[i].base;
823	}
824	return NULL;
825}
826
827struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
828					 struct sock *addr_sk)
829{
830	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
831}
832
833EXPORT_SYMBOL(tcp_v4_md5_lookup);
834
835static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
836						      struct request_sock *req)
837{
838	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
839}
840
841/* This can be called on a newly created socket, from other files */
842int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
843		      u8 *newkey, u8 newkeylen)
844{
845	/* Add Key to the list */
846	struct tcp_md5sig_key *key;
847	struct tcp_sock *tp = tcp_sk(sk);
848	struct tcp4_md5sig_key *keys;
849
850	key = tcp_v4_md5_do_lookup(sk, addr);
851	if (key) {
852		/* Pre-existing entry - just update that one. */
853		kfree(key->key);
854		key->key = newkey;
855		key->keylen = newkeylen;
856	} else {
857		struct tcp_md5sig_info *md5sig;
858
859		if (!tp->md5sig_info) {
860			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
861						  GFP_ATOMIC);
862			if (!tp->md5sig_info) {
863				kfree(newkey);
864				return -ENOMEM;
865			}
866			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
867		}
868		if (tcp_alloc_md5sig_pool() == NULL) {
869			kfree(newkey);
870			return -ENOMEM;
871		}
872		md5sig = tp->md5sig_info;
873
874		if (md5sig->alloced4 == md5sig->entries4) {
875			keys = kmalloc((sizeof(*keys) *
876					(md5sig->entries4 + 1)), GFP_ATOMIC);
877			if (!keys) {
878				kfree(newkey);
879				tcp_free_md5sig_pool();
880				return -ENOMEM;
881			}
882
883			if (md5sig->entries4)
884				memcpy(keys, md5sig->keys4,
885				       sizeof(*keys) * md5sig->entries4);
886
887			/* Free old key list, and reference new one */
888			kfree(md5sig->keys4);
889			md5sig->keys4 = keys;
890			md5sig->alloced4++;
891		}
892		md5sig->entries4++;
893		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
894		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
895		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
896	}
897	return 0;
898}
899
900EXPORT_SYMBOL(tcp_v4_md5_do_add);
901
902static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
903			       u8 *newkey, u8 newkeylen)
904{
905	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
906				 newkey, newkeylen);
907}
908
909int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
910{
911	struct tcp_sock *tp = tcp_sk(sk);
912	int i;
913
914	for (i = 0; i < tp->md5sig_info->entries4; i++) {
915		if (tp->md5sig_info->keys4[i].addr == addr) {
916			/* Free the key */
917			kfree(tp->md5sig_info->keys4[i].base.key);
918			tp->md5sig_info->entries4--;
919
920			if (tp->md5sig_info->entries4 == 0) {
921				kfree(tp->md5sig_info->keys4);
922				tp->md5sig_info->keys4 = NULL;
923				tp->md5sig_info->alloced4 = 0;
924			} else if (tp->md5sig_info->entries4 != i) {
925				/* Shift the remaining entries down to fill the hole */
926				memmove(&tp->md5sig_info->keys4[i],
927					&tp->md5sig_info->keys4[i+1],
928					(tp->md5sig_info->entries4 - i) *
929					 sizeof(struct tcp4_md5sig_key));
930			}
931			tcp_free_md5sig_pool();
932			return 0;
933		}
934	}
935	return -ENOENT;
936}
937
938EXPORT_SYMBOL(tcp_v4_md5_do_del);
939
940static void tcp_v4_clear_md5_list(struct sock *sk)
941{
942	struct tcp_sock *tp = tcp_sk(sk);
943
944	/* Free each key, then the set of keys,
945	 * the crypto element, and then decrement our
946	 * hold on the last resort crypto.
947	 */
948	if (tp->md5sig_info->entries4) {
949		int i;
950		for (i = 0; i < tp->md5sig_info->entries4; i++)
951			kfree(tp->md5sig_info->keys4[i].base.key);
952		tp->md5sig_info->entries4 = 0;
953		tcp_free_md5sig_pool();
954	}
955	if (tp->md5sig_info->keys4) {
956		kfree(tp->md5sig_info->keys4);
957		tp->md5sig_info->keys4 = NULL;
958		tp->md5sig_info->alloced4  = 0;
959	}
960}
961
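/*
 * Editor's note: this parses the TCP_MD5SIG socket option. A minimal
 * userspace sketch, assuming the definitions in <linux/tcp.h> (hypothetical
 * variable names):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = strlen(secret) };
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero key length deletes the entry for that address.
 */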
962static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
963				 int optlen)
964{
965	struct tcp_md5sig cmd;
966	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
967	u8 *newkey;
968
969	if (optlen < sizeof(cmd))
970		return -EINVAL;
971
972	if (copy_from_user(&cmd, optval, sizeof(cmd)))
973		return -EFAULT;
974
975	if (sin->sin_family != AF_INET)
976		return -EINVAL;
977
978	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
979		if (!tcp_sk(sk)->md5sig_info)
980			return -ENOENT;
981		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
982	}
983
984	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
985		return -EINVAL;
986
987	if (!tcp_sk(sk)->md5sig_info) {
988		struct tcp_sock *tp = tcp_sk(sk);
989		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
990
991		if (!p)
992			return -EINVAL;
993
994		tp->md5sig_info = p;
995		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
996	}
997
998	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
999	if (!newkey)
1000		return -ENOMEM;
1001	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1002				 newkey, cmd.tcpm_keylen);
1003}
1004
1005static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1006				   __be32 saddr, __be32 daddr,
1007				   struct tcphdr *th, int protocol,
1008				   unsigned int tcplen)
1009{
1010	struct scatterlist sg[4];
1011	__u16 data_len;
1012	int block = 0;
1013	__sum16 old_checksum;
1014	struct tcp_md5sig_pool *hp;
1015	struct tcp4_pseudohdr *bp;
1016	struct hash_desc *desc;
1017	int err;
1018	unsigned int nbytes = 0;
1019
1020	/*
1021	 * Okay, so RFC2385 is turned on for this connection,
1022	 * so we need to generate the MD5 hash for the packet now.
1023	 */
1024
1025	hp = tcp_get_md5sig_pool();
1026	if (!hp)
1027		goto clear_hash_noput;
1028
1029	bp = &hp->md5_blk.ip4;
1030	desc = &hp->md5_desc;
1031
1032	/*
1033	 * 1. the TCP pseudo-header (in the order: source IP address,
1034	 * destination IP address, zero-padded protocol number, and
1035	 * segment length)
1036	 */
1037	bp->saddr = saddr;
1038	bp->daddr = daddr;
1039	bp->pad = 0;
1040	bp->protocol = protocol;
1041	bp->len = htons(tcplen);
1042
1043	sg_init_table(sg, 4);
1044
1045	sg_set_buf(&sg[block++], bp, sizeof(*bp));
1046	nbytes += sizeof(*bp);
1047
1048	/* 2. the TCP header, excluding options, and assuming a
1049	 * checksum of zero.
1050	 */
1051	old_checksum = th->check;
1052	th->check = 0;
1053	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1054	nbytes += sizeof(struct tcphdr);
1055
1056	/* 3. the TCP segment data (if any) */
1057	data_len = tcplen - (th->doff << 2);
1058	if (data_len > 0) {
1059		unsigned char *data = (unsigned char *)th + (th->doff << 2);
1060		sg_set_buf(&sg[block++], data, data_len);
1061		nbytes += data_len;
1062	}
1063
1064	/* 4. an independently-specified key or password, known to both
1065	 * TCPs and presumably connection-specific
1066	 */
1067	sg_set_buf(&sg[block++], key->key, key->keylen);
1068	nbytes += key->keylen;
1069
1070	sg_mark_end(&sg[block - 1]);
1071
1072	/* Now store the Hash into the packet */
1073	err = crypto_hash_init(desc);
1074	if (err)
1075		goto clear_hash;
1076	err = crypto_hash_update(desc, sg, nbytes);
1077	if (err)
1078		goto clear_hash;
1079	err = crypto_hash_final(desc, md5_hash);
1080	if (err)
1081		goto clear_hash;
1082
1083	/* Reset header, and free up the crypto */
1084	tcp_put_md5sig_pool();
1085	th->check = old_checksum;
1086
1087out:
1088	return 0;
1089clear_hash:
1090	tcp_put_md5sig_pool();
1091clear_hash_noput:
1092	memset(md5_hash, 0, 16);
1093	goto out;
1094}
1095
1096int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1097			 struct sock *sk,
1098			 struct dst_entry *dst,
1099			 struct request_sock *req,
1100			 struct tcphdr *th, int protocol,
1101			 unsigned int tcplen)
1102{
1103	__be32 saddr, daddr;
1104
1105	if (sk) {
1106		saddr = inet_sk(sk)->saddr;
1107		daddr = inet_sk(sk)->daddr;
1108	} else {
1109		struct rtable *rt = (struct rtable *)dst;
1110		BUG_ON(!rt);
1111		saddr = rt->rt_src;
1112		daddr = rt->rt_dst;
1113	}
1114	return tcp_v4_do_calc_md5_hash(md5_hash, key,
1115				       saddr, daddr,
1116				       th, protocol, tcplen);
1117}
1118
1119EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1120
1121static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1122{
1123	/*
1124	 * This gets called for each TCP segment that arrives
1125	 * so we want to be efficient.
1126	 * We have 3 drop cases:
1127	 * o No MD5 hash and one expected.
1128	 * o MD5 hash and we're not expecting one.
1129	 * o MD5 hash and it's wrong.
1130	 */
1131	__u8 *hash_location = NULL;
1132	struct tcp_md5sig_key *hash_expected;
1133	const struct iphdr *iph = ip_hdr(skb);
1134	struct tcphdr *th = tcp_hdr(skb);
1135	int length = (th->doff << 2) - sizeof(struct tcphdr);
1136	int genhash;
1137	unsigned char *ptr;
1138	unsigned char newhash[16];
1139
1140	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1141
1142	/*
1143	 * If the TCP option length is less than the TCP_MD5SIG
1144	 * option length, then we can shortcut
1145	 */
1146	if (length < TCPOLEN_MD5SIG) {
1147		if (hash_expected)
1148			return 1;
1149		else
1150			return 0;
1151	}
1152
1153	/* Okay, we can't shortcut - we have to grub through the options */
1154	ptr = (unsigned char *)(th + 1);
1155	while (length > 0) {
1156		int opcode = *ptr++;
1157		int opsize;
1158
1159		switch (opcode) {
1160		case TCPOPT_EOL:
1161			goto done_opts;
1162		case TCPOPT_NOP:
1163			length--;
1164			continue;
1165		default:
1166			opsize = *ptr++;
1167			if (opsize < 2)
1168				goto done_opts;
1169			if (opsize > length)
1170				goto done_opts;
1171
1172			if (opcode == TCPOPT_MD5SIG) {
1173				hash_location = ptr;
1174				goto done_opts;
1175			}
1176		}
1177		ptr += opsize-2;
1178		length -= opsize;
1179	}
1180done_opts:
1181	/* We've parsed the options - do we have a hash? */
1182	if (!hash_expected && !hash_location)
1183		return 0;
1184
1185	if (hash_expected && !hash_location) {
1186		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1187			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1188			       NIPQUAD(iph->saddr), ntohs(th->source),
1189			       NIPQUAD(iph->daddr), ntohs(th->dest));
1190		return 1;
1191	}
1192
1193	if (!hash_expected && hash_location) {
1194		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1195			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1196			       NIPQUAD(iph->saddr), ntohs(th->source),
1197			       NIPQUAD(iph->daddr), ntohs(th->dest));
1198		return 1;
1199	}
1200
1201	/* Okay, so this is hash_expected and hash_location -
1202	 * so we need to calculate the MD5 hash.
1203	 */
1204	genhash = tcp_v4_do_calc_md5_hash(newhash,
1205					  hash_expected,
1206					  iph->saddr, iph->daddr,
1207					  th, sk->sk_protocol,
1208					  skb->len);
1209
1210	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1211		if (net_ratelimit()) {
1212			printk(KERN_INFO "MD5 Hash failed for "
1213			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1214			       NIPQUAD(iph->saddr), ntohs(th->source),
1215			       NIPQUAD(iph->daddr), ntohs(th->dest),
1216			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1217		}
1218		return 1;
1219	}
1220	return 0;
1221}
1222
1223#endif
1224
1225struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1226	.family		=	PF_INET,
1227	.obj_size	=	sizeof(struct tcp_request_sock),
1228	.rtx_syn_ack	=	tcp_v4_send_synack,
1229	.send_ack	=	tcp_v4_reqsk_send_ack,
1230	.destructor	=	tcp_v4_reqsk_destructor,
1231	.send_reset	=	tcp_v4_send_reset,
1232};
1233
1234#ifdef CONFIG_TCP_MD5SIG
1235static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1236	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1237};
1238#endif
1239
1240static struct timewait_sock_ops tcp_timewait_sock_ops = {
1241	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1242	.twsk_unique	= tcp_twsk_unique,
1243	.twsk_destructor= tcp_twsk_destructor,
1244};
1245
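/*
 * Editor's summary of the SYN handling path below: validate the SYN
 * (drop broadcasts, fall back to syncookies when the request queue is
 * full), allocate a request_sock, parse the TCP options, pick an ISN
 * (cookie, PAWS-checked peer timestamp, or secure random), send the
 * SYN-ACK and, unless cookies are in use, hash the request into the
 * listener's SYN queue.
 */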
1246int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1247{
1248	struct inet_request_sock *ireq;
1249	struct tcp_options_received tmp_opt;
1250	struct request_sock *req;
1251	__be32 saddr = ip_hdr(skb)->saddr;
1252	__be32 daddr = ip_hdr(skb)->daddr;
1253	__u32 isn = TCP_SKB_CB(skb)->when;
1254	struct dst_entry *dst = NULL;
1255#ifdef CONFIG_SYN_COOKIES
1256	int want_cookie = 0;
1257#else
1258#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1259#endif
1260
1261	/* Never answer SYNs sent to broadcast or multicast addresses */
1262	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1263		goto drop;
1264
1265	/* TW buckets are converted to open requests without
1266	 * limitations; they conserve resources and the peer is
1267	 * evidently a real one.
1268	 */
1269	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1270#ifdef CONFIG_SYN_COOKIES
1271		if (sysctl_tcp_syncookies) {
1272			want_cookie = 1;
1273		} else
1274#endif
1275		goto drop;
1276	}
1277
1278	/* Accept backlog is full. If we have already queued enough
1279	 * warm entries in the SYN queue, drop the request. It is better than
1280	 * clogging the SYN queue with openreqs with exponentially increasing
1281	 * timeouts.
1282	 */
1283	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1284		goto drop;
1285
1286	req = reqsk_alloc(&tcp_request_sock_ops);
1287	if (!req)
1288		goto drop;
1289
1290#ifdef CONFIG_TCP_MD5SIG
1291	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1292#endif
1293
1294	tcp_clear_options(&tmp_opt);
1295	tmp_opt.mss_clamp = 536;
1296	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1297
1298	tcp_parse_options(skb, &tmp_opt, 0);
1299
1300	if (want_cookie && !tmp_opt.saw_tstamp)
1301		tcp_clear_options(&tmp_opt);
1302
1303	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1304		/* Some OSes (unknown ones, but seen on a web server containing
1305		 * information interesting only to Windows users) do not send
1306		 * their timestamp in the SYN. It is the easy case.
1307		 * We simply do not advertise TS support.
1308		 */
1309		tmp_opt.saw_tstamp = 0;
1310		tmp_opt.tstamp_ok  = 0;
1311	}
1312	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1313
1314	tcp_openreq_init(req, &tmp_opt, skb);
1315
1316	if (security_inet_conn_request(sk, skb, req))
1317		goto drop_and_free;
1318
1319	ireq = inet_rsk(req);
1320	ireq->loc_addr = daddr;
1321	ireq->rmt_addr = saddr;
1322	ireq->opt = tcp_v4_save_options(sk, skb);
1323	if (!want_cookie)
1324		TCP_ECN_create_request(req, tcp_hdr(skb));
1325
1326	if (want_cookie) {
1327#ifdef CONFIG_SYN_COOKIES
1328		syn_flood_warning(skb);
1329		req->cookie_ts = tmp_opt.tstamp_ok;
1330#endif
1331		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1332	} else if (!isn) {
1333		struct inet_peer *peer = NULL;
1334
1335		/* VJ's idea. We save last timestamp seen
1336		 * from the destination in peer table, when entering
1337		 * state TIME-WAIT, and check against it before
1338		 * accepting new connection request.
1339		 *
1340		 * If "isn" is not zero, this request hit an alive
1341		 * timewait bucket, so all the necessary checks
1342		 * are made in the function processing the timewait state.
1343		 */
1344		if (tmp_opt.saw_tstamp &&
1345		    tcp_death_row.sysctl_tw_recycle &&
1346		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1347		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1348		    peer->v4daddr == saddr) {
1349			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1350			    (s32)(peer->tcp_ts - req->ts_recent) >
1351							TCP_PAWS_WINDOW) {
1352				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1353				goto drop_and_release;
1354			}
1355		}
1356		/* Kill the following clause, if you dislike this way. */
1357		else if (!sysctl_tcp_syncookies &&
1358			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1359			  (sysctl_max_syn_backlog >> 2)) &&
1360			 (!peer || !peer->tcp_ts_stamp) &&
1361			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1362			/* Without syncookies the last quarter of the
1363			 * backlog is filled with destinations
1364			 * proven to be alive.
1365			 * It means that we continue to communicate only
1366			 * with destinations already remembered
1367			 * at the moment of the synflood.
1368			 */
1369			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1370				       "request from " NIPQUAD_FMT "/%u\n",
1371				       NIPQUAD(saddr),
1372				       ntohs(tcp_hdr(skb)->source));
1373			goto drop_and_release;
1374		}
1375
1376		isn = tcp_v4_init_sequence(skb);
1377	}
1378	tcp_rsk(req)->snt_isn = isn;
1379
1380	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1381		goto drop_and_free;
1382
1383	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1384	return 0;
1385
1386drop_and_release:
1387	dst_release(dst);
1388drop_and_free:
1389	reqsk_free(req);
1390drop:
1391	return 0;
1392}
1393
1394
1395/*
1396 * The three way handshake has completed - we got a valid synack -
1397 * now create the new socket.
1398 */
1399struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1400				  struct request_sock *req,
1401				  struct dst_entry *dst)
1402{
1403	struct inet_request_sock *ireq;
1404	struct inet_sock *newinet;
1405	struct tcp_sock *newtp;
1406	struct sock *newsk;
1407#ifdef CONFIG_TCP_MD5SIG
1408	struct tcp_md5sig_key *key;
1409#endif
1410
1411	if (sk_acceptq_is_full(sk))
1412		goto exit_overflow;
1413
1414	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1415		goto exit;
1416
1417	newsk = tcp_create_openreq_child(sk, req, skb);
1418	if (!newsk)
1419		goto exit;
1420
1421	newsk->sk_gso_type = SKB_GSO_TCPV4;
1422	sk_setup_caps(newsk, dst);
1423
1424	newtp		      = tcp_sk(newsk);
1425	newinet		      = inet_sk(newsk);
1426	ireq		      = inet_rsk(req);
1427	newinet->daddr	      = ireq->rmt_addr;
1428	newinet->rcv_saddr    = ireq->loc_addr;
1429	newinet->saddr	      = ireq->loc_addr;
1430	newinet->opt	      = ireq->opt;
1431	ireq->opt	      = NULL;
1432	newinet->mc_index     = inet_iif(skb);
1433	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1434	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1435	if (newinet->opt)
1436		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1437	newinet->id = newtp->write_seq ^ jiffies;
1438
1439	tcp_mtup_init(newsk);
1440	tcp_sync_mss(newsk, dst_mtu(dst));
1441	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1442	tcp_initialize_rcv_mss(newsk);
1443
1444#ifdef CONFIG_TCP_MD5SIG
1445	/* Copy over the MD5 key from the original socket */
1446	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1447		/*
1448		 * We're using one, so create a matching key
1449		 * on the newsk structure. If we fail to get
1450		 * memory, then we end up not copying the key
1451		 * across. Shucks.
1452		 */
1453		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1454		if (newkey != NULL)
1455			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1456					  newkey, key->keylen);
1457	}
1458#endif
1459
1460	__inet_hash_nolisten(newsk);
1461	__inet_inherit_port(sk, newsk);
1462
1463	return newsk;
1464
1465exit_overflow:
1466	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1467exit:
1468	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1469	dst_release(dst);
1470	return NULL;
1471}
1472
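/*
 * Editor's note: called from the TCP_LISTEN branch of tcp_v4_do_rcv().
 * It matches the segment against a pending request_sock (completing the
 * handshake via tcp_check_req()), against an already-established child
 * created meanwhile, or - with syncookies - reconstructs a request from
 * a valid cookie ACK.
 */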
1473static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1474{
1475	struct tcphdr *th = tcp_hdr(skb);
1476	const struct iphdr *iph = ip_hdr(skb);
1477	struct sock *nsk;
1478	struct request_sock **prev;
1479	/* Find possible connection requests. */
1480	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1481						       iph->saddr, iph->daddr);
1482	if (req)
1483		return tcp_check_req(sk, skb, req, prev);
1484
1485	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1486			th->source, iph->daddr, th->dest, inet_iif(skb));
1487
1488	if (nsk) {
1489		if (nsk->sk_state != TCP_TIME_WAIT) {
1490			bh_lock_sock(nsk);
1491			return nsk;
1492		}
1493		inet_twsk_put(inet_twsk(nsk));
1494		return NULL;
1495	}
1496
1497#ifdef CONFIG_SYN_COOKIES
1498	if (!th->rst && !th->syn && th->ack)
1499		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1500#endif
1501	return sk;
1502}
1503
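/*
 * Editor's note on the checksum strategy below: if the device already
 * summed the payload (CHECKSUM_COMPLETE) we can verify immediately;
 * otherwise we only seed skb->csum with the pseudo-header and verify
 * small packets (<= 76 bytes) right away, deferring the rest until
 * copy-to-user time.
 */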
1504static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1505{
1506	const struct iphdr *iph = ip_hdr(skb);
1507
1508	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1509		if (!tcp_v4_check(skb->len, iph->saddr,
1510				  iph->daddr, skb->csum)) {
1511			skb->ip_summed = CHECKSUM_UNNECESSARY;
1512			return 0;
1513		}
1514	}
1515
1516	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1517				       skb->len, IPPROTO_TCP, 0);
1518
1519	if (skb->len <= 76) {
1520		return __skb_checksum_complete(skb);
1521	}
1522	return 0;
1523}
1524
1525
1526/* The socket must have its spinlock held when we get
1527 * here.
1528 *
1529 * We have a potential double-lock case here, so even when
1530 * doing backlog processing we use the BH locking scheme.
1531 * This is because we cannot sleep with the original spinlock
1532 * held.
1533 */
1534int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1535{
1536	struct sock *rsk;
1537#ifdef CONFIG_TCP_MD5SIG
1538	/*
1539	 * We really want to reject the packet as early as possible
1540	 * if:
1541	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1542	 *  o There is an MD5 option and we're not expecting one
1543	 */
1544	if (tcp_v4_inbound_md5_hash(sk, skb))
1545		goto discard;
1546#endif
1547
1548	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1549		TCP_CHECK_TIMER(sk);
1550		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1551			rsk = sk;
1552			goto reset;
1553		}
1554		TCP_CHECK_TIMER(sk);
1555		return 0;
1556	}
1557
1558	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1559		goto csum_err;
1560
1561	if (sk->sk_state == TCP_LISTEN) {
1562		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1563		if (!nsk)
1564			goto discard;
1565
1566		if (nsk != sk) {
1567			if (tcp_child_process(sk, nsk, skb)) {
1568				rsk = nsk;
1569				goto reset;
1570			}
1571			return 0;
1572		}
1573	}
1574
1575	TCP_CHECK_TIMER(sk);
1576	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1577		rsk = sk;
1578		goto reset;
1579	}
1580	TCP_CHECK_TIMER(sk);
1581	return 0;
1582
1583reset:
1584	tcp_v4_send_reset(rsk, skb);
1585discard:
1586	kfree_skb(skb);
1587	/* Be careful here. If this function gets more complicated and
1588	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1589	 * might be destroyed here. This current version compiles correctly,
1590	 * but you have been warned.
1591	 */
1592	return 0;
1593
1594csum_err:
1595	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1596	goto discard;
1597}
1598
1599/*
1600 *	From tcp_input.c
1601 */
1602
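/*
 * Editor's summary: per-packet entry point from IP. Validate the header
 * and checksum, look the segment up in the established/listening hashes,
 * then either process it directly, queue it on the prequeue for the
 * sleeping reader, or append it to the backlog if the socket is locked
 * by a user context.
 */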
1603int tcp_v4_rcv(struct sk_buff *skb)
1604{
1605	const struct iphdr *iph;
1606	struct tcphdr *th;
1607	struct sock *sk;
1608	int ret;
1609
1610	if (skb->pkt_type != PACKET_HOST)
1611		goto discard_it;
1612
1613	/* Count it even if it's bad */
1614	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1615
1616	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1617		goto discard_it;
1618
1619	th = tcp_hdr(skb);
1620
1621	if (th->doff < sizeof(struct tcphdr) / 4)
1622		goto bad_packet;
1623	if (!pskb_may_pull(skb, th->doff * 4))
1624		goto discard_it;
1625
1626	/* An explanation is required here, I think.
1627	 * Packet length and doff are validated by header prediction,
1628	 * provided the case of th->doff==0 is eliminated.
1629	 * So, we defer the checks. */
1630	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1631		goto bad_packet;
1632
1633	th = tcp_hdr(skb);
1634	iph = ip_hdr(skb);
1635	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1636	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1637				    skb->len - th->doff * 4);
1638	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1639	TCP_SKB_CB(skb)->when	 = 0;
1640	TCP_SKB_CB(skb)->flags	 = iph->tos;
1641	TCP_SKB_CB(skb)->sacked	 = 0;
1642
1643	sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1644			th->source, iph->daddr, th->dest, inet_iif(skb));
1645	if (!sk)
1646		goto no_tcp_socket;
1647
1648process:
1649	if (sk->sk_state == TCP_TIME_WAIT)
1650		goto do_time_wait;
1651
1652	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1653		goto discard_and_relse;
1654	nf_reset(skb);
1655
1656	if (sk_filter(sk, skb))
1657		goto discard_and_relse;
1658
1659	skb->dev = NULL;
1660
1661	bh_lock_sock_nested(sk);
1662	ret = 0;
1663	if (!sock_owned_by_user(sk)) {
1664#ifdef CONFIG_NET_DMA
1665		struct tcp_sock *tp = tcp_sk(sk);
1666		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1667			tp->ucopy.dma_chan = get_softnet_dma();
1668		if (tp->ucopy.dma_chan)
1669			ret = tcp_v4_do_rcv(sk, skb);
1670		else
1671#endif
1672		{
1673			if (!tcp_prequeue(sk, skb))
1674				ret = tcp_v4_do_rcv(sk, skb);
1675		}
1676	} else
1677		sk_add_backlog(sk, skb);
1678	bh_unlock_sock(sk);
1679
1680	sock_put(sk);
1681
1682	return ret;
1683
1684no_tcp_socket:
1685	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1686		goto discard_it;
1687
1688	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1689bad_packet:
1690		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1691	} else {
1692		tcp_v4_send_reset(NULL, skb);
1693	}
1694
1695discard_it:
1696	/* Discard frame. */
1697	kfree_skb(skb);
1698	return 0;
1699
1700discard_and_relse:
1701	sock_put(sk);
1702	goto discard_it;
1703
1704do_time_wait:
1705	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1706		inet_twsk_put(inet_twsk(sk));
1707		goto discard_it;
1708	}
1709
1710	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1712		inet_twsk_put(inet_twsk(sk));
1713		goto discard_it;
1714	}
1715	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1716	case TCP_TW_SYN: {
1717		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1718							&tcp_hashinfo,
1719							iph->daddr, th->dest,
1720							inet_iif(skb));
1721		if (sk2) {
1722			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1723			inet_twsk_put(inet_twsk(sk));
1724			sk = sk2;
1725			goto process;
1726		}
1727		/* Fall through to ACK */
1728	}
1729	case TCP_TW_ACK:
1730		tcp_v4_timewait_ack(sk, skb);
1731		break;
1732	case TCP_TW_RST:
1733		goto no_tcp_socket;
1734	case TCP_TW_SUCCESS:;
1735	}
1736	goto discard_it;
1737}
1738
1739/* VJ's idea. Save last timestamp seen from this destination
1740 * and hold it at least for the normal timewait interval, to use for duplicate
1741 * segment detection in subsequent connections, before they enter synchronized
1742 * state.
1743 */
1744
1745int tcp_v4_remember_stamp(struct sock *sk)
1746{
1747	struct inet_sock *inet = inet_sk(sk);
1748	struct tcp_sock *tp = tcp_sk(sk);
1749	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1750	struct inet_peer *peer = NULL;
1751	int release_it = 0;
1752
1753	if (!rt || rt->rt_dst != inet->daddr) {
1754		peer = inet_getpeer(inet->daddr, 1);
1755		release_it = 1;
1756	} else {
1757		if (!rt->peer)
1758			rt_bind_peer(rt, 1);
1759		peer = rt->peer;
1760	}
1761
1762	if (peer) {
1763		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1764		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1765		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1766			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1767			peer->tcp_ts = tp->rx_opt.ts_recent;
1768		}
1769		if (release_it)
1770			inet_putpeer(peer);
1771		return 1;
1772	}
1773
1774	return 0;
1775}
1776
1777int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1778{
1779	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1780
1781	if (peer) {
1782		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1783
1784		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1785		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1786		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1787			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1788			peer->tcp_ts	   = tcptw->tw_ts_recent;
1789		}
1790		inet_putpeer(peer);
1791		return 1;
1792	}
1793
1794	return 0;
1795}
1796
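/*
 * Editor's note: this af_ops table is what keeps the common TCP code
 * address-family agnostic; tcp_ipv6.c installs an analogous ipv6_specific
 * table (plus a v6-mapped variant) for AF_INET6 sockets.
 */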
1797struct inet_connection_sock_af_ops ipv4_specific = {
1798	.queue_xmit	   = ip_queue_xmit,
1799	.send_check	   = tcp_v4_send_check,
1800	.rebuild_header	   = inet_sk_rebuild_header,
1801	.conn_request	   = tcp_v4_conn_request,
1802	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1803	.remember_stamp	   = tcp_v4_remember_stamp,
1804	.net_header_len	   = sizeof(struct iphdr),
1805	.setsockopt	   = ip_setsockopt,
1806	.getsockopt	   = ip_getsockopt,
1807	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1808	.sockaddr_len	   = sizeof(struct sockaddr_in),
1809	.bind_conflict	   = inet_csk_bind_conflict,
1810#ifdef CONFIG_COMPAT
1811	.compat_setsockopt = compat_ip_setsockopt,
1812	.compat_getsockopt = compat_ip_getsockopt,
1813#endif
1814};
1815
1816#ifdef CONFIG_TCP_MD5SIG
1817static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1818	.md5_lookup		= tcp_v4_md5_lookup,
1819	.calc_md5_hash		= tcp_v4_calc_md5_hash,
1820	.md5_add		= tcp_v4_md5_add_func,
1821	.md5_parse		= tcp_v4_parse_md5_keys,
1822};
1823#endif
1824
1825/* NOTE: A lot of things are set to zero explicitly by the call to
1826 *       sk_alloc(), so they need not be done here.
1827 */
1828static int tcp_v4_init_sock(struct sock *sk)
1829{
1830	struct inet_connection_sock *icsk = inet_csk(sk);
1831	struct tcp_sock *tp = tcp_sk(sk);
1832
1833	skb_queue_head_init(&tp->out_of_order_queue);
1834	tcp_init_xmit_timers(sk);
1835	tcp_prequeue_init(tp);
1836
1837	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1838	tp->mdev = TCP_TIMEOUT_INIT;
1839
1840	/* So many TCP implementations out there (incorrectly) count the
1841	 * initial SYN frame in their delayed-ACK and congestion control
1842	 * algorithms that we must have the following bandaid to talk
1843	 * efficiently to them.  -DaveM
1844	 */
1845	tp->snd_cwnd = 2;
1846
1847	/* See draft-stevens-tcpca-spec-01 for discussion of the
1848	 * initialization of these values.
1849	 */
1850	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1851	tp->snd_cwnd_clamp = ~0;
1852	tp->mss_cache = 536;
1853
1854	tp->reordering = sysctl_tcp_reordering;
1855	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1856
1857	sk->sk_state = TCP_CLOSE;
1858
1859	sk->sk_write_space = sk_stream_write_space;
1860	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1861
1862	icsk->icsk_af_ops = &ipv4_specific;
1863	icsk->icsk_sync_mss = tcp_sync_mss;
1864#ifdef CONFIG_TCP_MD5SIG
1865	tp->af_specific = &tcp_sock_ipv4_specific;
1866#endif
1867
1868	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1869	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1870
1871	atomic_inc(&tcp_sockets_allocated);
1872
1873	return 0;
1874}
1875
1876int tcp_v4_destroy_sock(struct sock *sk)
1877{
1878	struct tcp_sock *tp = tcp_sk(sk);
1879
1880	tcp_clear_xmit_timers(sk);
1881
1882	tcp_cleanup_congestion_control(sk);
1883
1884	/* Clean up the write buffer. */
1885	tcp_write_queue_purge(sk);
1886
1887	/* Cleans up our, hopefully empty, out_of_order_queue. */
1888	__skb_queue_purge(&tp->out_of_order_queue);
1889
1890#ifdef CONFIG_TCP_MD5SIG
1891	/* Clean up the MD5 key list, if any */
1892	if (tp->md5sig_info) {
1893		tcp_v4_clear_md5_list(sk);
1894		kfree(tp->md5sig_info);
1895		tp->md5sig_info = NULL;
1896	}
1897#endif
1898
1899#ifdef CONFIG_NET_DMA
1900	/* Cleans up our sk_async_wait_queue */
1901	__skb_queue_purge(&sk->sk_async_wait_queue);
1902#endif
1903
1904	/* Clean prequeue, it must be empty really */
1905	__skb_queue_purge(&tp->ucopy.prequeue);
1906
1907	/* Clean up a referenced TCP bind bucket. */
1908	if (inet_csk(sk)->icsk_bind_hash)
1909		inet_put_port(sk);
1910
1911	/*
1912	 * If sendmsg cached page exists, toss it.
1913	 */
1914	if (sk->sk_sndmsg_page) {
1915		__free_page(sk->sk_sndmsg_page);
1916		sk->sk_sndmsg_page = NULL;
1917	}
1918
1919	if (tp->defer_tcp_accept.request) {
1920		reqsk_free(tp->defer_tcp_accept.request);
1921		sock_put(tp->defer_tcp_accept.listen_sk);
1922		sock_put(sk);
1923		tp->defer_tcp_accept.listen_sk = NULL;
1924		tp->defer_tcp_accept.request = NULL;
1925	}
1926
1927	atomic_dec(&tcp_sockets_allocated);
1928
1929	return 0;
1930}
1931
1932EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933
1934#ifdef CONFIG_PROC_FS
1935/* Proc filesystem TCP sock list dumping. */
1936
1937static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1938{
1939	return hlist_empty(head) ? NULL :
1940		list_entry(head->first, struct inet_timewait_sock, tw_node);
1941}
1942
1943static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1944{
1945	return tw->tw_node.next ?
1946		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1947}
1948
1949static void *listening_get_next(struct seq_file *seq, void *cur)
1950{
1951	struct inet_connection_sock *icsk;
1952	struct hlist_node *node;
1953	struct sock *sk = cur;
1954	struct tcp_iter_state *st = seq->private;
1955	struct net *net = seq_file_net(seq);
1956
1957	if (!sk) {
1958		st->bucket = 0;
1959		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1960		goto get_sk;
1961	}
1962
1963	++st->num;
1964
1965	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1966		struct request_sock *req = cur;
1967
1968		icsk = inet_csk(st->syn_wait_sk);
1969		req = req->dl_next;
1970		while (1) {
1971			while (req) {
1972				if (req->rsk_ops->family == st->family &&
1973				    net_eq(sock_net(req->sk), net)) {
1974					cur = req;
1975					goto out;
1976				}
1977				req = req->dl_next;
1978			}
1979			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1980				break;
1981get_req:
1982			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1983		}
1984		sk	  = sk_next(st->syn_wait_sk);
1985		st->state = TCP_SEQ_STATE_LISTENING;
1986		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1987	} else {
1988		icsk = inet_csk(sk);
1989		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1990		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1991			goto start_req;
1992		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1993		sk = sk_next(sk);
1994	}
1995get_sk:
1996	sk_for_each_from(sk, node) {
1997		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1998			cur = sk;
1999			goto out;
2000		}
2001		icsk = inet_csk(sk);
2002		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2003		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2004start_req:
2005			st->uid		= sock_i_uid(sk);
2006			st->syn_wait_sk = sk;
2007			st->state	= TCP_SEQ_STATE_OPENREQ;
2008			st->sbucket	= 0;
2009			goto get_req;
2010		}
2011		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012	}
2013	if (++st->bucket < INET_LHTABLE_SIZE) {
2014		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2015		goto get_sk;
2016	}
2017	cur = NULL;
2018out:
2019	return cur;
2020}
2021
2022static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2023{
2024	void *rc = listening_get_next(seq, NULL);
2025
2026	while (rc && *pos) {
2027		rc = listening_get_next(seq, rc);
2028		--*pos;
2029	}
2030	return rc;
2031}
2032
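/* Find the first entry in the established hash: scan each ehash bucket,
 * reporting established sockets before that bucket's time-wait chain.
 * The per-bucket lock is left held for the entry that is returned.
 */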
2033static void *established_get_first(struct seq_file *seq)
2034{
2035	struct tcp_iter_state *st = seq->private;
2036	struct net *net = seq_file_net(seq);
2037	void *rc = NULL;
2038
2039	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2040		struct sock *sk;
2041		struct hlist_node *node;
2042		struct inet_timewait_sock *tw;
2043		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2044
2045		read_lock_bh(lock);
2046		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2047			if (sk->sk_family != st->family ||
2048			    !net_eq(sock_net(sk), net)) {
2049				continue;
2050			}
2051			rc = sk;
2052			goto out;
2053		}
2054		st->state = TCP_SEQ_STATE_TIME_WAIT;
2055		inet_twsk_for_each(tw, node,
2056				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2057			if (tw->tw_family != st->family ||
2058			    !net_eq(twsk_net(tw), net)) {
2059				continue;
2060			}
2061			rc = tw;
2062			goto out;
2063		}
2064		read_unlock_bh(lock);
2065		st->state = TCP_SEQ_STATE_ESTABLISHED;
2066	}
2067out:
2068	return rc;
2069}
2070
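/* Advance within the established/time-wait tables, moving to the next
 * bucket (and switching the per-bucket lock) once the current chain is
 * exhausted.
 */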
2071static void *established_get_next(struct seq_file *seq, void *cur)
2072{
2073	struct sock *sk = cur;
2074	struct inet_timewait_sock *tw;
2075	struct hlist_node *node;
2076	struct tcp_iter_state *st = seq->private;
2077	struct net *net = seq_file_net(seq);
2078
2079	++st->num;
2080
2081	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2082		tw = cur;
2083		tw = tw_next(tw);
2084get_tw:
2085		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2086			tw = tw_next(tw);
2087		}
2088		if (tw) {
2089			cur = tw;
2090			goto out;
2091		}
2092		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2093		st->state = TCP_SEQ_STATE_ESTABLISHED;
2094
2095		if (++st->bucket < tcp_hashinfo.ehash_size) {
2096			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2097			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2098		} else {
2099			cur = NULL;
2100			goto out;
2101		}
2102	} else
2103		sk = sk_next(sk);
2104
2105	sk_for_each_from(sk, node) {
2106		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2107			goto found;
2108	}
2109
2110	st->state = TCP_SEQ_STATE_TIME_WAIT;
2111	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2112	goto get_tw;
2113found:
2114	cur = sk;
2115out:
2116	return cur;
2117}
2118
2119static void *established_get_idx(struct seq_file *seq, loff_t pos)
2120{
2121	void *rc = established_get_first(seq);
2122
2123	while (rc && pos) {
2124		rc = established_get_next(seq, rc);
2125		--pos;
2126	}
2127	return rc;
2128}
2129
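/* Position the iterator at logical offset @pos: consume the listening
 * table first (under the listening lock), then fall back to the
 * established tables.
 */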
2130static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2131{
2132	void *rc;
2133	struct tcp_iter_state *st = seq->private;
2134
2135	inet_listen_lock(&tcp_hashinfo);
2136	st->state = TCP_SEQ_STATE_LISTENING;
2137	rc	  = listening_get_idx(seq, &pos);
2138
2139	if (!rc) {
2140		inet_listen_unlock(&tcp_hashinfo);
2141		st->state = TCP_SEQ_STATE_ESTABLISHED;
2142		rc	  = established_get_idx(seq, pos);
2143	}
2144
2145	return rc;
2146}
2147
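/* seq_file iteration: start/next/stop walk the listening, established and
 * time-wait sockets in that order, tracking the current position in the
 * per-file tcp_iter_state.
 */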
2148static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2149{
2150	struct tcp_iter_state *st = seq->private;
2151	st->state = TCP_SEQ_STATE_LISTENING;
2152	st->num = 0;
2153	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154}
2155
2156static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157{
2158	void *rc = NULL;
2159	struct tcp_iter_state *st;
2160
2161	if (v == SEQ_START_TOKEN) {
2162		rc = tcp_get_idx(seq, 0);
2163		goto out;
2164	}
2165	st = seq->private;
2166
2167	switch (st->state) {
2168	case TCP_SEQ_STATE_OPENREQ:
2169	case TCP_SEQ_STATE_LISTENING:
2170		rc = listening_get_next(seq, v);
2171		if (!rc) {
2172			inet_listen_unlock(&tcp_hashinfo);
2173			st->state = TCP_SEQ_STATE_ESTABLISHED;
2174			rc	  = established_get_first(seq);
2175		}
2176		break;
2177	case TCP_SEQ_STATE_ESTABLISHED:
2178	case TCP_SEQ_STATE_TIME_WAIT:
2179		rc = established_get_next(seq, v);
2180		break;
2181	}
2182out:
2183	++*pos;
2184	return rc;
2185}
2186
2187static void tcp_seq_stop(struct seq_file *seq, void *v)
2188{
2189	struct tcp_iter_state *st = seq->private;
2190
2191	switch (st->state) {
2192	case TCP_SEQ_STATE_OPENREQ:
2193		if (v) {
2194			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2195			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2196		}
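		/* Fall through: the listening lock is dropped below as well. */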
2197	case TCP_SEQ_STATE_LISTENING:
2198		if (v != SEQ_START_TOKEN)
2199			inet_listen_unlock(&tcp_hashinfo);
2200		break;
2201	case TCP_SEQ_STATE_TIME_WAIT:
2202	case TCP_SEQ_STATE_ESTABLISHED:
2203		if (v)
2204			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2205		break;
2206	}
2207}
2208
2209static int tcp_seq_open(struct inode *inode, struct file *file)
2210{
2211	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2212	struct tcp_iter_state *s;
2213	int err;
2214
2215	err = seq_open_net(inode, file, &afinfo->seq_ops,
2216			  sizeof(struct tcp_iter_state));
2217	if (err < 0)
2218		return err;
2219
2220	s = ((struct seq_file *)file->private_data)->private;
2221	s->family		= afinfo->family;
2222	return 0;
2223}
2224
2225int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2226{
2227	int rc = 0;
2228	struct proc_dir_entry *p;
2229
2230	afinfo->seq_fops.open		= tcp_seq_open;
2231	afinfo->seq_fops.read		= seq_read;
2232	afinfo->seq_fops.llseek		= seq_lseek;
2233	afinfo->seq_fops.release	= seq_release_net;
2234
2235	afinfo->seq_ops.start		= tcp_seq_start;
2236	afinfo->seq_ops.next		= tcp_seq_next;
2237	afinfo->seq_ops.stop		= tcp_seq_stop;
2238
2239	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2240			     &afinfo->seq_fops, afinfo);
2241	if (!p)
2242		rc = -ENOMEM;
2243	return rc;
2244}
2245
2246void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2247{
2248	proc_net_remove(net, afinfo->name);
2249}
2250
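/* Format one open request (SYN_RECV) as a /proc/net/tcp line. */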
2251static void get_openreq4(struct sock *sk, struct request_sock *req,
2252			 struct seq_file *f, int i, int uid, int *len)
2253{
2254	const struct inet_request_sock *ireq = inet_rsk(req);
2255	int ttd = req->expires - jiffies;
2256
2257	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2258		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2259		i,
2260		ireq->loc_addr,
2261		ntohs(inet_sk(sk)->sport),
2262		ireq->rmt_addr,
2263		ntohs(ireq->rmt_port),
2264		TCP_SYN_RECV,
2265		0, 0, /* could print option size, but that is af dependent. */
2266		1,    /* timers active (only the expire timer) */
2267		jiffies_to_clock_t(ttd),
2268		req->retrans,
2269		uid,
2270		0,  /* non standard timer */
2271		0, /* open_requests have no inode */
2272		atomic_read(&sk->sk_refcnt),
2273		req,
2274		len);
2275}
2276
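/* Format one full TCP socket as a /proc/net/tcp line; the timer fields
 * reflect whichever of the retransmit, zero-window probe or sk_timer
 * timers is currently pending.
 */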
2277static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2278{
2279	int timer_active;
2280	unsigned long timer_expires;
2281	struct tcp_sock *tp = tcp_sk(sk);
2282	const struct inet_connection_sock *icsk = inet_csk(sk);
2283	struct inet_sock *inet = inet_sk(sk);
2284	__be32 dest = inet->daddr;
2285	__be32 src = inet->rcv_saddr;
2286	__u16 destp = ntohs(inet->dport);
2287	__u16 srcp = ntohs(inet->sport);
2288
2289	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2290		timer_active	= 1;
2291		timer_expires	= icsk->icsk_timeout;
2292	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2293		timer_active	= 4;
2294		timer_expires	= icsk->icsk_timeout;
2295	} else if (timer_pending(&sk->sk_timer)) {
2296		timer_active	= 2;
2297		timer_expires	= sk->sk_timer.expires;
2298	} else {
2299		timer_active	= 0;
2300		timer_expires	= jiffies;
2301	}
2302
2303	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2304			"%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2305		i, src, srcp, dest, destp, sk->sk_state,
2306		tp->write_seq - tp->snd_una,
2307		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2308					     (tp->rcv_nxt - tp->copied_seq),
2309		timer_active,
2310		jiffies_to_clock_t(timer_expires - jiffies),
2311		icsk->icsk_retransmits,
2312		sock_i_uid(sk),
2313		icsk->icsk_probes_out,
2314		sock_i_ino(sk),
2315		atomic_read(&sk->sk_refcnt), sk,
2316		icsk->icsk_rto,
2317		icsk->icsk_ack.ato,
2318		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2319		tp->snd_cwnd,
2320		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2321		len);
2322}
2323
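/* Format one time-wait socket as a /proc/net/tcp line. */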
2324static void get_timewait4_sock(struct inet_timewait_sock *tw,
2325			       struct seq_file *f, int i, int *len)
2326{
2327	__be32 dest, src;
2328	__u16 destp, srcp;
2329	int ttd = tw->tw_ttd - jiffies;
2330
2331	if (ttd < 0)
2332		ttd = 0;
2333
2334	dest  = tw->tw_daddr;
2335	src   = tw->tw_rcv_saddr;
2336	destp = ntohs(tw->tw_dport);
2337	srcp  = ntohs(tw->tw_sport);
2338
2339	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2340		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2341		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2342		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2343		atomic_read(&tw->tw_refcnt), tw, len);
2344}
2345
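/* Each /proc/net/tcp line is padded out to TMPSZ - 1 characters. */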
2346#define TMPSZ 150
2347
2348static int tcp4_seq_show(struct seq_file *seq, void *v)
2349{
2350	struct tcp_iter_state *st;
2351	int len;
2352
2353	if (v == SEQ_START_TOKEN) {
2354		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2355			   "  sl  local_address rem_address   st tx_queue "
2356			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2357			   "inode");
2358		goto out;
2359	}
2360	st = seq->private;
2361
2362	switch (st->state) {
2363	case TCP_SEQ_STATE_LISTENING:
2364	case TCP_SEQ_STATE_ESTABLISHED:
2365		get_tcp4_sock(v, seq, st->num, &len);
2366		break;
2367	case TCP_SEQ_STATE_OPENREQ:
2368		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2369		break;
2370	case TCP_SEQ_STATE_TIME_WAIT:
2371		get_timewait4_sock(v, seq, st->num, &len);
2372		break;
2373	}
2374	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2375out:
2376	return 0;
2377}
2378
2379static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2380	.name		= "tcp",
2381	.family		= AF_INET,
2382	.seq_fops	= {
2383		.owner		= THIS_MODULE,
2384	},
2385	.seq_ops	= {
2386		.show		= tcp4_seq_show,
2387	},
2388};
2389
2390static int tcp4_proc_init_net(struct net *net)
2391{
2392	return tcp_proc_register(net, &tcp4_seq_afinfo);
2393}
2394
2395static void tcp4_proc_exit_net(struct net *net)
2396{
2397	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2398}
2399
2400static struct pernet_operations tcp4_net_ops = {
2401	.init = tcp4_proc_init_net,
2402	.exit = tcp4_proc_exit_net,
2403};
2404
2405int __init tcp4_proc_init(void)
2406{
2407	return register_pernet_subsys(&tcp4_net_ops);
2408}
2409
2410void tcp4_proc_exit(void)
2411{
2412	unregister_pernet_subsys(&tcp4_net_ops);
2413}
2414#endif /* CONFIG_PROC_FS */
2415
2416struct proto tcp_prot = {
2417	.name			= "TCP",
2418	.owner			= THIS_MODULE,
2419	.close			= tcp_close,
2420	.connect		= tcp_v4_connect,
2421	.disconnect		= tcp_disconnect,
2422	.accept			= inet_csk_accept,
2423	.ioctl			= tcp_ioctl,
2424	.init			= tcp_v4_init_sock,
2425	.destroy		= tcp_v4_destroy_sock,
2426	.shutdown		= tcp_shutdown,
2427	.setsockopt		= tcp_setsockopt,
2428	.getsockopt		= tcp_getsockopt,
2429	.recvmsg		= tcp_recvmsg,
2430	.backlog_rcv		= tcp_v4_do_rcv,
2431	.hash			= inet_hash,
2432	.unhash			= inet_unhash,
2433	.get_port		= inet_csk_get_port,
2434	.enter_memory_pressure	= tcp_enter_memory_pressure,
2435	.sockets_allocated	= &tcp_sockets_allocated,
2436	.orphan_count		= &tcp_orphan_count,
2437	.memory_allocated	= &tcp_memory_allocated,
2438	.memory_pressure	= &tcp_memory_pressure,
2439	.sysctl_mem		= sysctl_tcp_mem,
2440	.sysctl_wmem		= sysctl_tcp_wmem,
2441	.sysctl_rmem		= sysctl_tcp_rmem,
2442	.max_header		= MAX_TCP_HEADER,
2443	.obj_size		= sizeof(struct tcp_sock),
2444	.twsk_prot		= &tcp_timewait_sock_ops,
2445	.rsk_prot		= &tcp_request_sock_ops,
2446	.h.hashinfo		= &tcp_hashinfo,
2447#ifdef CONFIG_COMPAT
2448	.compat_setsockopt	= compat_tcp_setsockopt,
2449	.compat_getsockopt	= compat_tcp_getsockopt,
2450#endif
2451};
2452
2453
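/* Per-namespace setup: create the kernel-internal control socket kept in
 * net->ipv4.tcp_sock, used when TCP must send packets (e.g. RSTs) that
 * are not associated with a full socket.
 */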
2454static int __net_init tcp_sk_init(struct net *net)
2455{
2456	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2457				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2458}
2459
2460static void __net_exit tcp_sk_exit(struct net *net)
2461{
2462	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2463}
2464
2465static struct pernet_operations __net_initdata tcp_sk_ops = {
2466	.init = tcp_sk_init,
2467	.exit = tcp_sk_exit,
2468};
2469
2470void __init tcp_v4_init(void)
2471{
2472	if (register_pernet_device(&tcp_sk_ops))
2473		panic("Failed to create the TCP control socket.\n");
2474}
2475
2476EXPORT_SYMBOL(ipv4_specific);
2477EXPORT_SYMBOL(tcp_hashinfo);
2478EXPORT_SYMBOL(tcp_prot);
2479EXPORT_SYMBOL(tcp_v4_conn_request);
2480EXPORT_SYMBOL(tcp_v4_connect);
2481EXPORT_SYMBOL(tcp_v4_do_rcv);
2482EXPORT_SYMBOL(tcp_v4_remember_stamp);
2483EXPORT_SYMBOL(tcp_v4_send_check);
2484EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2485
2486#ifdef CONFIG_PROC_FS
2487EXPORT_SYMBOL(tcp_proc_register);
2488EXPORT_SYMBOL(tcp_proc_unregister);
2489#endif
2490EXPORT_SYMBOL(sysctl_tcp_low_latency);
2491
2492