tcp_ipv4.c revision 547b792cac0a038b9dbf958d3c120df3740b5572
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45 *					coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/types.h>
55#include <linux/fcntl.h>
56#include <linux/module.h>
57#include <linux/random.h>
58#include <linux/cache.h>
59#include <linux/jhash.h>
60#include <linux/init.h>
61#include <linux/times.h>
62
63#include <net/net_namespace.h>
64#include <net/icmp.h>
65#include <net/inet_hashtables.h>
66#include <net/tcp.h>
67#include <net/transp_v6.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/timewait_sock.h>
71#include <net/xfrm.h>
72#include <net/netdma.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79
80#include <linux/crypto.h>
81#include <linux/scatterlist.h>
82
83int sysctl_tcp_tw_reuse __read_mostly;
84int sysctl_tcp_low_latency __read_mostly;
85
86
87#ifdef CONFIG_TCP_MD5SIG
88static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89						   __be32 addr);
90static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91			       __be32 daddr, __be32 saddr, struct tcphdr *th);
92#else
93static inline
94struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95{
96	return NULL;
97}
98#endif
99
100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102	.lhash_users = ATOMIC_INIT(0),
103	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104};
105
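/* Initial sequence numbers are derived from the connection 4-tuple plus a
 * secret and a slowly advancing clock component (see
 * secure_tcp_sequence_number()), so distinct connections get unrelated,
 * hard-to-predict sequence spaces.
 */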
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, it is safe from the viewpoint
120	   of data integrity. Even without PAWS it is safe provided the sequence
121	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
122
123	   Actually, the idea is close to VJ's: only the timestamp cache is
124	   held not per host but per port pair, and the TW bucket is used as the
125	   state holder.
126
127	   If the TW bucket has already been destroyed, we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
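		/* Start the new connection's sequence space more than one
		 * maximum (unscaled) window past the old snd_nxt, so stray
		 * segments from the previous incarnation cannot land inside
		 * the new sequence space.
		 */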
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144
145EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
147/* This will initiate an outgoing connection. */
148int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149{
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153	struct rtable *rt;
154	__be32 daddr, nexthop;
155	int tmp;
156	int err;
157
158	if (addr_len < sizeof(struct sockaddr_in))
159		return -EINVAL;
160
161	if (usin->sin_family != AF_INET)
162		return -EAFNOSUPPORT;
163
164	nexthop = daddr = usin->sin_addr.s_addr;
165	if (inet->opt && inet->opt->srr) {
166		if (!daddr)
167			return -EINVAL;
168		nexthop = inet->opt->faddr;
169	}
170
171	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173			       IPPROTO_TCP,
174			       inet->sport, usin->sin_port, sk, 1);
175	if (tmp < 0) {
176		if (tmp == -ENETUNREACH)
177			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178		return tmp;
179	}
180
181	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182		ip_rt_put(rt);
183		return -ENETUNREACH;
184	}
185
186	if (!inet->opt || !inet->opt->srr)
187		daddr = rt->rt_dst;
188
189	if (!inet->saddr)
190		inet->saddr = rt->rt_src;
191	inet->rcv_saddr = inet->saddr;
192
193	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194		/* Reset inherited state */
195		tp->rx_opt.ts_recent	   = 0;
196		tp->rx_opt.ts_recent_stamp = 0;
197		tp->write_seq		   = 0;
198	}
199
200	if (tcp_death_row.sysctl_tw_recycle &&
201	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202		struct inet_peer *peer = rt_get_peer(rt);
203		/*
204		 * VJ's idea. We save the last timestamp seen from
205		 * the destination in the peer table when entering
206		 * TIME-WAIT state and initialize rx_opt.ts_recent from it
207		 * when trying a new connection.
208		 */
209		if (peer != NULL &&
210		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212			tp->rx_opt.ts_recent = peer->tcp_ts;
213		}
214	}
215
216	inet->dport = usin->sin_port;
217	inet->daddr = daddr;
218
219	inet_csk(sk)->icsk_ext_hdr_len = 0;
220	if (inet->opt)
221		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
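	/* Until the peer advertises an MSS option, clamp to the RFC 1122
	 * default of 536 bytes (576-byte minimum reassembly size minus
	 * 40 bytes of IP + TCP headers).
	 */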
223	tp->rx_opt.mss_clamp = 536;
224
225	/* Socket identity is still unknown (sport may be zero).
226	 * However, we set the state to SYN-SENT and, without releasing the
227	 * socket lock, select a source port, enter ourselves into the hash
228	 * tables and complete initialization after this.
229	 */
230	tcp_set_state(sk, TCP_SYN_SENT);
231	err = inet_hash_connect(&tcp_death_row, sk);
232	if (err)
233		goto failure;
234
235	err = ip_route_newports(&rt, IPPROTO_TCP,
236				inet->sport, inet->dport, sk);
237	if (err)
238		goto failure;
239
240	/* OK, now commit destination to socket.  */
241	sk->sk_gso_type = SKB_GSO_TCPV4;
242	sk_setup_caps(sk, &rt->u.dst);
243
244	if (!tp->write_seq)
245		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246							   inet->daddr,
247							   inet->sport,
248							   usin->sin_port);
249
250	inet->id = tp->write_seq ^ jiffies;
251
252	err = tcp_connect(sk);
253	rt = NULL;
254	if (err)
255		goto failure;
256
257	return 0;
258
259failure:
260	/*
261	 * This unhashes the socket and releases the local port,
262	 * if necessary.
263	 */
264	tcp_set_state(sk, TCP_CLOSE);
265	ip_rt_put(rt);
266	sk->sk_route_caps = 0;
267	inet->dport = 0;
268	return err;
269}
270
271/*
272 * This routine does path mtu discovery as defined in RFC1191.
273 */
274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275{
276	struct dst_entry *dst;
277	struct inet_sock *inet = inet_sk(sk);
278
279	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280	 * sent out by Linux are always < 576 bytes, so they should go through
281	 * unfragmented).
282	 */
283	if (sk->sk_state == TCP_LISTEN)
284		return;
285
286	/* We don't check in the dst entry whether pmtu discovery is forbidden
287	 * on this route. We just assume that no packet-too-big messages
288	 * are sent back when pmtu discovery is not active.
289	 * There is a small race when the user changes this flag in the
290	 * route, but I think that's acceptable.
291	 */
292	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293		return;
294
295	dst->ops->update_pmtu(dst, mtu);
296
297	/* Something is about to go wrong... Remember the soft error
298	 * in case this connection is not able to recover.
299	 */
300	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301		sk->sk_err_soft = EMSGSIZE;
302
303	mtu = dst_mtu(dst);
304
305	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307		tcp_sync_mss(sk, mtu);
308
309		/* Resend the TCP packet because it's
310		 * clear that the old packet has been
311		 * dropped. This is the new "fast" path mtu
312		 * discovery.
313		 */
314		tcp_simple_retransmit(sk);
315	} /* else let the usual retransmit timer handle it */
316}
317
318/*
319 * This routine is called by the ICMP module when it gets some
320 * sort of error condition.  If err < 0 then the socket should
321 * be closed and the error returned to the user.  If err > 0
322 * it's just the icmp type << 8 | icmp code.  After adjustment
323 * header points to the first 8 bytes of the tcp header.  We need
324 * to find the appropriate port.
325 *
326 * The locking strategy used here is very "optimistic". When
327 * someone else accesses the socket, the ICMP is just dropped,
328 * and for some paths there is no check at all.
329 * A more general error queue to queue errors for later handling
330 * is probably better.
331 *
332 */
333
334void tcp_v4_err(struct sk_buff *skb, u32 info)
335{
336	struct iphdr *iph = (struct iphdr *)skb->data;
337	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338	struct tcp_sock *tp;
339	struct inet_sock *inet;
340	const int type = icmp_hdr(skb)->type;
341	const int code = icmp_hdr(skb)->code;
342	struct sock *sk;
343	__u32 seq;
344	int err;
345	struct net *net = dev_net(skb->dev);
346
347	if (skb->len < (iph->ihl << 2) + 8) {
348		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349		return;
350	}
351
352	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353			iph->saddr, th->source, inet_iif(skb));
354	if (!sk) {
355		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356		return;
357	}
358	if (sk->sk_state == TCP_TIME_WAIT) {
359		inet_twsk_put(inet_twsk(sk));
360		return;
361	}
362
363	bh_lock_sock(sk);
364	/* If too many ICMPs get dropped on busy
365	 * servers this needs to be solved differently.
366	 */
367	if (sock_owned_by_user(sk))
368		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370	if (sk->sk_state == TCP_CLOSE)
371		goto out;
372
373	tp = tcp_sk(sk);
374	seq = ntohl(th->seq);
375	if (sk->sk_state != TCP_LISTEN &&
376	    !between(seq, tp->snd_una, tp->snd_nxt)) {
377		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
378		goto out;
379	}
380
381	switch (type) {
382	case ICMP_SOURCE_QUENCH:
383		/* Just silently ignore these. */
384		goto out;
385	case ICMP_PARAMETERPROB:
386		err = EPROTO;
387		break;
388	case ICMP_DEST_UNREACH:
389		if (code > NR_ICMP_UNREACH)
390			goto out;
391
392		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393			if (!sock_owned_by_user(sk))
394				do_pmtu_discovery(sk, iph, info);
395			goto out;
396		}
397
398		err = icmp_err_convert[code].errno;
399		break;
400	case ICMP_TIME_EXCEEDED:
401		err = EHOSTUNREACH;
402		break;
403	default:
404		goto out;
405	}
406
407	switch (sk->sk_state) {
408		struct request_sock *req, **prev;
409	case TCP_LISTEN:
410		if (sock_owned_by_user(sk))
411			goto out;
412
413		req = inet_csk_search_req(sk, &prev, th->dest,
414					  iph->daddr, iph->saddr);
415		if (!req)
416			goto out;
417
418		/* ICMPs are not backlogged, hence we cannot get
419		   an established socket here.
420		 */
421		WARN_ON(req->sk);
422
423		if (seq != tcp_rsk(req)->snt_isn) {
424			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
425			goto out;
426		}
427
428		/*
429		 * Still in SYN_RECV, just remove it silently.
430		 * There is no good way to pass the error to the newly
431		 * created socket, and POSIX does not want network
432		 * errors returned from accept().
433		 */
434		inet_csk_reqsk_queue_drop(sk, req, prev);
435		goto out;
436
437	case TCP_SYN_SENT:
438	case TCP_SYN_RECV:  /* Normally cannot happen.
439			       It can, e.g., if SYNs crossed.
440			     */
441		if (!sock_owned_by_user(sk)) {
442			sk->sk_err = err;
443
444			sk->sk_error_report(sk);
445
446			tcp_done(sk);
447		} else {
448			sk->sk_err_soft = err;
449		}
450		goto out;
451	}
452
453	/* If we've already connected we will keep trying
454	 * until we time out, or the user gives up.
455	 *
456	 * rfc1122 4.2.3.9 allows us to consider as hard errors
457	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
458	 * but it is obsoleted by pmtu discovery).
459	 *
460	 * Note that in the modern internet, where routing is unreliable
461	 * and broken firewalls sit in every dark corner sending random
462	 * errors ordered by their masters, even these two messages finally
463	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
464	 *
465	 * Now we are in compliance with RFCs.
466	 *							--ANK (980905)
467	 */
468
469	inet = inet_sk(sk);
470	if (!sock_owned_by_user(sk) && inet->recverr) {
471		sk->sk_err = err;
472		sk->sk_error_report(sk);
473	} else	{ /* Only an error on timeout */
474		sk->sk_err_soft = err;
475	}
476
477out:
478	bh_unlock_sock(sk);
479	sock_put(sk);
480}
481
482/* This routine computes an IPv4 TCP checksum. */
483void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484{
485	struct inet_sock *inet = inet_sk(sk);
486	struct tcphdr *th = tcp_hdr(skb);
487
488	if (skb->ip_summed == CHECKSUM_PARTIAL) {
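	/* With CHECKSUM_PARTIAL we store the pseudo-header sum in th->check
	 * and point csum_start/csum_offset at it, so the device (or software
	 * fallback) finishes the checksum; otherwise compute it in full here.
	 */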
489		th->check = ~tcp_v4_check(len, inet->saddr,
490					  inet->daddr, 0);
491		skb->csum_start = skb_transport_header(skb) - skb->head;
492		skb->csum_offset = offsetof(struct tcphdr, check);
493	} else {
494		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495					 csum_partial((char *)th,
496						      th->doff << 2,
497						      skb->csum));
498	}
499}
500
501int tcp_v4_gso_send_check(struct sk_buff *skb)
502{
503	const struct iphdr *iph;
504	struct tcphdr *th;
505
506	if (!pskb_may_pull(skb, sizeof(*th)))
507		return -EINVAL;
508
509	iph = ip_hdr(skb);
510	th = tcp_hdr(skb);
511
512	th->check = 0;
513	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514	skb->csum_start = skb_transport_header(skb) - skb->head;
515	skb->csum_offset = offsetof(struct tcphdr, check);
516	skb->ip_summed = CHECKSUM_PARTIAL;
517	return 0;
518}
519
520/*
521 *	This routine will send an RST to the other tcp.
522 *
523 *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
524 *		      for the reset?
525 *	Answer: if a packet caused an RST, it is not for a socket
526 *		existing in our system; if it is matched to a socket,
527 *		it is just a duplicate segment or a bug in the other side's TCP.
528 *		So we build the reply based only on the parameters that
529 *		arrived with the segment.
530 *	Exception: precedence violation. We do not implement it in any case.
531 */
532
533static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534{
535	struct tcphdr *th = tcp_hdr(skb);
536	struct {
537		struct tcphdr th;
538#ifdef CONFIG_TCP_MD5SIG
539		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540#endif
541	} rep;
542	struct ip_reply_arg arg;
543#ifdef CONFIG_TCP_MD5SIG
544	struct tcp_md5sig_key *key;
545#endif
546	struct net *net;
547
548	/* Never send a reset in response to a reset. */
549	if (th->rst)
550		return;
551
552	if (skb->rtable->rt_type != RTN_LOCAL)
553		return;
554
555	/* Swap the send and the receive. */
556	memset(&rep, 0, sizeof(rep));
557	rep.th.dest   = th->source;
558	rep.th.source = th->dest;
559	rep.th.doff   = sizeof(struct tcphdr) / 4;
560	rep.th.rst    = 1;
561
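	/* Per RFC 793: if the offending segment carried an ACK, the RST uses
	 * that ACK value as its sequence number; otherwise the RST itself
	 * ACKs everything the segment occupied in sequence space.
	 */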
562	if (th->ack) {
563		rep.th.seq = th->ack_seq;
564	} else {
565		rep.th.ack = 1;
566		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567				       skb->len - (th->doff << 2));
568	}
569
570	memset(&arg, 0, sizeof(arg));
571	arg.iov[0].iov_base = (unsigned char *)&rep;
572	arg.iov[0].iov_len  = sizeof(rep.th);
573
574#ifdef CONFIG_TCP_MD5SIG
575	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576	if (key) {
577		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578				   (TCPOPT_NOP << 16) |
579				   (TCPOPT_MD5SIG << 8) |
580				   TCPOLEN_MD5SIG);
581		/* Update length and the length the header thinks exists */
582		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583		rep.th.doff = arg.iov[0].iov_len / 4;
584
585		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586				     key, ip_hdr(skb)->daddr,
587				     ip_hdr(skb)->saddr, &rep.th);
588	}
589#endif
590	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591				      ip_hdr(skb)->saddr, /* XXX */
592				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
593	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594
595	net = dev_net(skb->dst->dev);
596	ip_send_reply(net->ipv4.tcp_sock, skb,
597		      &arg, arg.iov[0].iov_len);
598
599	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
600	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
601}
602
603/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
604   outside socket context, is certainly ugly. What can I do?
605 */
606
607static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608			    u32 win, u32 ts, int oif,
609			    struct tcp_md5sig_key *key)
610{
611	struct tcphdr *th = tcp_hdr(skb);
612	struct {
613		struct tcphdr th;
614		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615#ifdef CONFIG_TCP_MD5SIG
616			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617#endif
618			];
619	} rep;
620	struct ip_reply_arg arg;
621	struct net *net = dev_net(skb->dev);
622
623	memset(&rep.th, 0, sizeof(struct tcphdr));
624	memset(&arg, 0, sizeof(arg));
625
626	arg.iov[0].iov_base = (unsigned char *)&rep;
627	arg.iov[0].iov_len  = sizeof(rep.th);
628	if (ts) {
629		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
630				   (TCPOPT_TIMESTAMP << 8) |
631				   TCPOLEN_TIMESTAMP);
632		rep.opt[1] = htonl(tcp_time_stamp);
633		rep.opt[2] = htonl(ts);
634		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
635	}
636
637	/* Swap the send and the receive. */
638	rep.th.dest    = th->source;
639	rep.th.source  = th->dest;
640	rep.th.doff    = arg.iov[0].iov_len / 4;
641	rep.th.seq     = htonl(seq);
642	rep.th.ack_seq = htonl(ack);
643	rep.th.ack     = 1;
644	rep.th.window  = htons(win);
645
646#ifdef CONFIG_TCP_MD5SIG
647	if (key) {
648		int offset = (ts) ? 3 : 0;
649
650		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
651					  (TCPOPT_NOP << 16) |
652					  (TCPOPT_MD5SIG << 8) |
653					  TCPOLEN_MD5SIG);
654		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
655		rep.th.doff = arg.iov[0].iov_len/4;
656
657		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
658				    key, ip_hdr(skb)->daddr,
659				    ip_hdr(skb)->saddr, &rep.th);
660	}
661#endif
662	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663				      ip_hdr(skb)->saddr, /* XXX */
664				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
665	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666	if (oif)
667		arg.bound_dev_if = oif;
668
669	ip_send_reply(net->ipv4.tcp_sock, skb,
670		      &arg, arg.iov[0].iov_len);
671
672	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673}
674
675static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676{
677	struct inet_timewait_sock *tw = inet_twsk(sk);
678	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682			tcptw->tw_ts_recent,
683			tw->tw_bound_dev_if,
684			tcp_twsk_md5_key(tcptw)
685			);
686
687	inet_twsk_put(tw);
688}
689
690static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
691				  struct request_sock *req)
692{
693	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695			req->ts_recent,
696			0,
697			tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
698}
699
700/*
701 *	Send a SYN-ACK after having received a SYN.
702 *	This still operates on a request_sock only, not on a big
703 *	socket.
704 */
705static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
706				struct dst_entry *dst)
707{
708	const struct inet_request_sock *ireq = inet_rsk(req);
709	int err = -1;
710	struct sk_buff * skb;
711
712	/* First, grab a route. */
713	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
714		return -1;
715
716	skb = tcp_make_synack(sk, dst, req);
717
718	if (skb) {
719		struct tcphdr *th = tcp_hdr(skb);
720
721		th->check = tcp_v4_check(skb->len,
722					 ireq->loc_addr,
723					 ireq->rmt_addr,
724					 csum_partial((char *)th, skb->len,
725						      skb->csum));
726
727		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
728					    ireq->rmt_addr,
729					    ireq->opt);
730		err = net_xmit_eval(err);
731	}
732
733	dst_release(dst);
734	return err;
735}
736
737static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
738{
739	return __tcp_v4_send_synack(sk, req, NULL);
740}
741
742/*
743 *	IPv4 request_sock destructor.
744 */
745static void tcp_v4_reqsk_destructor(struct request_sock *req)
746{
747	kfree(inet_rsk(req)->opt);
748}
749
750#ifdef CONFIG_SYN_COOKIES
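/* Rate-limit the "possible SYN flooding" message to at most once a minute. */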
751static void syn_flood_warning(struct sk_buff *skb)
752{
753	static unsigned long warntime;
754
755	if (time_after(jiffies, (warntime + HZ * 60))) {
756		warntime = jiffies;
757		printk(KERN_INFO
758		       "possible SYN flooding on port %d. Sending cookies.\n",
759		       ntohs(tcp_hdr(skb)->dest));
760	}
761}
762#endif
763
764/*
765 * Save and compile IPv4 options into the request_sock if needed.
766 */
767static struct ip_options *tcp_v4_save_options(struct sock *sk,
768					      struct sk_buff *skb)
769{
770	struct ip_options *opt = &(IPCB(skb)->opt);
771	struct ip_options *dopt = NULL;
772
773	if (opt && opt->optlen) {
774		int opt_size = optlength(opt);
775		dopt = kmalloc(opt_size, GFP_ATOMIC);
776		if (dopt) {
777			if (ip_options_echo(dopt, skb)) {
778				kfree(dopt);
779				dopt = NULL;
780			}
781		}
782	}
783	return dopt;
784}
785
786#ifdef CONFIG_TCP_MD5SIG
787/*
788 * RFC2385 MD5 checksumming requires a mapping of
789 * IP address->MD5 Key.
790 * We need to maintain these in the sk structure.
791 */
792
793/* Find the Key structure for an address.  */
794static struct tcp_md5sig_key *
795			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
796{
797	struct tcp_sock *tp = tcp_sk(sk);
798	int i;
799
800	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
801		return NULL;
802	for (i = 0; i < tp->md5sig_info->entries4; i++) {
803		if (tp->md5sig_info->keys4[i].addr == addr)
804			return &tp->md5sig_info->keys4[i].base;
805	}
806	return NULL;
807}
808
809struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
810					 struct sock *addr_sk)
811{
812	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
813}
814
815EXPORT_SYMBOL(tcp_v4_md5_lookup);
816
817static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
818						      struct request_sock *req)
819{
820	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
821}
822
823/* This can be called on a newly created socket, from other files */
824int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
825		      u8 *newkey, u8 newkeylen)
826{
827	/* Add Key to the list */
828	struct tcp_md5sig_key *key;
829	struct tcp_sock *tp = tcp_sk(sk);
830	struct tcp4_md5sig_key *keys;
831
832	key = tcp_v4_md5_do_lookup(sk, addr);
833	if (key) {
834		/* Pre-existing entry - just update that one. */
835		kfree(key->key);
836		key->key = newkey;
837		key->keylen = newkeylen;
838	} else {
839		struct tcp_md5sig_info *md5sig;
840
841		if (!tp->md5sig_info) {
842			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
843						  GFP_ATOMIC);
844			if (!tp->md5sig_info) {
845				kfree(newkey);
846				return -ENOMEM;
847			}
848			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
849		}
850		if (tcp_alloc_md5sig_pool() == NULL) {
851			kfree(newkey);
852			return -ENOMEM;
853		}
854		md5sig = tp->md5sig_info;
855
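		/* No free slot left: grow the flat key array by one entry
		 * and copy the existing keys across.
		 */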
856		if (md5sig->alloced4 == md5sig->entries4) {
857			keys = kmalloc((sizeof(*keys) *
858					(md5sig->entries4 + 1)), GFP_ATOMIC);
859			if (!keys) {
860				kfree(newkey);
861				tcp_free_md5sig_pool();
862				return -ENOMEM;
863			}
864
865			if (md5sig->entries4)
866				memcpy(keys, md5sig->keys4,
867				       sizeof(*keys) * md5sig->entries4);
868
869			/* Free old key list, and reference new one */
870			kfree(md5sig->keys4);
871			md5sig->keys4 = keys;
872			md5sig->alloced4++;
873		}
874		md5sig->entries4++;
875		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
876		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
877		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
878	}
879	return 0;
880}
881
882EXPORT_SYMBOL(tcp_v4_md5_do_add);
883
884static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
885			       u8 *newkey, u8 newkeylen)
886{
887	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
888				 newkey, newkeylen);
889}
890
891int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
892{
893	struct tcp_sock *tp = tcp_sk(sk);
894	int i;
895
896	for (i = 0; i < tp->md5sig_info->entries4; i++) {
897		if (tp->md5sig_info->keys4[i].addr == addr) {
898			/* Free the key */
899			kfree(tp->md5sig_info->keys4[i].base.key);
900			tp->md5sig_info->entries4--;
901
902			if (tp->md5sig_info->entries4 == 0) {
903				kfree(tp->md5sig_info->keys4);
904				tp->md5sig_info->keys4 = NULL;
905				tp->md5sig_info->alloced4 = 0;
906			} else if (tp->md5sig_info->entries4 != i) {
907				/* Shift the remaining keys down over the deleted entry */
908				memmove(&tp->md5sig_info->keys4[i],
909					&tp->md5sig_info->keys4[i+1],
910					(tp->md5sig_info->entries4 - i) *
911					 sizeof(struct tcp4_md5sig_key));
912			}
913			tcp_free_md5sig_pool();
914			return 0;
915		}
916	}
917	return -ENOENT;
918}
919
920EXPORT_SYMBOL(tcp_v4_md5_do_del);
921
922static void tcp_v4_clear_md5_list(struct sock *sk)
923{
924	struct tcp_sock *tp = tcp_sk(sk);
925
926	/* Free each key, then the set of keys,
927	 * the crypto element, and then decrement our
928	 * hold on the last resort crypto.
929	 */
930	if (tp->md5sig_info->entries4) {
931		int i;
932		for (i = 0; i < tp->md5sig_info->entries4; i++)
933			kfree(tp->md5sig_info->keys4[i].base.key);
934		tp->md5sig_info->entries4 = 0;
935		tcp_free_md5sig_pool();
936	}
937	if (tp->md5sig_info->keys4) {
938		kfree(tp->md5sig_info->keys4);
939		tp->md5sig_info->keys4 = NULL;
940		tp->md5sig_info->alloced4  = 0;
941	}
942}
943
944static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
945				 int optlen)
946{
947	struct tcp_md5sig cmd;
948	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
949	u8 *newkey;
950
951	if (optlen < sizeof(cmd))
952		return -EINVAL;
953
954	if (copy_from_user(&cmd, optval, sizeof(cmd)))
955		return -EFAULT;
956
957	if (sin->sin_family != AF_INET)
958		return -EINVAL;
959
960	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
961		if (!tcp_sk(sk)->md5sig_info)
962			return -ENOENT;
963		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
964	}
965
966	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
967		return -EINVAL;
968
969	if (!tcp_sk(sk)->md5sig_info) {
970		struct tcp_sock *tp = tcp_sk(sk);
971		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
972
973		if (!p)
974			return -EINVAL;
975
976		tp->md5sig_info = p;
977		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
978	}
979
980	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
981	if (!newkey)
982		return -ENOMEM;
983	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
984				 newkey, cmd.tcpm_keylen);
985}
986
987static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
988					__be32 daddr, __be32 saddr, int nbytes)
989{
990	struct tcp4_pseudohdr *bp;
991	struct scatterlist sg;
992
993	bp = &hp->md5_blk.ip4;
994
995	/*
996	 * 1. the TCP pseudo-header (in the order: source IP address,
997	 * destination IP address, zero-padded protocol number, and
998	 * segment length)
999	 */
1000	bp->saddr = saddr;
1001	bp->daddr = daddr;
1002	bp->pad = 0;
1003	bp->protocol = IPPROTO_TCP;
1004	bp->len = cpu_to_be16(nbytes);
1005
1006	sg_init_one(&sg, bp, sizeof(*bp));
1007	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008}
1009
1010static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1012{
1013	struct tcp_md5sig_pool *hp;
1014	struct hash_desc *desc;
1015
1016	hp = tcp_get_md5sig_pool();
1017	if (!hp)
1018		goto clear_hash_noput;
1019	desc = &hp->md5_desc;
1020
1021	if (crypto_hash_init(desc))
1022		goto clear_hash;
1023	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024		goto clear_hash;
1025	if (tcp_md5_hash_header(hp, th))
1026		goto clear_hash;
1027	if (tcp_md5_hash_key(hp, key))
1028		goto clear_hash;
1029	if (crypto_hash_final(desc, md5_hash))
1030		goto clear_hash;
1031
1032	tcp_put_md5sig_pool();
1033	return 0;
1034
1035clear_hash:
1036	tcp_put_md5sig_pool();
1037clear_hash_noput:
1038	memset(md5_hash, 0, 16);
1039	return 1;
1040}
1041
1042int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043			struct sock *sk, struct request_sock *req,
1044			struct sk_buff *skb)
1045{
1046	struct tcp_md5sig_pool *hp;
1047	struct hash_desc *desc;
1048	struct tcphdr *th = tcp_hdr(skb);
1049	__be32 saddr, daddr;
1050
1051	if (sk) {
1052		saddr = inet_sk(sk)->saddr;
1053		daddr = inet_sk(sk)->daddr;
1054	} else if (req) {
1055		saddr = inet_rsk(req)->loc_addr;
1056		daddr = inet_rsk(req)->rmt_addr;
1057	} else {
1058		const struct iphdr *iph = ip_hdr(skb);
1059		saddr = iph->saddr;
1060		daddr = iph->daddr;
1061	}
1062
1063	hp = tcp_get_md5sig_pool();
1064	if (!hp)
1065		goto clear_hash_noput;
1066	desc = &hp->md5_desc;
1067
1068	if (crypto_hash_init(desc))
1069		goto clear_hash;
1070
1071	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072		goto clear_hash;
1073	if (tcp_md5_hash_header(hp, th))
1074		goto clear_hash;
1075	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076		goto clear_hash;
1077	if (tcp_md5_hash_key(hp, key))
1078		goto clear_hash;
1079	if (crypto_hash_final(desc, md5_hash))
1080		goto clear_hash;
1081
1082	tcp_put_md5sig_pool();
1083	return 0;
1084
1085clear_hash:
1086	tcp_put_md5sig_pool();
1087clear_hash_noput:
1088	memset(md5_hash, 0, 16);
1089	return 1;
1090}
1091
1092EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093
1094static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095{
1096	/*
1097	 * This gets called for each TCP segment that arrives
1098	 * so we want to be efficient.
1099	 * We have 3 drop cases:
1100	 * o No MD5 hash and one expected.
1101	 * o MD5 hash and we're not expecting one.
1102	 * o MD5 hash and it's wrong.
1103	 */
1104	__u8 *hash_location = NULL;
1105	struct tcp_md5sig_key *hash_expected;
1106	const struct iphdr *iph = ip_hdr(skb);
1107	struct tcphdr *th = tcp_hdr(skb);
1108	int genhash;
1109	unsigned char newhash[16];
1110
1111	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112	hash_location = tcp_parse_md5sig_option(th);
1113
1114	/* We've parsed the options - do we have a hash? */
1115	if (!hash_expected && !hash_location)
1116		return 0;
1117
1118	if (hash_expected && !hash_location) {
1119		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1120			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1121			       NIPQUAD(iph->saddr), ntohs(th->source),
1122			       NIPQUAD(iph->daddr), ntohs(th->dest));
1123		return 1;
1124	}
1125
1126	if (!hash_expected && hash_location) {
1127		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1128			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1129			       NIPQUAD(iph->saddr), ntohs(th->source),
1130			       NIPQUAD(iph->daddr), ntohs(th->dest));
1131		return 1;
1132	}
1133
1134	/* Okay, so this is hash_expected and hash_location -
1135	 * so we need to calculate the checksum.
1136	 */
1137	genhash = tcp_v4_md5_hash_skb(newhash,
1138				      hash_expected,
1139				      NULL, NULL, skb);
1140
1141	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1142		if (net_ratelimit()) {
1143			printk(KERN_INFO "MD5 Hash failed for "
1144			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1145			       NIPQUAD(iph->saddr), ntohs(th->source),
1146			       NIPQUAD(iph->daddr), ntohs(th->dest),
1147			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1148		}
1149		return 1;
1150	}
1151	return 0;
1152}
1153
1154#endif
1155
1156struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1157	.family		=	PF_INET,
1158	.obj_size	=	sizeof(struct tcp_request_sock),
1159	.rtx_syn_ack	=	tcp_v4_send_synack,
1160	.send_ack	=	tcp_v4_reqsk_send_ack,
1161	.destructor	=	tcp_v4_reqsk_destructor,
1162	.send_reset	=	tcp_v4_send_reset,
1163};
1164
1165#ifdef CONFIG_TCP_MD5SIG
1166static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1167	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1168};
1169#endif
1170
1171static struct timewait_sock_ops tcp_timewait_sock_ops = {
1172	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1173	.twsk_unique	= tcp_twsk_unique,
1174	.twsk_destructor= tcp_twsk_destructor,
1175};
1176
1177int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1178{
1179	struct inet_request_sock *ireq;
1180	struct tcp_options_received tmp_opt;
1181	struct request_sock *req;
1182	__be32 saddr = ip_hdr(skb)->saddr;
1183	__be32 daddr = ip_hdr(skb)->daddr;
1184	__u32 isn = TCP_SKB_CB(skb)->when;
1185	struct dst_entry *dst = NULL;
1186#ifdef CONFIG_SYN_COOKIES
1187	int want_cookie = 0;
1188#else
1189#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1190#endif
1191
1192	/* Never answer SYNs sent to broadcast or multicast addresses */
1193	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1194		goto drop;
1195
1196	/* TW buckets are converted to open requests without
1197	 * limitation; they conserve resources and the peer is
1198	 * evidently a real one.
1199	 */
1200	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1201#ifdef CONFIG_SYN_COOKIES
1202		if (sysctl_tcp_syncookies) {
1203			want_cookie = 1;
1204		} else
1205#endif
1206		goto drop;
1207	}
1208
1209	/* Accept backlog is full. If we have already queued enough
1210	 * warm entries in the syn queue, drop the request. That is better than
1211	 * clogging the syn queue with openreqs with exponentially increasing
1212	 * timeouts.
1213	 */
1214	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1215		goto drop;
1216
1217	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1218	if (!req)
1219		goto drop;
1220
1221#ifdef CONFIG_TCP_MD5SIG
1222	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1223#endif
1224
1225	tcp_clear_options(&tmp_opt);
1226	tmp_opt.mss_clamp = 536;
1227	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1228
1229	tcp_parse_options(skb, &tmp_opt, 0);
1230
1231	if (want_cookie && !tmp_opt.saw_tstamp)
1232		tcp_clear_options(&tmp_opt);
1233
1234	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1235		/* Some OSes (unknown ones, but I see them on a web server that
1236		 * contains information interesting only for Windows
1237		 * users) do not send their timestamp in the SYN. It is the easy
1238		 * case: we simply do not advertise TS support.
1239		 */
1240		tmp_opt.saw_tstamp = 0;
1241		tmp_opt.tstamp_ok  = 0;
1242	}
1243	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1244
1245	tcp_openreq_init(req, &tmp_opt, skb);
1246
1247	if (security_inet_conn_request(sk, skb, req))
1248		goto drop_and_free;
1249
1250	ireq = inet_rsk(req);
1251	ireq->loc_addr = daddr;
1252	ireq->rmt_addr = saddr;
1253	ireq->opt = tcp_v4_save_options(sk, skb);
1254	if (!want_cookie)
1255		TCP_ECN_create_request(req, tcp_hdr(skb));
1256
1257	if (want_cookie) {
1258#ifdef CONFIG_SYN_COOKIES
1259		syn_flood_warning(skb);
1260		req->cookie_ts = tmp_opt.tstamp_ok;
1261#endif
1262		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1263	} else if (!isn) {
1264		struct inet_peer *peer = NULL;
1265
1266		/* VJ's idea. We save the last timestamp seen
1267		 * from the destination in the peer table when entering
1268		 * TIME-WAIT state, and check against it before
1269		 * accepting a new connection request.
1270		 *
1271		 * If "isn" is not zero, this request hit an alive
1272		 * timewait bucket, so all the necessary checks
1273		 * are made in the function processing the timewait state.
1274		 */
1275		if (tmp_opt.saw_tstamp &&
1276		    tcp_death_row.sysctl_tw_recycle &&
1277		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1278		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1279		    peer->v4daddr == saddr) {
1280			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1281			    (s32)(peer->tcp_ts - req->ts_recent) >
1282							TCP_PAWS_WINDOW) {
1283				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1284				goto drop_and_release;
1285			}
1286		}
1287		/* Kill the following clause if you dislike this heuristic. */
1288		else if (!sysctl_tcp_syncookies &&
1289			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1290			  (sysctl_max_syn_backlog >> 2)) &&
1291			 (!peer || !peer->tcp_ts_stamp) &&
1292			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1293			/* Without syncookies the last quarter of the
1294			 * backlog is reserved for destinations
1295			 * proven to be alive.
1296			 * It means that we keep communicating with
1297			 * destinations already remembered by the
1298			 * moment the synflood started.
1299			 */
1300			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1301				       "request from " NIPQUAD_FMT "/%u\n",
1302				       NIPQUAD(saddr),
1303				       ntohs(tcp_hdr(skb)->source));
1304			goto drop_and_release;
1305		}
1306
1307		isn = tcp_v4_init_sequence(skb);
1308	}
1309	tcp_rsk(req)->snt_isn = isn;
1310
1311	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1312		goto drop_and_free;
1313
1314	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1315	return 0;
1316
1317drop_and_release:
1318	dst_release(dst);
1319drop_and_free:
1320	reqsk_free(req);
1321drop:
1322	return 0;
1323}
1324
1325
1326/*
1327 * The three way handshake has completed - we got a valid synack -
1328 * now create the new socket.
1329 */
1330struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1331				  struct request_sock *req,
1332				  struct dst_entry *dst)
1333{
1334	struct inet_request_sock *ireq;
1335	struct inet_sock *newinet;
1336	struct tcp_sock *newtp;
1337	struct sock *newsk;
1338#ifdef CONFIG_TCP_MD5SIG
1339	struct tcp_md5sig_key *key;
1340#endif
1341
1342	if (sk_acceptq_is_full(sk))
1343		goto exit_overflow;
1344
1345	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1346		goto exit;
1347
1348	newsk = tcp_create_openreq_child(sk, req, skb);
1349	if (!newsk)
1350		goto exit;
1351
1352	newsk->sk_gso_type = SKB_GSO_TCPV4;
1353	sk_setup_caps(newsk, dst);
1354
1355	newtp		      = tcp_sk(newsk);
1356	newinet		      = inet_sk(newsk);
1357	ireq		      = inet_rsk(req);
1358	newinet->daddr	      = ireq->rmt_addr;
1359	newinet->rcv_saddr    = ireq->loc_addr;
1360	newinet->saddr	      = ireq->loc_addr;
1361	newinet->opt	      = ireq->opt;
1362	ireq->opt	      = NULL;
1363	newinet->mc_index     = inet_iif(skb);
1364	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1365	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1366	if (newinet->opt)
1367		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1368	newinet->id = newtp->write_seq ^ jiffies;
1369
1370	tcp_mtup_init(newsk);
1371	tcp_sync_mss(newsk, dst_mtu(dst));
1372	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1373	tcp_initialize_rcv_mss(newsk);
1374
1375#ifdef CONFIG_TCP_MD5SIG
1376	/* Copy over the MD5 key from the original socket */
1377	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1378		/*
1379		 * We're using one, so create a matching key
1380		 * on the newsk structure. If we fail to get
1381		 * memory, then we end up not copying the key
1382		 * across. Shucks.
1383		 */
1384		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1385		if (newkey != NULL)
1386			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1387					  newkey, key->keylen);
1388		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1389	}
1390#endif
1391
1392	__inet_hash_nolisten(newsk);
1393	__inet_inherit_port(sk, newsk);
1394
1395	return newsk;
1396
1397exit_overflow:
1398	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1399exit:
1400	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1401	dst_release(dst);
1402	return NULL;
1403}
1404
1405static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1406{
1407	struct tcphdr *th = tcp_hdr(skb);
1408	const struct iphdr *iph = ip_hdr(skb);
1409	struct sock *nsk;
1410	struct request_sock **prev;
1411	/* Find possible connection requests. */
1412	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1413						       iph->saddr, iph->daddr);
1414	if (req)
1415		return tcp_check_req(sk, skb, req, prev);
1416
1417	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1418			th->source, iph->daddr, th->dest, inet_iif(skb));
1419
1420	if (nsk) {
1421		if (nsk->sk_state != TCP_TIME_WAIT) {
1422			bh_lock_sock(nsk);
1423			return nsk;
1424		}
1425		inet_twsk_put(inet_twsk(nsk));
1426		return NULL;
1427	}
1428
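	/* No pending request and no established socket: if this is a bare
	 * ACK it may complete a connection for which we sent a syncookie.
	 */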
1429#ifdef CONFIG_SYN_COOKIES
1430	if (!th->rst && !th->syn && th->ack)
1431		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1432#endif
1433	return sk;
1434}
1435
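/* Validate the TCP checksum. If the device computed the full sum
 * (CHECKSUM_COMPLETE) we can verify it immediately; otherwise seed skb->csum
 * with the pseudo-header sum, verify short packets (<= 76 bytes) right away
 * and defer the rest until copy time.
 */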
1436static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1437{
1438	const struct iphdr *iph = ip_hdr(skb);
1439
1440	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1441		if (!tcp_v4_check(skb->len, iph->saddr,
1442				  iph->daddr, skb->csum)) {
1443			skb->ip_summed = CHECKSUM_UNNECESSARY;
1444			return 0;
1445		}
1446	}
1447
1448	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1449				       skb->len, IPPROTO_TCP, 0);
1450
1451	if (skb->len <= 76) {
1452		return __skb_checksum_complete(skb);
1453	}
1454	return 0;
1455}
1456
1457
1458/* The socket must have its spinlock held when we get
1459 * here.
1460 *
1461 * We have a potential double-lock case here, so even when
1462 * doing backlog processing we use the BH locking scheme.
1463 * This is because we cannot sleep with the original spinlock
1464 * held.
1465 */
1466int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1467{
1468	struct sock *rsk;
1469#ifdef CONFIG_TCP_MD5SIG
1470	/*
1471	 * We really want to reject the packet as early as possible
1472	 * if:
1473	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1474	 *  o There is an MD5 option and we're not expecting one
1475	 */
1476	if (tcp_v4_inbound_md5_hash(sk, skb))
1477		goto discard;
1478#endif
1479
1480	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1481		TCP_CHECK_TIMER(sk);
1482		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1483			rsk = sk;
1484			goto reset;
1485		}
1486		TCP_CHECK_TIMER(sk);
1487		return 0;
1488	}
1489
1490	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1491		goto csum_err;
1492
1493	if (sk->sk_state == TCP_LISTEN) {
1494		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1495		if (!nsk)
1496			goto discard;
1497
1498		if (nsk != sk) {
1499			if (tcp_child_process(sk, nsk, skb)) {
1500				rsk = nsk;
1501				goto reset;
1502			}
1503			return 0;
1504		}
1505	}
1506
1507	TCP_CHECK_TIMER(sk);
1508	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1509		rsk = sk;
1510		goto reset;
1511	}
1512	TCP_CHECK_TIMER(sk);
1513	return 0;
1514
1515reset:
1516	tcp_v4_send_reset(rsk, skb);
1517discard:
1518	kfree_skb(skb);
1519	/* Be careful here. If this function gets more complicated and
1520	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1521	 * might be destroyed here. This current version compiles correctly,
1522	 * but you have been warned.
1523	 */
1524	return 0;
1525
1526csum_err:
1527	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1528	goto discard;
1529}
1530
1531/*
1532 *	From tcp_input.c
1533 */
1534
1535int tcp_v4_rcv(struct sk_buff *skb)
1536{
1537	const struct iphdr *iph;
1538	struct tcphdr *th;
1539	struct sock *sk;
1540	int ret;
1541	struct net *net = dev_net(skb->dev);
1542
1543	if (skb->pkt_type != PACKET_HOST)
1544		goto discard_it;
1545
1546	/* Count it even if it's bad */
1547	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1548
1549	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1550		goto discard_it;
1551
1552	th = tcp_hdr(skb);
1553
1554	if (th->doff < sizeof(struct tcphdr) / 4)
1555		goto bad_packet;
1556	if (!pskb_may_pull(skb, th->doff * 4))
1557		goto discard_it;
1558
1559	/* An explanation is required here, I think.
1560	 * Packet length and doff are validated by header prediction,
1561	 * provided the case of th->doff == 0 is eliminated.
1562	 * So, we defer the checks. */
1563	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1564		goto bad_packet;
1565
1566	th = tcp_hdr(skb);
1567	iph = ip_hdr(skb);
1568	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1569	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1570				    skb->len - th->doff * 4);
1571	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1572	TCP_SKB_CB(skb)->when	 = 0;
1573	TCP_SKB_CB(skb)->flags	 = iph->tos;
1574	TCP_SKB_CB(skb)->sacked	 = 0;
1575
1576	sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1577			th->source, iph->daddr, th->dest, inet_iif(skb));
1578	if (!sk)
1579		goto no_tcp_socket;
1580
1581process:
1582	if (sk->sk_state == TCP_TIME_WAIT)
1583		goto do_time_wait;
1584
1585	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1586		goto discard_and_relse;
1587	nf_reset(skb);
1588
1589	if (sk_filter(sk, skb))
1590		goto discard_and_relse;
1591
1592	skb->dev = NULL;
1593
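	/* From here the segment takes one of three paths: immediate
	 * processing while we hold the lock, the prequeue (processed later in
	 * the receiver's context), or the backlog if the socket is currently
	 * owned by the user.
	 */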
1594	bh_lock_sock_nested(sk);
1595	ret = 0;
1596	if (!sock_owned_by_user(sk)) {
1597#ifdef CONFIG_NET_DMA
1598		struct tcp_sock *tp = tcp_sk(sk);
1599		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1600			tp->ucopy.dma_chan = get_softnet_dma();
1601		if (tp->ucopy.dma_chan)
1602			ret = tcp_v4_do_rcv(sk, skb);
1603		else
1604#endif
1605		{
1606			if (!tcp_prequeue(sk, skb))
1607				ret = tcp_v4_do_rcv(sk, skb);
1608		}
1609	} else
1610		sk_add_backlog(sk, skb);
1611	bh_unlock_sock(sk);
1612
1613	sock_put(sk);
1614
1615	return ret;
1616
1617no_tcp_socket:
1618	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1619		goto discard_it;
1620
1621	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1622bad_packet:
1623		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1624	} else {
1625		tcp_v4_send_reset(NULL, skb);
1626	}
1627
1628discard_it:
1629	/* Discard frame. */
1630	kfree_skb(skb);
1631	return 0;
1632
1633discard_and_relse:
1634	sock_put(sk);
1635	goto discard_it;
1636
1637do_time_wait:
1638	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1639		inet_twsk_put(inet_twsk(sk));
1640		goto discard_it;
1641	}
1642
1643	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1644		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1645		inet_twsk_put(inet_twsk(sk));
1646		goto discard_it;
1647	}
1648	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1649	case TCP_TW_SYN: {
1650		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1651							&tcp_hashinfo,
1652							iph->daddr, th->dest,
1653							inet_iif(skb));
1654		if (sk2) {
1655			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1656			inet_twsk_put(inet_twsk(sk));
1657			sk = sk2;
1658			goto process;
1659		}
1660		/* Fall through to ACK */
1661	}
1662	case TCP_TW_ACK:
1663		tcp_v4_timewait_ack(sk, skb);
1664		break;
1665	case TCP_TW_RST:
1666		goto no_tcp_socket;
1667	case TCP_TW_SUCCESS:;
1668	}
1669	goto discard_it;
1670}
1671
1672/* VJ's idea. Save last timestamp seen from this destination
1673 * and hold it for at least the normal timewait interval, to use for duplicate
1674 * segment detection in subsequent connections before they enter the
1675 * synchronized state.
1676 */
1677
1678int tcp_v4_remember_stamp(struct sock *sk)
1679{
1680	struct inet_sock *inet = inet_sk(sk);
1681	struct tcp_sock *tp = tcp_sk(sk);
1682	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1683	struct inet_peer *peer = NULL;
1684	int release_it = 0;
1685
1686	if (!rt || rt->rt_dst != inet->daddr) {
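	/* Prefer the peer entry cached on the route; fall back to a direct
	 * peer-table lookup (which must be released afterwards) if the route
	 * is missing or points at a different destination.
	 */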
1687		peer = inet_getpeer(inet->daddr, 1);
1688		release_it = 1;
1689	} else {
1690		if (!rt->peer)
1691			rt_bind_peer(rt, 1);
1692		peer = rt->peer;
1693	}
1694
1695	if (peer) {
1696		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1697		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1698		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1699			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1700			peer->tcp_ts = tp->rx_opt.ts_recent;
1701		}
1702		if (release_it)
1703			inet_putpeer(peer);
1704		return 1;
1705	}
1706
1707	return 0;
1708}
1709
1710int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1711{
1712	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1713
1714	if (peer) {
1715		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1716
1717		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1718		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1719		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1720			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1721			peer->tcp_ts	   = tcptw->tw_ts_recent;
1722		}
1723		inet_putpeer(peer);
1724		return 1;
1725	}
1726
1727	return 0;
1728}
1729
1730struct inet_connection_sock_af_ops ipv4_specific = {
1731	.queue_xmit	   = ip_queue_xmit,
1732	.send_check	   = tcp_v4_send_check,
1733	.rebuild_header	   = inet_sk_rebuild_header,
1734	.conn_request	   = tcp_v4_conn_request,
1735	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1736	.remember_stamp	   = tcp_v4_remember_stamp,
1737	.net_header_len	   = sizeof(struct iphdr),
1738	.setsockopt	   = ip_setsockopt,
1739	.getsockopt	   = ip_getsockopt,
1740	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1741	.sockaddr_len	   = sizeof(struct sockaddr_in),
1742	.bind_conflict	   = inet_csk_bind_conflict,
1743#ifdef CONFIG_COMPAT
1744	.compat_setsockopt = compat_ip_setsockopt,
1745	.compat_getsockopt = compat_ip_getsockopt,
1746#endif
1747};
1748
1749#ifdef CONFIG_TCP_MD5SIG
1750static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1751	.md5_lookup		= tcp_v4_md5_lookup,
1752	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1753	.md5_add		= tcp_v4_md5_add_func,
1754	.md5_parse		= tcp_v4_parse_md5_keys,
1755};
1756#endif
1757
1758/* NOTE: A lot of things are set to zero explicitly by the call to
1759 *       sk_alloc(), so they need not be done here.
1760 */
1761static int tcp_v4_init_sock(struct sock *sk)
1762{
1763	struct inet_connection_sock *icsk = inet_csk(sk);
1764	struct tcp_sock *tp = tcp_sk(sk);
1765
1766	skb_queue_head_init(&tp->out_of_order_queue);
1767	tcp_init_xmit_timers(sk);
1768	tcp_prequeue_init(tp);
1769
1770	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1771	tp->mdev = TCP_TIMEOUT_INIT;
1772
1773	/* So many TCP implementations out there (incorrectly) count the
1774	 * initial SYN frame in their delayed-ACK and congestion control
1775	 * algorithms that we must have the following bandaid to talk
1776	 * efficiently to them.  -DaveM
1777	 */
1778	tp->snd_cwnd = 2;
1779
1780	/* See draft-stevens-tcpca-spec-01 for discussion of the
1781	 * initialization of these values.
1782	 */
1783	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1784	tp->snd_cwnd_clamp = ~0;
1785	tp->mss_cache = 536;
1786
1787	tp->reordering = sysctl_tcp_reordering;
1788	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1789
1790	sk->sk_state = TCP_CLOSE;
1791
1792	sk->sk_write_space = sk_stream_write_space;
1793	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1794
1795	icsk->icsk_af_ops = &ipv4_specific;
1796	icsk->icsk_sync_mss = tcp_sync_mss;
1797#ifdef CONFIG_TCP_MD5SIG
1798	tp->af_specific = &tcp_sock_ipv4_specific;
1799#endif
1800
1801	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1802	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1803
1804	atomic_inc(&tcp_sockets_allocated);
1805
1806	return 0;
1807}
1808
1809void tcp_v4_destroy_sock(struct sock *sk)
1810{
1811	struct tcp_sock *tp = tcp_sk(sk);
1812
1813	tcp_clear_xmit_timers(sk);
1814
1815	tcp_cleanup_congestion_control(sk);
1816
1817	/* Clean up the write buffer. */
1818	tcp_write_queue_purge(sk);
1819
1820	/* Cleans up our, hopefully empty, out_of_order_queue. */
1821	__skb_queue_purge(&tp->out_of_order_queue);
1822
1823#ifdef CONFIG_TCP_MD5SIG
1824	/* Clean up the MD5 key list, if any */
1825	if (tp->md5sig_info) {
1826		tcp_v4_clear_md5_list(sk);
1827		kfree(tp->md5sig_info);
1828		tp->md5sig_info = NULL;
1829	}
1830#endif
1831
1832#ifdef CONFIG_NET_DMA
1833	/* Cleans up our sk_async_wait_queue */
1834	__skb_queue_purge(&sk->sk_async_wait_queue);
1835#endif
1836
1837	/* Clean up the prequeue; it really should be empty by now. */
1838	__skb_queue_purge(&tp->ucopy.prequeue);
1839
1840	/* Clean up a referenced TCP bind bucket. */
1841	if (inet_csk(sk)->icsk_bind_hash)
1842		inet_put_port(sk);
1843
1844	/*
1845	 * If sendmsg cached page exists, toss it.
1846	 */
1847	if (sk->sk_sndmsg_page) {
1848		__free_page(sk->sk_sndmsg_page);
1849		sk->sk_sndmsg_page = NULL;
1850	}
1851
1852	atomic_dec(&tcp_sockets_allocated);
1853}
1854
1855EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856
1857#ifdef CONFIG_PROC_FS
1858/* Proc filesystem TCP sock list dumping. */
1859
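/* The iterator walks the listening hash first (descending into each
 * listener's SYN queue), then every established-hash bucket including its
 * TIME_WAIT chain; st->state records which phase we are in.
 */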
1860static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1861{
1862	return hlist_empty(head) ? NULL :
1863		list_entry(head->first, struct inet_timewait_sock, tw_node);
1864}
1865
1866static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1867{
1868	return tw->tw_node.next ?
1869		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1870}
1871
1872static void *listening_get_next(struct seq_file *seq, void *cur)
1873{
1874	struct inet_connection_sock *icsk;
1875	struct hlist_node *node;
1876	struct sock *sk = cur;
1877	struct tcp_iter_state* st = seq->private;
1878	struct net *net = seq_file_net(seq);
1879
1880	if (!sk) {
1881		st->bucket = 0;
1882		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1883		goto get_sk;
1884	}
1885
1886	++st->num;
1887
1888	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1889		struct request_sock *req = cur;
1890
1891		icsk = inet_csk(st->syn_wait_sk);
1892		req = req->dl_next;
1893		while (1) {
1894			while (req) {
1895				if (req->rsk_ops->family == st->family) {
1896					cur = req;
1897					goto out;
1898				}
1899				req = req->dl_next;
1900			}
1901			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1902				break;
1903get_req:
1904			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1905		}
1906		sk	  = sk_next(st->syn_wait_sk);
1907		st->state = TCP_SEQ_STATE_LISTENING;
1908		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909	} else {
1910		icsk = inet_csk(sk);
1911		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1912		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1913			goto start_req;
1914		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1915		sk = sk_next(sk);
1916	}
1917get_sk:
1918	sk_for_each_from(sk, node) {
1919		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1920			cur = sk;
1921			goto out;
1922		}
1923		icsk = inet_csk(sk);
1924		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1925		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1926start_req:
1927			st->uid		= sock_i_uid(sk);
1928			st->syn_wait_sk = sk;
1929			st->state	= TCP_SEQ_STATE_OPENREQ;
1930			st->sbucket	= 0;
1931			goto get_req;
1932		}
1933		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1934	}
1935	if (++st->bucket < INET_LHTABLE_SIZE) {
1936		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1937		goto get_sk;
1938	}
1939	cur = NULL;
1940out:
1941	return cur;
1942}
1943
1944static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1945{
1946	void *rc = listening_get_next(seq, NULL);
1947
1948	while (rc && *pos) {
1949		rc = listening_get_next(seq, rc);
1950		--*pos;
1951	}
1952	return rc;
1953}
1954
1955static void *established_get_first(struct seq_file *seq)
1956{
1957	struct tcp_iter_state *st = seq->private;
1958	struct net *net = seq_file_net(seq);
1959	void *rc = NULL;
1960
1961	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1962		struct sock *sk;
1963		struct hlist_node *node;
1964		struct inet_timewait_sock *tw;
1965		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1966
1967		read_lock_bh(lock);
1968		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1969			if (sk->sk_family != st->family ||
1970			    !net_eq(sock_net(sk), net)) {
1971				continue;
1972			}
1973			rc = sk;
1974			goto out;
1975		}
1976		st->state = TCP_SEQ_STATE_TIME_WAIT;
1977		inet_twsk_for_each(tw, node,
1978				   &tcp_hashinfo.ehash[st->bucket].twchain) {
1979			if (tw->tw_family != st->family ||
1980			    !net_eq(twsk_net(tw), net)) {
1981				continue;
1982			}
1983			rc = tw;
1984			goto out;
1985		}
1986		read_unlock_bh(lock);
1987		st->state = TCP_SEQ_STATE_ESTABLISHED;
1988	}
1989out:
1990	return rc;
1991}
1992
1993static void *established_get_next(struct seq_file *seq, void *cur)
1994{
1995	struct sock *sk = cur;
1996	struct inet_timewait_sock *tw;
1997	struct hlist_node *node;
1998	struct tcp_iter_state *st = seq->private;
1999	struct net *net = seq_file_net(seq);
2000
2001	++st->num;
2002
2003	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2004		tw = cur;
2005		tw = tw_next(tw);
2006get_tw:
2007		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2008			tw = tw_next(tw);
2009		}
2010		if (tw) {
2011			cur = tw;
2012			goto out;
2013		}
2014		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2015		st->state = TCP_SEQ_STATE_ESTABLISHED;
2016
2017		if (++st->bucket < tcp_hashinfo.ehash_size) {
2018			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2019			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2020		} else {
2021			cur = NULL;
2022			goto out;
2023		}
2024	} else
2025		sk = sk_next(sk);
2026
2027	sk_for_each_from(sk, node) {
2028		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2029			goto found;
2030	}
2031
2032	st->state = TCP_SEQ_STATE_TIME_WAIT;
2033	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2034	goto get_tw;
2035found:
2036	cur = sk;
2037out:
2038	return cur;
2039}
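/*
 * established_get_first()/established_get_next() walk the ehash table.
 * Each bucket carries two chains: established sockets (.chain) and
 * TIME_WAIT sockets (.twchain); a bucket's established chain is finished
 * before its TIME_WAIT chain is started.  The per-bucket lock is held for
 * as long as the cursor sits inside a bucket and is released only when
 * advancing to the next bucket or in tcp_seq_stop().
 */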
2040
2041static void *established_get_idx(struct seq_file *seq, loff_t pos)
2042{
2043	void *rc = established_get_first(seq);
2044
2045	while (rc && pos) {
2046		rc = established_get_next(seq, rc);
2047		--pos;
2048	}
2049	return rc;
2050}
2051
2052static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2053{
2054	void *rc;
2055	struct tcp_iter_state *st = seq->private;
2056
2057	inet_listen_lock(&tcp_hashinfo);
2058	st->state = TCP_SEQ_STATE_LISTENING;
2059	rc	  = listening_get_idx(seq, &pos);
2060
2061	if (!rc) {
2062		inet_listen_unlock(&tcp_hashinfo);
2063		st->state = TCP_SEQ_STATE_ESTABLISHED;
2064		rc	  = established_get_idx(seq, pos);
2065	}
2066
2067	return rc;
2068}
2069
2070static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2071{
2072	struct tcp_iter_state *st = seq->private;
2073	st->state = TCP_SEQ_STATE_LISTENING;
2074	st->num = 0;
2075	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076}
2077
2078static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2079{
2080	void *rc = NULL;
2081	struct tcp_iter_state *st;
2082
2083	if (v == SEQ_START_TOKEN) {
2084		rc = tcp_get_idx(seq, 0);
2085		goto out;
2086	}
2087	st = seq->private;
2088
2089	switch (st->state) {
2090	case TCP_SEQ_STATE_OPENREQ:
2091	case TCP_SEQ_STATE_LISTENING:
2092		rc = listening_get_next(seq, v);
2093		if (!rc) {
2094			inet_listen_unlock(&tcp_hashinfo);
2095			st->state = TCP_SEQ_STATE_ESTABLISHED;
2096			rc	  = established_get_first(seq);
2097		}
2098		break;
2099	case TCP_SEQ_STATE_ESTABLISHED:
2100	case TCP_SEQ_STATE_TIME_WAIT:
2101		rc = established_get_next(seq, v);
2102		break;
2103	}
2104out:
2105	++*pos;
2106	return rc;
2107}
2108
2109static void tcp_seq_stop(struct seq_file *seq, void *v)
2110{
2111	struct tcp_iter_state *st = seq->private;
2112
2113	switch (st->state) {
2114	case TCP_SEQ_STATE_OPENREQ:
2115		if (v) {
2116			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2117			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2118		}
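		/* Fall through: the listening hash lock must be dropped too. */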
2119	case TCP_SEQ_STATE_LISTENING:
2120		if (v != SEQ_START_TOKEN)
2121			inet_listen_unlock(&tcp_hashinfo);
2122		break;
2123	case TCP_SEQ_STATE_TIME_WAIT:
2124	case TCP_SEQ_STATE_ESTABLISHED:
2125		if (v)
2126			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2127		break;
2128	}
2129}
2130
2131static int tcp_seq_open(struct inode *inode, struct file *file)
2132{
2133	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2134	struct tcp_iter_state *s;
2135	int err;
2136
2137	err = seq_open_net(inode, file, &afinfo->seq_ops,
2138			  sizeof(struct tcp_iter_state));
2139	if (err < 0)
2140		return err;
2141
2142	s = ((struct seq_file *)file->private_data)->private;
2143	s->family		= afinfo->family;
2144	return 0;
2145}
2146
2147int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2148{
2149	int rc = 0;
2150	struct proc_dir_entry *p;
2151
2152	afinfo->seq_fops.open		= tcp_seq_open;
2153	afinfo->seq_fops.read		= seq_read;
2154	afinfo->seq_fops.llseek		= seq_lseek;
2155	afinfo->seq_fops.release	= seq_release_net;
2156
2157	afinfo->seq_ops.start		= tcp_seq_start;
2158	afinfo->seq_ops.next		= tcp_seq_next;
2159	afinfo->seq_ops.stop		= tcp_seq_stop;
2160
2161	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2162			     &afinfo->seq_fops, afinfo);
2163	if (!p)
2164		rc = -ENOMEM;
2165	return rc;
2166}
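/*
 * A caller (see tcp4_seq_afinfo below) only has to fill in .name, .family,
 * .seq_fops.owner and .seq_ops.show; tcp_proc_register() wires up the
 * remaining file and seq_file operations and creates the per-namespace
 * /proc/net/<name> entry.
 */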
2167
2168void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2169{
2170	proc_net_remove(net, afinfo->name);
2171}
2172
2173static void get_openreq4(struct sock *sk, struct request_sock *req,
2174			 struct seq_file *f, int i, int uid, int *len)
2175{
2176	const struct inet_request_sock *ireq = inet_rsk(req);
2177	int ttd = req->expires - jiffies;
2178
2179	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2180		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2181		i,
2182		ireq->loc_addr,
2183		ntohs(inet_sk(sk)->sport),
2184		ireq->rmt_addr,
2185		ntohs(ireq->rmt_port),
2186		TCP_SYN_RECV,
2187		0, 0, /* could print option size, but that is af dependent. */
2188		1,    /* timers active (only the expire timer) */
2189		jiffies_to_clock_t(ttd),
2190		req->retrans,
2191		uid,
2192		0,  /* non standard timer */
2193		0, /* open_requests have no inode */
2194		atomic_read(&sk->sk_refcnt),
2195		req,
2196		len);
2197}
2198
2199static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2200{
2201	int timer_active;
2202	unsigned long timer_expires;
2203	struct tcp_sock *tp = tcp_sk(sk);
2204	const struct inet_connection_sock *icsk = inet_csk(sk);
2205	struct inet_sock *inet = inet_sk(sk);
2206	__be32 dest = inet->daddr;
2207	__be32 src = inet->rcv_saddr;
2208	__u16 destp = ntohs(inet->dport);
2209	__u16 srcp = ntohs(inet->sport);
2210
2211	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2212		timer_active	= 1;
2213		timer_expires	= icsk->icsk_timeout;
2214	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2215		timer_active	= 4;
2216		timer_expires	= icsk->icsk_timeout;
2217	} else if (timer_pending(&sk->sk_timer)) {
2218		timer_active	= 2;
2219		timer_expires	= sk->sk_timer.expires;
2220	} else {
2221		timer_active	= 0;
2222		timer_expires	= jiffies;
2223	}
2224
2225	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2226			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2227		i, src, srcp, dest, destp, sk->sk_state,
2228		tp->write_seq - tp->snd_una,
2229		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2230					     (tp->rcv_nxt - tp->copied_seq),
2231		timer_active,
2232		jiffies_to_clock_t(timer_expires - jiffies),
2233		icsk->icsk_retransmits,
2234		sock_i_uid(sk),
2235		icsk->icsk_probes_out,
2236		sock_i_ino(sk),
2237		atomic_read(&sk->sk_refcnt), sk,
2238		jiffies_to_clock_t(icsk->icsk_rto),
2239		jiffies_to_clock_t(icsk->icsk_ack.ato),
2240		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2241		tp->snd_cwnd,
2242		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2243		len);
2244}
2245
2246static void get_timewait4_sock(struct inet_timewait_sock *tw,
2247			       struct seq_file *f, int i, int *len)
2248{
2249	__be32 dest, src;
2250	__u16 destp, srcp;
2251	int ttd = tw->tw_ttd - jiffies;
2252
2253	if (ttd < 0)
2254		ttd = 0;
2255
2256	dest  = tw->tw_daddr;
2257	src   = tw->tw_rcv_saddr;
2258	destp = ntohs(tw->tw_dport);
2259	srcp  = ntohs(tw->tw_sport);
2260
2261	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2262		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2263		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2264		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2265		atomic_read(&tw->tw_refcnt), tw, len);
2266}
2267
2268#define TMPSZ 150
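/*
 * Every record emitted by tcp4_seq_show() below is padded with spaces to
 * TMPSZ - 1 characters plus a newline, so all lines of /proc/net/tcp have
 * the same fixed width.
 */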
2269
2270static int tcp4_seq_show(struct seq_file *seq, void *v)
2271{
2272	struct tcp_iter_state *st;
2273	int len;
2274
2275	if (v == SEQ_START_TOKEN) {
2276		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2277			   "  sl  local_address rem_address   st tx_queue "
2278			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2279			   "inode");
2280		goto out;
2281	}
2282	st = seq->private;
2283
2284	switch (st->state) {
2285	case TCP_SEQ_STATE_LISTENING:
2286	case TCP_SEQ_STATE_ESTABLISHED:
2287		get_tcp4_sock(v, seq, st->num, &len);
2288		break;
2289	case TCP_SEQ_STATE_OPENREQ:
2290		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2291		break;
2292	case TCP_SEQ_STATE_TIME_WAIT:
2293		get_timewait4_sock(v, seq, st->num, &len);
2294		break;
2295	}
2296	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2297out:
2298	return 0;
2299}
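/*
 * A sample /proc/net/tcp row as produced above.  The values are purely
 * illustrative (including the socket pointer) and assume a little-endian
 * host, where the %08X address fields appear byte-swapped:
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff810012345678 300 0 0 2 -1
 *
 * i.e. slot 0, a socket listening (state 0A == TCP_LISTEN) on
 * 127.0.0.1:22 with empty tx/rx queues, no timer pending, uid 0,
 * inode 12345 and one reference, followed by the rto, ato,
 * quick/pingpong, snd_cwnd and snd_ssthresh columns from get_tcp4_sock().
 */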
2300
2301static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2302	.name		= "tcp",
2303	.family		= AF_INET,
2304	.seq_fops	= {
2305		.owner		= THIS_MODULE,
2306	},
2307	.seq_ops	= {
2308		.show		= tcp4_seq_show,
2309	},
2310};
2311
2312static int tcp4_proc_init_net(struct net *net)
2313{
2314	return tcp_proc_register(net, &tcp4_seq_afinfo);
2315}
2316
2317static void tcp4_proc_exit_net(struct net *net)
2318{
2319	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2320}
2321
2322static struct pernet_operations tcp4_net_ops = {
2323	.init = tcp4_proc_init_net,
2324	.exit = tcp4_proc_exit_net,
2325};
2326
2327int __init tcp4_proc_init(void)
2328{
2329	return register_pernet_subsys(&tcp4_net_ops);
2330}
2331
2332void tcp4_proc_exit(void)
2333{
2334	unregister_pernet_subsys(&tcp4_net_ops);
2335}
2336#endif /* CONFIG_PROC_FS */
2337
2338struct proto tcp_prot = {
2339	.name			= "TCP",
2340	.owner			= THIS_MODULE,
2341	.close			= tcp_close,
2342	.connect		= tcp_v4_connect,
2343	.disconnect		= tcp_disconnect,
2344	.accept			= inet_csk_accept,
2345	.ioctl			= tcp_ioctl,
2346	.init			= tcp_v4_init_sock,
2347	.destroy		= tcp_v4_destroy_sock,
2348	.shutdown		= tcp_shutdown,
2349	.setsockopt		= tcp_setsockopt,
2350	.getsockopt		= tcp_getsockopt,
2351	.recvmsg		= tcp_recvmsg,
2352	.backlog_rcv		= tcp_v4_do_rcv,
2353	.hash			= inet_hash,
2354	.unhash			= inet_unhash,
2355	.get_port		= inet_csk_get_port,
2356	.enter_memory_pressure	= tcp_enter_memory_pressure,
2357	.sockets_allocated	= &tcp_sockets_allocated,
2358	.orphan_count		= &tcp_orphan_count,
2359	.memory_allocated	= &tcp_memory_allocated,
2360	.memory_pressure	= &tcp_memory_pressure,
2361	.sysctl_mem		= sysctl_tcp_mem,
2362	.sysctl_wmem		= sysctl_tcp_wmem,
2363	.sysctl_rmem		= sysctl_tcp_rmem,
2364	.max_header		= MAX_TCP_HEADER,
2365	.obj_size		= sizeof(struct tcp_sock),
2366	.twsk_prot		= &tcp_timewait_sock_ops,
2367	.rsk_prot		= &tcp_request_sock_ops,
2368	.h.hashinfo		= &tcp_hashinfo,
2369#ifdef CONFIG_COMPAT
2370	.compat_setsockopt	= compat_tcp_setsockopt,
2371	.compat_getsockopt	= compat_tcp_getsockopt,
2372#endif
2373};
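/*
 * tcp_prot is the IPv4 instance of struct proto.  In this kernel it is
 * registered with the socket layer from af_inet.c (proto_register() plus
 * the inetsw_array entry for SOCK_STREAM/IPPROTO_TCP), which is how the
 * callbacks above come to back every AF_INET TCP socket.
 */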
2374
2375
2376static int __net_init tcp_sk_init(struct net *net)
2377{
2378	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2379				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2380}
2381
2382static void __net_exit tcp_sk_exit(struct net *net)
2383{
2384	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2385}
2386
2387static struct pernet_operations __net_initdata tcp_sk_ops = {
2388	.init = tcp_sk_init,
2389	.exit = tcp_sk_exit,
2390};
2391
2392void __init tcp_v4_init(void)
2393{
2394	if (register_pernet_device(&tcp_sk_ops))
2395		panic("Failed to create the TCP control socket.\n");
2396}
2397
2398EXPORT_SYMBOL(ipv4_specific);
2399EXPORT_SYMBOL(tcp_hashinfo);
2400EXPORT_SYMBOL(tcp_prot);
2401EXPORT_SYMBOL(tcp_v4_conn_request);
2402EXPORT_SYMBOL(tcp_v4_connect);
2403EXPORT_SYMBOL(tcp_v4_do_rcv);
2404EXPORT_SYMBOL(tcp_v4_remember_stamp);
2405EXPORT_SYMBOL(tcp_v4_send_check);
2406EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2407
2408#ifdef CONFIG_PROC_FS
2409EXPORT_SYMBOL(tcp_proc_register);
2410EXPORT_SYMBOL(tcp_proc_unregister);
2411#endif
2412EXPORT_SYMBOL(sysctl_tcp_low_latency);
2413
2414