tcp_ipv4.c revision e3afe7b75ed8f809c1473ea9b39267487c187ccb
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45 *					year-long coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63
64#include <net/net_namespace.h>
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
84int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly;
86
87
88#ifdef CONFIG_TCP_MD5SIG
89static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90						   __be32 addr);
91static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92			       __be32 daddr, __be32 saddr, struct tcphdr *th);
93#else
94static inline
95struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96{
97	return NULL;
98}
99#endif
100
101struct inet_hashinfo tcp_hashinfo;
102
103static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104{
105	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106					  ip_hdr(skb)->saddr,
107					  tcp_hdr(skb)->dest,
108					  tcp_hdr(skb)->source);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
113	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114	struct tcp_sock *tp = tcp_sk(sk);
115
116	/* With PAWS, it is safe from the viewpoint
117	   of data integrity. Even without PAWS it is safe provided sequence
118	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120	   Actually, the idea is close to VJ's, only the timestamp cache is
121	   held not per host but per port pair, and the TW bucket is used as
122	   the state holder.
123
124	   If the TW bucket has already been destroyed, we fall back to VJ's
125	   scheme and use the initial timestamp retrieved from the peer table.
126	 */
127	if (tcptw->tw_ts_recent_stamp &&
128	    (twp == NULL || (sysctl_tcp_tw_reuse &&
129			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131		if (tp->write_seq == 0)
132			tp->write_seq = 1;
133		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135		sock_hold(sktw);
136		return 1;
137	}
138
139	return 0;
140}
141
142EXPORT_SYMBOL_GPL(tcp_twsk_unique);
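
/*
 * A worked example of the reuse test above (illustrative numbers only,
 * not part of the original file): with sysctl_tcp_tw_reuse enabled, a
 * TIME-WAIT bucket whose last timestamp was recorded more than one
 * second ago may be taken over.  If tw_ts_recent_stamp was stored at
 * t = 100s and a new connect() to the same address/port pair happens at
 * t = 102s, then get_seconds() - tw_ts_recent_stamp = 2 > 1, so the
 * bucket is reused and write_seq continues from tw_snd_nxt + 65535 + 2
 * to keep the old and new sequence spaces from overlapping.
 */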
143
144/* This will initiate an outgoing connection. */
145int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146{
147	struct inet_sock *inet = inet_sk(sk);
148	struct tcp_sock *tp = tcp_sk(sk);
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct rtable *rt;
151	__be32 daddr, nexthop;
152	int tmp;
153	int err;
154
155	if (addr_len < sizeof(struct sockaddr_in))
156		return -EINVAL;
157
158	if (usin->sin_family != AF_INET)
159		return -EAFNOSUPPORT;
160
161	nexthop = daddr = usin->sin_addr.s_addr;
162	if (inet->opt && inet->opt->srr) {
163		if (!daddr)
164			return -EINVAL;
165		nexthop = inet->opt->faddr;
166	}
167
168	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
169			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170			       IPPROTO_TCP,
171			       inet->sport, usin->sin_port, sk, 1);
172	if (tmp < 0) {
173		if (tmp == -ENETUNREACH)
174			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175		return tmp;
176	}
177
178	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179		ip_rt_put(rt);
180		return -ENETUNREACH;
181	}
182
183	if (!inet->opt || !inet->opt->srr)
184		daddr = rt->rt_dst;
185
186	if (!inet->saddr)
187		inet->saddr = rt->rt_src;
188	inet->rcv_saddr = inet->saddr;
189
190	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
191		/* Reset inherited state */
192		tp->rx_opt.ts_recent	   = 0;
193		tp->rx_opt.ts_recent_stamp = 0;
194		tp->write_seq		   = 0;
195	}
196
197	if (tcp_death_row.sysctl_tw_recycle &&
198	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199		struct inet_peer *peer = rt_get_peer(rt);
200		/*
201		 * VJ's idea. We save the last timestamp seen from
202		 * the destination in the peer table when entering
203		 * TIME-WAIT state and initialize rx_opt.ts_recent
204		 * from it when trying a new connection.
205		 */
206		if (peer != NULL &&
207		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
208			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209			tp->rx_opt.ts_recent = peer->tcp_ts;
210		}
211	}
212
213	inet->dport = usin->sin_port;
214	inet->daddr = daddr;
215
216	inet_csk(sk)->icsk_ext_hdr_len = 0;
217	if (inet->opt)
218		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220	tp->rx_opt.mss_clamp = 536;
221
222	/* Socket identity is still unknown (sport may be zero).
223	 * However we set the state to SYN-SENT and, without releasing the
224	 * socket lock, select a source port, enter ourselves into the hash
225	 * tables and complete initialization after this.
226	 */
227	tcp_set_state(sk, TCP_SYN_SENT);
228	err = inet_hash_connect(&tcp_death_row, sk);
229	if (err)
230		goto failure;
231
232	err = ip_route_newports(&rt, IPPROTO_TCP,
233				inet->sport, inet->dport, sk);
234	if (err)
235		goto failure;
236
237	/* OK, now commit destination to socket.  */
238	sk->sk_gso_type = SKB_GSO_TCPV4;
239	sk_setup_caps(sk, &rt->u.dst);
240
241	if (!tp->write_seq)
242		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
243							   inet->daddr,
244							   inet->sport,
245							   usin->sin_port);
246
247	inet->id = tp->write_seq ^ jiffies;
248
249	err = tcp_connect(sk);
250	rt = NULL;
251	if (err)
252		goto failure;
253
254	return 0;
255
256failure:
257	/*
258	 * This unhashes the socket and releases the local port,
259	 * if necessary.
260	 */
261	tcp_set_state(sk, TCP_CLOSE);
262	ip_rt_put(rt);
263	sk->sk_route_caps = 0;
264	inet->dport = 0;
265	return err;
266}
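
/*
 * For reference, a minimal user-space sketch of what reaches this
 * function (hypothetical example, not part of the original file): an
 * AF_INET stream socket plus connect() ends up here via
 * inet_stream_connect() and sk->sk_prot->connect:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */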
267
268/*
269 * This routine does path mtu discovery as defined in RFC1191.
270 */
271static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272{
273	struct dst_entry *dst;
274	struct inet_sock *inet = inet_sk(sk);
275
276	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277	 * sent out by Linux are always < 576 bytes so they should go through
278	 * unfragmented).
279	 */
280	if (sk->sk_state == TCP_LISTEN)
281		return;
282
283	/* We don't check in the dst entry if PMTU discovery is forbidden
284	 * on this route. We just assume that no packet-too-big packets
285	 * are sent back when PMTU discovery is not active.
286	 * There is a small race when the user changes this flag in the
287	 * route, but I think that's acceptable.
288	 */
289	if ((dst = __sk_dst_check(sk, 0)) == NULL)
290		return;
291
292	dst->ops->update_pmtu(dst, mtu);
293
294	/* Something is about to go wrong... Remember the soft error
295	 * in case this connection is not able to recover.
296	 */
297	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298		sk->sk_err_soft = EMSGSIZE;
299
300	mtu = dst_mtu(dst);
301
302	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304		tcp_sync_mss(sk, mtu);
305
306		/* Resend the TCP packet because it's
307		 * clear that the old packet has been
308		 * dropped. This is the new "fast" path mtu
309		 * discovery.
310		 */
311		tcp_simple_retransmit(sk);
312	} /* else let the usual retransmit timer handle it */
313}
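
/*
 * Worked example of the fast path above (illustrative numbers only): if
 * icsk_pmtu_cookie is currently 1500 and the ICMP fragmentation-needed
 * message reports mtu = 1400, tcp_sync_mss() shrinks the cached MSS
 * (roughly 1400 - 40 = 1360 bytes of payload for plain IPv4 + TCP
 * headers) and tcp_simple_retransmit() resends the segments that were
 * presumably dropped, instead of waiting for the retransmit timer.
 */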
314
315/*
316 * This routine is called by the ICMP module when it gets some
317 * sort of error condition.  If err < 0 then the socket should
318 * be closed and the error returned to the user.  If err > 0
319 * it's just the icmp type << 8 | icmp code.  After adjustment
320 * header points to the first 8 bytes of the tcp header.  We need
321 * to find the appropriate port.
322 *
323 * The locking strategy used here is very "optimistic". When
324 * someone else accesses the socket the ICMP is just dropped
325 * and for some paths there is no check at all.
326 * A more general error queue to queue errors for later handling
327 * is probably better.
328 *
329 */
330
331void tcp_v4_err(struct sk_buff *skb, u32 info)
332{
333	struct iphdr *iph = (struct iphdr *)skb->data;
334	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
335	struct tcp_sock *tp;
336	struct inet_sock *inet;
337	const int type = icmp_hdr(skb)->type;
338	const int code = icmp_hdr(skb)->code;
339	struct sock *sk;
340	__u32 seq;
341	int err;
342	struct net *net = dev_net(skb->dev);
343
344	if (skb->len < (iph->ihl << 2) + 8) {
345		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
346		return;
347	}
348
349	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
350			iph->saddr, th->source, inet_iif(skb));
351	if (!sk) {
352		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
353		return;
354	}
355	if (sk->sk_state == TCP_TIME_WAIT) {
356		inet_twsk_put(inet_twsk(sk));
357		return;
358	}
359
360	bh_lock_sock(sk);
361	/* If too many ICMPs get dropped on busy
362	 * servers this needs to be solved differently.
363	 */
364	if (sock_owned_by_user(sk))
365		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366
367	if (sk->sk_state == TCP_CLOSE)
368		goto out;
369
370	tp = tcp_sk(sk);
371	seq = ntohl(th->seq);
372	if (sk->sk_state != TCP_LISTEN &&
373	    !between(seq, tp->snd_una, tp->snd_nxt)) {
374		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
375		goto out;
376	}
377
378	switch (type) {
379	case ICMP_SOURCE_QUENCH:
380		/* Just silently ignore these. */
381		goto out;
382	case ICMP_PARAMETERPROB:
383		err = EPROTO;
384		break;
385	case ICMP_DEST_UNREACH:
386		if (code > NR_ICMP_UNREACH)
387			goto out;
388
389		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
390			if (!sock_owned_by_user(sk))
391				do_pmtu_discovery(sk, iph, info);
392			goto out;
393		}
394
395		err = icmp_err_convert[code].errno;
396		break;
397	case ICMP_TIME_EXCEEDED:
398		err = EHOSTUNREACH;
399		break;
400	default:
401		goto out;
402	}
403
404	switch (sk->sk_state) {
405		struct request_sock *req, **prev;
406	case TCP_LISTEN:
407		if (sock_owned_by_user(sk))
408			goto out;
409
410		req = inet_csk_search_req(sk, &prev, th->dest,
411					  iph->daddr, iph->saddr);
412		if (!req)
413			goto out;
414
415		/* ICMPs are not backlogged, hence we cannot get
416		   an established socket here.
417		 */
418		WARN_ON(req->sk);
419
420		if (seq != tcp_rsk(req)->snt_isn) {
421			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
422			goto out;
423		}
424
425		/*
426		 * Still in SYN_RECV, just remove it silently.
427		 * There is no good way to pass the error to the newly
428		 * created socket, and POSIX does not want network
429		 * errors returned from accept().
430		 */
431		inet_csk_reqsk_queue_drop(sk, req, prev);
432		goto out;
433
434	case TCP_SYN_SENT:
435	case TCP_SYN_RECV:  /* Cannot happen normally.
436			       It can, e.g., if SYNs crossed.
437			     */
438		if (!sock_owned_by_user(sk)) {
439			sk->sk_err = err;
440
441			sk->sk_error_report(sk);
442
443			tcp_done(sk);
444		} else {
445			sk->sk_err_soft = err;
446		}
447		goto out;
448	}
449
450	/* If we've already connected we will keep trying
451	 * until we time out, or the user gives up.
452	 *
453	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
454	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
455	 * but it is obsoleted by PMTU discovery).
456	 *
457	 * Note that in the modern internet, where routing is unreliable
458	 * and broken firewalls sit in every dark corner sending random
459	 * errors ordered by their masters, even these two messages finally
460	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
461	 *
462	 * Now we are in compliance with RFCs.
463	 *							--ANK (980905)
464	 */
465
466	inet = inet_sk(sk);
467	if (!sock_owned_by_user(sk) && inet->recverr) {
468		sk->sk_err = err;
469		sk->sk_error_report(sk);
470	} else	{ /* Only an error on timeout */
471		sk->sk_err_soft = err;
472	}
473
474out:
475	bh_unlock_sock(sk);
476	sock_put(sk);
477}
478
479/* This routine computes an IPv4 TCP checksum. */
480void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
481{
482	struct inet_sock *inet = inet_sk(sk);
483	struct tcphdr *th = tcp_hdr(skb);
484
485	if (skb->ip_summed == CHECKSUM_PARTIAL) {
486		th->check = ~tcp_v4_check(len, inet->saddr,
487					  inet->daddr, 0);
488		skb->csum_start = skb_transport_header(skb) - skb->head;
489		skb->csum_offset = offsetof(struct tcphdr, check);
490	} else {
491		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
492					 csum_partial(th,
493						      th->doff << 2,
494						      skb->csum));
495	}
496}
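
/*
 * Note on the two branches above: with CHECKSUM_PARTIAL only the
 * one's-complement pseudo-header sum is filled in, and csum_start /
 * csum_offset tell the device (or a later software fallback) where to
 * finish the checksum; otherwise the full checksum over the TCP header
 * and payload is computed here in software via csum_partial().
 */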
497
498int tcp_v4_gso_send_check(struct sk_buff *skb)
499{
500	const struct iphdr *iph;
501	struct tcphdr *th;
502
503	if (!pskb_may_pull(skb, sizeof(*th)))
504		return -EINVAL;
505
506	iph = ip_hdr(skb);
507	th = tcp_hdr(skb);
508
509	th->check = 0;
510	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
511	skb->csum_start = skb_transport_header(skb) - skb->head;
512	skb->csum_offset = offsetof(struct tcphdr, check);
513	skb->ip_summed = CHECKSUM_PARTIAL;
514	return 0;
515}
516
517/*
518 *	This routine will send an RST to the other tcp.
519 *
520 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
521 *		      for the reset?
522 *	Answer: if a packet caused the RST, it is not for a socket
523 *		existing in our system; if it is matched to a socket,
524 *		it is just a duplicate segment or a bug in the other
525 *		side's TCP.  So we build the reply based only on the
526 *		parameters that arrived with the segment.
527 *	Exception: precedence violation. We do not implement it in any case.
528 */
529
530static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
531{
532	struct tcphdr *th = tcp_hdr(skb);
533	struct {
534		struct tcphdr th;
535#ifdef CONFIG_TCP_MD5SIG
536		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
537#endif
538	} rep;
539	struct ip_reply_arg arg;
540#ifdef CONFIG_TCP_MD5SIG
541	struct tcp_md5sig_key *key;
542#endif
543	struct net *net;
544
545	/* Never send a reset in response to a reset. */
546	if (th->rst)
547		return;
548
549	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
550		return;
551
552	/* Swap the send and the receive. */
553	memset(&rep, 0, sizeof(rep));
554	rep.th.dest   = th->source;
555	rep.th.source = th->dest;
556	rep.th.doff   = sizeof(struct tcphdr) / 4;
557	rep.th.rst    = 1;
558
559	if (th->ack) {
560		rep.th.seq = th->ack_seq;
561	} else {
562		rep.th.ack = 1;
563		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
564				       skb->len - (th->doff << 2));
565	}
566
567	memset(&arg, 0, sizeof(arg));
568	arg.iov[0].iov_base = (unsigned char *)&rep;
569	arg.iov[0].iov_len  = sizeof(rep.th);
570
571#ifdef CONFIG_TCP_MD5SIG
572	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
573	if (key) {
574		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
575				   (TCPOPT_NOP << 16) |
576				   (TCPOPT_MD5SIG << 8) |
577				   TCPOLEN_MD5SIG);
578		/* Update length and the length the header thinks exists */
579		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
580		rep.th.doff = arg.iov[0].iov_len / 4;
581
582		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
583				     key, ip_hdr(skb)->saddr,
584				     ip_hdr(skb)->daddr, &rep.th);
585	}
586#endif
587	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
588				      ip_hdr(skb)->saddr, /* XXX */
589				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
590	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
591	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
592
593	net = dev_net(skb_dst(skb)->dev);
594	ip_send_reply(net->ipv4.tcp_sock, skb,
595		      &arg, arg.iov[0].iov_len);
596
597	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
598	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
599}
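
/*
 * Worked example of the ack_seq chosen above for a reset to a non-ACK
 * segment (illustrative numbers only): for an incoming SYN with
 * seq = 1000 and no payload, the RST carries ack_seq = 1001
 * (1000 + syn + fin + payload length), i.e. it acknowledges exactly the
 * sequence space consumed by the offending segment.
 */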
600
601/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
602   outside socket context, is certainly ugly. What can I do?
603 */
604
605static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
606			    u32 win, u32 ts, int oif,
607			    struct tcp_md5sig_key *key,
608			    int reply_flags)
609{
610	struct tcphdr *th = tcp_hdr(skb);
611	struct {
612		struct tcphdr th;
613		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
614#ifdef CONFIG_TCP_MD5SIG
615			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
616#endif
617			];
618	} rep;
619	struct ip_reply_arg arg;
620	struct net *net = dev_net(skb_dst(skb)->dev);
621
622	memset(&rep.th, 0, sizeof(struct tcphdr));
623	memset(&arg, 0, sizeof(arg));
624
625	arg.iov[0].iov_base = (unsigned char *)&rep;
626	arg.iov[0].iov_len  = sizeof(rep.th);
627	if (ts) {
628		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
629				   (TCPOPT_TIMESTAMP << 8) |
630				   TCPOLEN_TIMESTAMP);
631		rep.opt[1] = htonl(tcp_time_stamp);
632		rep.opt[2] = htonl(ts);
633		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
634	}
635
636	/* Swap the send and the receive. */
637	rep.th.dest    = th->source;
638	rep.th.source  = th->dest;
639	rep.th.doff    = arg.iov[0].iov_len / 4;
640	rep.th.seq     = htonl(seq);
641	rep.th.ack_seq = htonl(ack);
642	rep.th.ack     = 1;
643	rep.th.window  = htons(win);
644
645#ifdef CONFIG_TCP_MD5SIG
646	if (key) {
647		int offset = (ts) ? 3 : 0;
648
649		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
650					  (TCPOPT_NOP << 16) |
651					  (TCPOPT_MD5SIG << 8) |
652					  TCPOLEN_MD5SIG);
653		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
654		rep.th.doff = arg.iov[0].iov_len/4;
655
656		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
657				    key, ip_hdr(skb)->saddr,
658				    ip_hdr(skb)->daddr, &rep.th);
659	}
660#endif
661	arg.flags = reply_flags;
662	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663				      ip_hdr(skb)->saddr, /* XXX */
664				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
665	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666	if (oif)
667		arg.bound_dev_if = oif;
668
669	ip_send_reply(net->ipv4.tcp_sock, skb,
670		      &arg, arg.iov[0].iov_len);
671
672	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673}
674
675static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676{
677	struct inet_timewait_sock *tw = inet_twsk(sk);
678	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682			tcptw->tw_ts_recent,
683			tw->tw_bound_dev_if,
684			tcp_twsk_md5_key(tcptw),
685			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
686			);
687
688	inet_twsk_put(tw);
689}
690
691static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
692				  struct request_sock *req)
693{
694	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
695			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
696			req->ts_recent,
697			0,
698			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
699			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
700}
701
702/*
703 *	Send a SYN-ACK after having received a SYN.
704 *	This still operates on a request_sock only, not on a big
705 *	socket.
706 */
707static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
708				struct dst_entry *dst)
709{
710	const struct inet_request_sock *ireq = inet_rsk(req);
711	int err = -1;
712	struct sk_buff * skb;
713
714	/* First, grab a route. */
715	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
716		return -1;
717
718	skb = tcp_make_synack(sk, dst, req);
719
720	if (skb) {
721		struct tcphdr *th = tcp_hdr(skb);
722
723		th->check = tcp_v4_check(skb->len,
724					 ireq->loc_addr,
725					 ireq->rmt_addr,
726					 csum_partial(th, skb->len,
727						      skb->csum));
728
729		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
730					    ireq->rmt_addr,
731					    ireq->opt);
732		err = net_xmit_eval(err);
733	}
734
735	dst_release(dst);
736	return err;
737}
738
739static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
740{
741	return __tcp_v4_send_synack(sk, req, NULL);
742}
743
744/*
745 *	IPv4 request_sock destructor.
746 */
747static void tcp_v4_reqsk_destructor(struct request_sock *req)
748{
749	kfree(inet_rsk(req)->opt);
750}
751
752#ifdef CONFIG_SYN_COOKIES
753static void syn_flood_warning(struct sk_buff *skb)
754{
755	static unsigned long warntime;
756
757	if (time_after(jiffies, (warntime + HZ * 60))) {
758		warntime = jiffies;
759		printk(KERN_INFO
760		       "possible SYN flooding on port %d. Sending cookies.\n",
761		       ntohs(tcp_hdr(skb)->dest));
762	}
763}
764#endif
765
766/*
767 * Save and compile IPv4 options into the request_sock if needed.
768 */
769static struct ip_options *tcp_v4_save_options(struct sock *sk,
770					      struct sk_buff *skb)
771{
772	struct ip_options *opt = &(IPCB(skb)->opt);
773	struct ip_options *dopt = NULL;
774
775	if (opt && opt->optlen) {
776		int opt_size = optlength(opt);
777		dopt = kmalloc(opt_size, GFP_ATOMIC);
778		if (dopt) {
779			if (ip_options_echo(dopt, skb)) {
780				kfree(dopt);
781				dopt = NULL;
782			}
783		}
784	}
785	return dopt;
786}
787
788#ifdef CONFIG_TCP_MD5SIG
789/*
790 * RFC2385 MD5 checksumming requires a mapping of
791 * IP address->MD5 Key.
792 * We need to maintain these in the sk structure.
793 */
794
795/* Find the Key structure for an address.  */
796static struct tcp_md5sig_key *
797			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
798{
799	struct tcp_sock *tp = tcp_sk(sk);
800	int i;
801
802	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
803		return NULL;
804	for (i = 0; i < tp->md5sig_info->entries4; i++) {
805		if (tp->md5sig_info->keys4[i].addr == addr)
806			return &tp->md5sig_info->keys4[i].base;
807	}
808	return NULL;
809}
810
811struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
812					 struct sock *addr_sk)
813{
814	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
815}
816
817EXPORT_SYMBOL(tcp_v4_md5_lookup);
818
819static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
820						      struct request_sock *req)
821{
822	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
823}
824
825/* This can be called on a newly created socket, from other files */
826int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
827		      u8 *newkey, u8 newkeylen)
828{
829	/* Add Key to the list */
830	struct tcp_md5sig_key *key;
831	struct tcp_sock *tp = tcp_sk(sk);
832	struct tcp4_md5sig_key *keys;
833
834	key = tcp_v4_md5_do_lookup(sk, addr);
835	if (key) {
836		/* Pre-existing entry - just update that one. */
837		kfree(key->key);
838		key->key = newkey;
839		key->keylen = newkeylen;
840	} else {
841		struct tcp_md5sig_info *md5sig;
842
843		if (!tp->md5sig_info) {
844			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
845						  GFP_ATOMIC);
846			if (!tp->md5sig_info) {
847				kfree(newkey);
848				return -ENOMEM;
849			}
850			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
851		}
852		if (tcp_alloc_md5sig_pool() == NULL) {
853			kfree(newkey);
854			return -ENOMEM;
855		}
856		md5sig = tp->md5sig_info;
857
858		if (md5sig->alloced4 == md5sig->entries4) {
859			keys = kmalloc((sizeof(*keys) *
860					(md5sig->entries4 + 1)), GFP_ATOMIC);
861			if (!keys) {
862				kfree(newkey);
863				tcp_free_md5sig_pool();
864				return -ENOMEM;
865			}
866
867			if (md5sig->entries4)
868				memcpy(keys, md5sig->keys4,
869				       sizeof(*keys) * md5sig->entries4);
870
871			/* Free old key list, and reference new one */
872			kfree(md5sig->keys4);
873			md5sig->keys4 = keys;
874			md5sig->alloced4++;
875		}
876		md5sig->entries4++;
877		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
878		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
879		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
880	}
881	return 0;
882}
883
884EXPORT_SYMBOL(tcp_v4_md5_do_add);
885
886static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
887			       u8 *newkey, u8 newkeylen)
888{
889	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
890				 newkey, newkeylen);
891}
892
893int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
894{
895	struct tcp_sock *tp = tcp_sk(sk);
896	int i;
897
898	for (i = 0; i < tp->md5sig_info->entries4; i++) {
899		if (tp->md5sig_info->keys4[i].addr == addr) {
900			/* Free the key */
901			kfree(tp->md5sig_info->keys4[i].base.key);
902			tp->md5sig_info->entries4--;
903
904			if (tp->md5sig_info->entries4 == 0) {
905				kfree(tp->md5sig_info->keys4);
906				tp->md5sig_info->keys4 = NULL;
907				tp->md5sig_info->alloced4 = 0;
908			} else if (tp->md5sig_info->entries4 != i) {
909				/* Need to do some manipulation */
910				memmove(&tp->md5sig_info->keys4[i],
911					&tp->md5sig_info->keys4[i+1],
912					(tp->md5sig_info->entries4 - i) *
913					 sizeof(struct tcp4_md5sig_key));
914			}
915			tcp_free_md5sig_pool();
916			return 0;
917		}
918	}
919	return -ENOENT;
920}
921
922EXPORT_SYMBOL(tcp_v4_md5_do_del);
923
924static void tcp_v4_clear_md5_list(struct sock *sk)
925{
926	struct tcp_sock *tp = tcp_sk(sk);
927
928	/* Free each key, then the set of keys,
929	 * the crypto element, and then decrement our
930	 * hold on the last resort crypto.
931	 */
932	if (tp->md5sig_info->entries4) {
933		int i;
934		for (i = 0; i < tp->md5sig_info->entries4; i++)
935			kfree(tp->md5sig_info->keys4[i].base.key);
936		tp->md5sig_info->entries4 = 0;
937		tcp_free_md5sig_pool();
938	}
939	if (tp->md5sig_info->keys4) {
940		kfree(tp->md5sig_info->keys4);
941		tp->md5sig_info->keys4 = NULL;
942		tp->md5sig_info->alloced4  = 0;
943	}
944}
945
946static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
947				 int optlen)
948{
949	struct tcp_md5sig cmd;
950	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
951	u8 *newkey;
952
953	if (optlen < sizeof(cmd))
954		return -EINVAL;
955
956	if (copy_from_user(&cmd, optval, sizeof(cmd)))
957		return -EFAULT;
958
959	if (sin->sin_family != AF_INET)
960		return -EINVAL;
961
962	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
963		if (!tcp_sk(sk)->md5sig_info)
964			return -ENOENT;
965		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
966	}
967
968	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
969		return -EINVAL;
970
971	if (!tcp_sk(sk)->md5sig_info) {
972		struct tcp_sock *tp = tcp_sk(sk);
973		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
974
975		if (!p)
976			return -EINVAL;
977
978		tp->md5sig_info = p;
979		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
980	}
981
982	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
983	if (!newkey)
984		return -ENOMEM;
985	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
986				 newkey, cmd.tcpm_keylen);
987}
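
/*
 * User-space counterpart of the option parsed above (hypothetical
 * sketch; struct tcp_md5sig and TCP_MD5SIG are from <linux/tcp.h>):
 * installing a key for peer 192.0.2.1 on an existing TCP socket fd:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */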
988
989static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
990					__be32 daddr, __be32 saddr, int nbytes)
991{
992	struct tcp4_pseudohdr *bp;
993	struct scatterlist sg;
994
995	bp = &hp->md5_blk.ip4;
996
997	/*
998	 * 1. the TCP pseudo-header (in the order: source IP address,
999	 * destination IP address, zero-padded protocol number, and
1000	 * segment length)
1001	 */
1002	bp->saddr = saddr;
1003	bp->daddr = daddr;
1004	bp->pad = 0;
1005	bp->protocol = IPPROTO_TCP;
1006	bp->len = cpu_to_be16(nbytes);
1007
1008	sg_init_one(&sg, bp, sizeof(*bp));
1009	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1010}
1011
1012static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1013			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1014{
1015	struct tcp_md5sig_pool *hp;
1016	struct hash_desc *desc;
1017
1018	hp = tcp_get_md5sig_pool();
1019	if (!hp)
1020		goto clear_hash_noput;
1021	desc = &hp->md5_desc;
1022
1023	if (crypto_hash_init(desc))
1024		goto clear_hash;
1025	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1026		goto clear_hash;
1027	if (tcp_md5_hash_header(hp, th))
1028		goto clear_hash;
1029	if (tcp_md5_hash_key(hp, key))
1030		goto clear_hash;
1031	if (crypto_hash_final(desc, md5_hash))
1032		goto clear_hash;
1033
1034	tcp_put_md5sig_pool();
1035	return 0;
1036
1037clear_hash:
1038	tcp_put_md5sig_pool();
1039clear_hash_noput:
1040	memset(md5_hash, 0, 16);
1041	return 1;
1042}
1043
1044int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1045			struct sock *sk, struct request_sock *req,
1046			struct sk_buff *skb)
1047{
1048	struct tcp_md5sig_pool *hp;
1049	struct hash_desc *desc;
1050	struct tcphdr *th = tcp_hdr(skb);
1051	__be32 saddr, daddr;
1052
1053	if (sk) {
1054		saddr = inet_sk(sk)->saddr;
1055		daddr = inet_sk(sk)->daddr;
1056	} else if (req) {
1057		saddr = inet_rsk(req)->loc_addr;
1058		daddr = inet_rsk(req)->rmt_addr;
1059	} else {
1060		const struct iphdr *iph = ip_hdr(skb);
1061		saddr = iph->saddr;
1062		daddr = iph->daddr;
1063	}
1064
1065	hp = tcp_get_md5sig_pool();
1066	if (!hp)
1067		goto clear_hash_noput;
1068	desc = &hp->md5_desc;
1069
1070	if (crypto_hash_init(desc))
1071		goto clear_hash;
1072
1073	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1074		goto clear_hash;
1075	if (tcp_md5_hash_header(hp, th))
1076		goto clear_hash;
1077	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1078		goto clear_hash;
1079	if (tcp_md5_hash_key(hp, key))
1080		goto clear_hash;
1081	if (crypto_hash_final(desc, md5_hash))
1082		goto clear_hash;
1083
1084	tcp_put_md5sig_pool();
1085	return 0;
1086
1087clear_hash:
1088	tcp_put_md5sig_pool();
1089clear_hash_noput:
1090	memset(md5_hash, 0, 16);
1091	return 1;
1092}
1093
1094EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1095
1096static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1097{
1098	/*
1099	 * This gets called for each TCP segment that arrives
1100	 * so we want to be efficient.
1101	 * We have 3 drop cases:
1102	 * o No MD5 hash and one expected.
1103	 * o MD5 hash and we're not expecting one.
1104	 * o MD5 hash and it's wrong.
1105	 */
1106	__u8 *hash_location = NULL;
1107	struct tcp_md5sig_key *hash_expected;
1108	const struct iphdr *iph = ip_hdr(skb);
1109	struct tcphdr *th = tcp_hdr(skb);
1110	int genhash;
1111	unsigned char newhash[16];
1112
1113	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1114	hash_location = tcp_parse_md5sig_option(th);
1115
1116	/* We've parsed the options - do we have a hash? */
1117	if (!hash_expected && !hash_location)
1118		return 0;
1119
1120	if (hash_expected && !hash_location) {
1121		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1122		return 1;
1123	}
1124
1125	if (!hash_expected && hash_location) {
1126		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1127		return 1;
1128	}
1129
1130	/* Okay, so this is hash_expected and hash_location -
1131	 * so we need to calculate the checksum.
1132	 */
1133	genhash = tcp_v4_md5_hash_skb(newhash,
1134				      hash_expected,
1135				      NULL, NULL, skb);
1136
1137	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1138		if (net_ratelimit()) {
1139			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1140			       &iph->saddr, ntohs(th->source),
1141			       &iph->daddr, ntohs(th->dest),
1142			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1143		}
1144		return 1;
1145	}
1146	return 0;
1147}
1148
1149#endif
1150
1151struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1152	.family		=	PF_INET,
1153	.obj_size	=	sizeof(struct tcp_request_sock),
1154	.rtx_syn_ack	=	tcp_v4_send_synack,
1155	.send_ack	=	tcp_v4_reqsk_send_ack,
1156	.destructor	=	tcp_v4_reqsk_destructor,
1157	.send_reset	=	tcp_v4_send_reset,
1158};
1159
1160#ifdef CONFIG_TCP_MD5SIG
1161static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1162	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1163	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1164};
1165#endif
1166
1167static struct timewait_sock_ops tcp_timewait_sock_ops = {
1168	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1169	.twsk_unique	= tcp_twsk_unique,
1170	.twsk_destructor= tcp_twsk_destructor,
1171};
1172
1173int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1174{
1175	struct inet_request_sock *ireq;
1176	struct tcp_options_received tmp_opt;
1177	struct request_sock *req;
1178	__be32 saddr = ip_hdr(skb)->saddr;
1179	__be32 daddr = ip_hdr(skb)->daddr;
1180	__u32 isn = TCP_SKB_CB(skb)->when;
1181	struct dst_entry *dst = NULL;
1182#ifdef CONFIG_SYN_COOKIES
1183	int want_cookie = 0;
1184#else
1185#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1186#endif
1187
1188	/* Never answer SYNs sent to broadcast or multicast */
1189	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1190		goto drop;
1191
1192	/* TW buckets are converted to open requests without
1193	 * limitation: they conserve resources and the peer is
1194	 * evidently a real one.
1195	 */
1196	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1197#ifdef CONFIG_SYN_COOKIES
1198		if (sysctl_tcp_syncookies) {
1199			want_cookie = 1;
1200		} else
1201#endif
1202		goto drop;
1203	}
1204
1205	/* The accept backlog is full. If we have already queued enough
1206	 * warm entries in the SYN queue, drop the request. It is better than
1207	 * clogging the SYN queue with openreqs with exponentially
1208	 * increasing timeouts.
1209	 */
1210	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1211		goto drop;
1212
1213	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1214	if (!req)
1215		goto drop;
1216
1217#ifdef CONFIG_TCP_MD5SIG
1218	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1219#endif
1220
1221	tcp_clear_options(&tmp_opt);
1222	tmp_opt.mss_clamp = 536;
1223	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1224
1225	tcp_parse_options(skb, &tmp_opt, 0);
1226
1227	if (want_cookie && !tmp_opt.saw_tstamp)
1228		tcp_clear_options(&tmp_opt);
1229
1230	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1231
1232	tcp_openreq_init(req, &tmp_opt, skb);
1233
1234	ireq = inet_rsk(req);
1235	ireq->loc_addr = daddr;
1236	ireq->rmt_addr = saddr;
1237	ireq->no_srccheck = inet_sk(sk)->transparent;
1238	ireq->opt = tcp_v4_save_options(sk, skb);
1239
1240	if (security_inet_conn_request(sk, skb, req))
1241		goto drop_and_free;
1242
1243	if (!want_cookie)
1244		TCP_ECN_create_request(req, tcp_hdr(skb));
1245
1246	if (want_cookie) {
1247#ifdef CONFIG_SYN_COOKIES
1248		syn_flood_warning(skb);
1249		req->cookie_ts = tmp_opt.tstamp_ok;
1250#endif
1251		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1252	} else if (!isn) {
1253		struct inet_peer *peer = NULL;
1254
1255		/* VJ's idea. We save last timestamp seen
1256		 * from the destination in peer table, when entering
1257		 * state TIME-WAIT, and check against it before
1258		 * accepting a new connection request.
1259		 *
1260		 * If "isn" is not zero, this request hit a live
1261		 * timewait bucket, so all the necessary checks
1262		 * are made in the function processing the timewait state.
1263		 */
1264		if (tmp_opt.saw_tstamp &&
1265		    tcp_death_row.sysctl_tw_recycle &&
1266		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1267		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1268		    peer->v4daddr == saddr) {
1269			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1270			    (s32)(peer->tcp_ts - req->ts_recent) >
1271							TCP_PAWS_WINDOW) {
1272				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1273				goto drop_and_release;
1274			}
1275		}
1276		/* Kill the following clause, if you dislike this way. */
1277		else if (!sysctl_tcp_syncookies &&
1278			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1279			  (sysctl_max_syn_backlog >> 2)) &&
1280			 (!peer || !peer->tcp_ts_stamp) &&
1281			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1282			/* Without syncookies, the last quarter of the
1283			 * backlog is filled with destinations proven
1284			 * to be alive.
1285			 * It means that we keep communicating with
1286			 * destinations already remembered at the
1287			 * moment of the synflood.
1288			 */
1289			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1290				       &saddr, ntohs(tcp_hdr(skb)->source));
1291			goto drop_and_release;
1292		}
1293
1294		isn = tcp_v4_init_sequence(skb);
1295	}
1296	tcp_rsk(req)->snt_isn = isn;
1297
1298	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1299		goto drop_and_free;
1300
1301	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1302	return 0;
1303
1304drop_and_release:
1305	dst_release(dst);
1306drop_and_free:
1307	reqsk_free(req);
1308drop:
1309	return 0;
1310}
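
/*
 * Worked example of the "last quarter of backlog" clause above
 * (illustrative numbers only): with sysctl_max_syn_backlog = 1024 and
 * syncookies disabled, a SYN from a destination with no recorded
 * timestamp and no cached RTT is dropped once fewer than
 * 1024 >> 2 = 256 request slots remain free, i.e. the final quarter of
 * the SYN queue is kept for peers that have already proven to be alive.
 */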
1311
1312
1313/*
1314 * The three way handshake has completed - we got a valid synack -
1315 * now create the new socket.
1316 */
1317struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1318				  struct request_sock *req,
1319				  struct dst_entry *dst)
1320{
1321	struct inet_request_sock *ireq;
1322	struct inet_sock *newinet;
1323	struct tcp_sock *newtp;
1324	struct sock *newsk;
1325#ifdef CONFIG_TCP_MD5SIG
1326	struct tcp_md5sig_key *key;
1327#endif
1328
1329	if (sk_acceptq_is_full(sk))
1330		goto exit_overflow;
1331
1332	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1333		goto exit;
1334
1335	newsk = tcp_create_openreq_child(sk, req, skb);
1336	if (!newsk)
1337		goto exit;
1338
1339	newsk->sk_gso_type = SKB_GSO_TCPV4;
1340	sk_setup_caps(newsk, dst);
1341
1342	newtp		      = tcp_sk(newsk);
1343	newinet		      = inet_sk(newsk);
1344	ireq		      = inet_rsk(req);
1345	newinet->daddr	      = ireq->rmt_addr;
1346	newinet->rcv_saddr    = ireq->loc_addr;
1347	newinet->saddr	      = ireq->loc_addr;
1348	newinet->opt	      = ireq->opt;
1349	ireq->opt	      = NULL;
1350	newinet->mc_index     = inet_iif(skb);
1351	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1352	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1353	if (newinet->opt)
1354		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1355	newinet->id = newtp->write_seq ^ jiffies;
1356
1357	tcp_mtup_init(newsk);
1358	tcp_sync_mss(newsk, dst_mtu(dst));
1359	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1360	if (tcp_sk(sk)->rx_opt.user_mss &&
1361	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1362		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1363
1364	tcp_initialize_rcv_mss(newsk);
1365
1366#ifdef CONFIG_TCP_MD5SIG
1367	/* Copy over the MD5 key from the original socket */
1368	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1369		/*
1370		 * We're using one, so create a matching key
1371		 * on the newsk structure. If we fail to get
1372		 * memory, then we end up not copying the key
1373		 * across. Shucks.
1374		 */
1375		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1376		if (newkey != NULL)
1377			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1378					  newkey, key->keylen);
1379		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1380	}
1381#endif
1382
1383	__inet_hash_nolisten(newsk);
1384	__inet_inherit_port(sk, newsk);
1385
1386	return newsk;
1387
1388exit_overflow:
1389	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1390exit:
1391	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1392	dst_release(dst);
1393	return NULL;
1394}
1395
1396static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1397{
1398	struct tcphdr *th = tcp_hdr(skb);
1399	const struct iphdr *iph = ip_hdr(skb);
1400	struct sock *nsk;
1401	struct request_sock **prev;
1402	/* Find possible connection requests. */
1403	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1404						       iph->saddr, iph->daddr);
1405	if (req)
1406		return tcp_check_req(sk, skb, req, prev);
1407
1408	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1409			th->source, iph->daddr, th->dest, inet_iif(skb));
1410
1411	if (nsk) {
1412		if (nsk->sk_state != TCP_TIME_WAIT) {
1413			bh_lock_sock(nsk);
1414			return nsk;
1415		}
1416		inet_twsk_put(inet_twsk(nsk));
1417		return NULL;
1418	}
1419
1420#ifdef CONFIG_SYN_COOKIES
1421	if (!th->rst && !th->syn && th->ack)
1422		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1423#endif
1424	return sk;
1425}
1426
1427static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1428{
1429	const struct iphdr *iph = ip_hdr(skb);
1430
1431	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1432		if (!tcp_v4_check(skb->len, iph->saddr,
1433				  iph->daddr, skb->csum)) {
1434			skb->ip_summed = CHECKSUM_UNNECESSARY;
1435			return 0;
1436		}
1437	}
1438
1439	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1440				       skb->len, IPPROTO_TCP, 0);
1441
1442	if (skb->len <= 76) {
1443		return __skb_checksum_complete(skb);
1444	}
1445	return 0;
1446}
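
/*
 * The 76-byte cut-off above is a heuristic: short segments are cheap
 * enough to verify immediately with __skb_checksum_complete(), while
 * for larger ones skb->csum is only seeded with the pseudo-header sum
 * here and the verification is deferred so it can be folded into the
 * later copy of the payload to user space.
 */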
1447
1448
1449/* The socket must have its spinlock held when we get
1450 * here.
1451 *
1452 * We have a potential double-lock case here, so even when
1453 * doing backlog processing we use the BH locking scheme.
1454 * This is because we cannot sleep with the original spinlock
1455 * held.
1456 */
1457int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1458{
1459	struct sock *rsk;
1460#ifdef CONFIG_TCP_MD5SIG
1461	/*
1462	 * We really want to reject the packet as early as possible
1463	 * if:
1464	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1465	 *  o There is an MD5 option and we're not expecting one
1466	 */
1467	if (tcp_v4_inbound_md5_hash(sk, skb))
1468		goto discard;
1469#endif
1470
1471	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1472		TCP_CHECK_TIMER(sk);
1473		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1474			rsk = sk;
1475			goto reset;
1476		}
1477		TCP_CHECK_TIMER(sk);
1478		return 0;
1479	}
1480
1481	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1482		goto csum_err;
1483
1484	if (sk->sk_state == TCP_LISTEN) {
1485		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1486		if (!nsk)
1487			goto discard;
1488
1489		if (nsk != sk) {
1490			if (tcp_child_process(sk, nsk, skb)) {
1491				rsk = nsk;
1492				goto reset;
1493			}
1494			return 0;
1495		}
1496	}
1497
1498	TCP_CHECK_TIMER(sk);
1499	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1500		rsk = sk;
1501		goto reset;
1502	}
1503	TCP_CHECK_TIMER(sk);
1504	return 0;
1505
1506reset:
1507	tcp_v4_send_reset(rsk, skb);
1508discard:
1509	kfree_skb(skb);
1510	/* Be careful here. If this function gets more complicated and
1511	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1512	 * might be destroyed here. This current version compiles correctly,
1513	 * but you have been warned.
1514	 */
1515	return 0;
1516
1517csum_err:
1518	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1519	goto discard;
1520}
1521
1522/*
1523 *	From tcp_input.c
1524 */
1525
1526int tcp_v4_rcv(struct sk_buff *skb)
1527{
1528	const struct iphdr *iph;
1529	struct tcphdr *th;
1530	struct sock *sk;
1531	int ret;
1532	struct net *net = dev_net(skb->dev);
1533
1534	if (skb->pkt_type != PACKET_HOST)
1535		goto discard_it;
1536
1537	/* Count it even if it's bad */
1538	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1539
1540	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1541		goto discard_it;
1542
1543	th = tcp_hdr(skb);
1544
1545	if (th->doff < sizeof(struct tcphdr) / 4)
1546		goto bad_packet;
1547	if (!pskb_may_pull(skb, th->doff * 4))
1548		goto discard_it;
1549
1550	/* An explanation is required here, I think.
1551	 * Packet length and doff are validated by header prediction,
1552	 * provided the case of th->doff == 0 is eliminated.
1553	 * So, we defer the checks. */
1554	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1555		goto bad_packet;
1556
1557	th = tcp_hdr(skb);
1558	iph = ip_hdr(skb);
1559	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1560	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1561				    skb->len - th->doff * 4);
1562	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1563	TCP_SKB_CB(skb)->when	 = 0;
1564	TCP_SKB_CB(skb)->flags	 = iph->tos;
1565	TCP_SKB_CB(skb)->sacked	 = 0;
1566
1567	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1568	if (!sk)
1569		goto no_tcp_socket;
1570
1571process:
1572	if (sk->sk_state == TCP_TIME_WAIT)
1573		goto do_time_wait;
1574
1575	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1576		goto discard_and_relse;
1577	nf_reset(skb);
1578
1579	if (sk_filter(sk, skb))
1580		goto discard_and_relse;
1581
1582	skb->dev = NULL;
1583
1584	bh_lock_sock_nested(sk);
1585	ret = 0;
1586	if (!sock_owned_by_user(sk)) {
1587#ifdef CONFIG_NET_DMA
1588		struct tcp_sock *tp = tcp_sk(sk);
1589		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1590			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1591		if (tp->ucopy.dma_chan)
1592			ret = tcp_v4_do_rcv(sk, skb);
1593		else
1594#endif
1595		{
1596			if (!tcp_prequeue(sk, skb))
1597				ret = tcp_v4_do_rcv(sk, skb);
1598		}
1599	} else
1600		sk_add_backlog(sk, skb);
1601	bh_unlock_sock(sk);
1602
1603	sock_put(sk);
1604
1605	return ret;
1606
1607no_tcp_socket:
1608	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1609		goto discard_it;
1610
1611	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1612bad_packet:
1613		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1614	} else {
1615		tcp_v4_send_reset(NULL, skb);
1616	}
1617
1618discard_it:
1619	/* Discard frame. */
1620	kfree_skb(skb);
1621	return 0;
1622
1623discard_and_relse:
1624	sock_put(sk);
1625	goto discard_it;
1626
1627do_time_wait:
1628	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1629		inet_twsk_put(inet_twsk(sk));
1630		goto discard_it;
1631	}
1632
1633	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1634		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1635		inet_twsk_put(inet_twsk(sk));
1636		goto discard_it;
1637	}
1638	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1639	case TCP_TW_SYN: {
1640		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1641							&tcp_hashinfo,
1642							iph->daddr, th->dest,
1643							inet_iif(skb));
1644		if (sk2) {
1645			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1646			inet_twsk_put(inet_twsk(sk));
1647			sk = sk2;
1648			goto process;
1649		}
1650		/* Fall through to ACK */
1651	}
1652	case TCP_TW_ACK:
1653		tcp_v4_timewait_ack(sk, skb);
1654		break;
1655	case TCP_TW_RST:
1656		goto no_tcp_socket;
1657	case TCP_TW_SUCCESS:;
1658	}
1659	goto discard_it;
1660}
1661
1662/* VJ's idea. Save last timestamp seen from this destination
1663 * and hold it at least for the normal timewait interval, to use it for
1664 * duplicate segment detection in subsequent connections before they
1665 * enter the synchronized state.
1666 */
1667
1668int tcp_v4_remember_stamp(struct sock *sk)
1669{
1670	struct inet_sock *inet = inet_sk(sk);
1671	struct tcp_sock *tp = tcp_sk(sk);
1672	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1673	struct inet_peer *peer = NULL;
1674	int release_it = 0;
1675
1676	if (!rt || rt->rt_dst != inet->daddr) {
1677		peer = inet_getpeer(inet->daddr, 1);
1678		release_it = 1;
1679	} else {
1680		if (!rt->peer)
1681			rt_bind_peer(rt, 1);
1682		peer = rt->peer;
1683	}
1684
1685	if (peer) {
1686		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1687		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1688		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1689			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1690			peer->tcp_ts = tp->rx_opt.ts_recent;
1691		}
1692		if (release_it)
1693			inet_putpeer(peer);
1694		return 1;
1695	}
1696
1697	return 0;
1698}
1699
1700int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1701{
1702	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1703
1704	if (peer) {
1705		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1706
1707		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1708		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1709		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1710			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1711			peer->tcp_ts	   = tcptw->tw_ts_recent;
1712		}
1713		inet_putpeer(peer);
1714		return 1;
1715	}
1716
1717	return 0;
1718}
1719
1720struct inet_connection_sock_af_ops ipv4_specific = {
1721	.queue_xmit	   = ip_queue_xmit,
1722	.send_check	   = tcp_v4_send_check,
1723	.rebuild_header	   = inet_sk_rebuild_header,
1724	.conn_request	   = tcp_v4_conn_request,
1725	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1726	.remember_stamp	   = tcp_v4_remember_stamp,
1727	.net_header_len	   = sizeof(struct iphdr),
1728	.setsockopt	   = ip_setsockopt,
1729	.getsockopt	   = ip_getsockopt,
1730	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1731	.sockaddr_len	   = sizeof(struct sockaddr_in),
1732	.bind_conflict	   = inet_csk_bind_conflict,
1733#ifdef CONFIG_COMPAT
1734	.compat_setsockopt = compat_ip_setsockopt,
1735	.compat_getsockopt = compat_ip_getsockopt,
1736#endif
1737};
1738
1739#ifdef CONFIG_TCP_MD5SIG
1740static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1741	.md5_lookup		= tcp_v4_md5_lookup,
1742	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1743	.md5_add		= tcp_v4_md5_add_func,
1744	.md5_parse		= tcp_v4_parse_md5_keys,
1745};
1746#endif
1747
1748/* NOTE: A lot of things are set to zero explicitly by the call to
1749 *       sk_alloc(), so they need not be done here.
1750 */
1751static int tcp_v4_init_sock(struct sock *sk)
1752{
1753	struct inet_connection_sock *icsk = inet_csk(sk);
1754	struct tcp_sock *tp = tcp_sk(sk);
1755
1756	skb_queue_head_init(&tp->out_of_order_queue);
1757	tcp_init_xmit_timers(sk);
1758	tcp_prequeue_init(tp);
1759
1760	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1761	tp->mdev = TCP_TIMEOUT_INIT;
1762
1763	/* So many TCP implementations out there (incorrectly) count the
1764	 * initial SYN frame in their delayed-ACK and congestion control
1765	 * algorithms that we must have the following bandaid to talk
1766	 * efficiently to them.  -DaveM
1767	 */
1768	tp->snd_cwnd = 2;
1769
1770	/* See draft-stevens-tcpca-spec-01 for discussion of the
1771	 * initialization of these values.
1772	 */
1773	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1774	tp->snd_cwnd_clamp = ~0;
1775	tp->mss_cache = 536;
1776
1777	tp->reordering = sysctl_tcp_reordering;
1778	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1779
1780	sk->sk_state = TCP_CLOSE;
1781
1782	sk->sk_write_space = sk_stream_write_space;
1783	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1784
1785	icsk->icsk_af_ops = &ipv4_specific;
1786	icsk->icsk_sync_mss = tcp_sync_mss;
1787#ifdef CONFIG_TCP_MD5SIG
1788	tp->af_specific = &tcp_sock_ipv4_specific;
1789#endif
1790
1791	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1792	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1793
1794	local_bh_disable();
1795	percpu_counter_inc(&tcp_sockets_allocated);
1796	local_bh_enable();
1797
1798	return 0;
1799}
1800
1801void tcp_v4_destroy_sock(struct sock *sk)
1802{
1803	struct tcp_sock *tp = tcp_sk(sk);
1804
1805	tcp_clear_xmit_timers(sk);
1806
1807	tcp_cleanup_congestion_control(sk);
1808
1809	/* Clean up the write buffer. */
1810	tcp_write_queue_purge(sk);
1811
1812	/* Cleans up our, hopefully empty, out_of_order_queue. */
1813	__skb_queue_purge(&tp->out_of_order_queue);
1814
1815#ifdef CONFIG_TCP_MD5SIG
1816	/* Clean up the MD5 key list, if any */
1817	if (tp->md5sig_info) {
1818		tcp_v4_clear_md5_list(sk);
1819		kfree(tp->md5sig_info);
1820		tp->md5sig_info = NULL;
1821	}
1822#endif
1823
1824#ifdef CONFIG_NET_DMA
1825	/* Cleans up our sk_async_wait_queue */
1826	__skb_queue_purge(&sk->sk_async_wait_queue);
1827#endif
1828
1829	/* Clean prequeue, it must be empty really */
1830	__skb_queue_purge(&tp->ucopy.prequeue);
1831
1832	/* Clean up a referenced TCP bind bucket. */
1833	if (inet_csk(sk)->icsk_bind_hash)
1834		inet_put_port(sk);
1835
1836	/*
1837	 * If sendmsg cached page exists, toss it.
1838	 */
1839	if (sk->sk_sndmsg_page) {
1840		__free_page(sk->sk_sndmsg_page);
1841		sk->sk_sndmsg_page = NULL;
1842	}
1843
1844	percpu_counter_dec(&tcp_sockets_allocated);
1845}
1846
1847EXPORT_SYMBOL(tcp_v4_destroy_sock);
1848
1849#ifdef CONFIG_PROC_FS
1850/* Proc filesystem TCP sock list dumping. */
1851
1852static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1853{
1854	return hlist_nulls_empty(head) ? NULL :
1855		list_entry(head->first, struct inet_timewait_sock, tw_node);
1856}
1857
1858static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1859{
1860	return !is_a_nulls(tw->tw_node.next) ?
1861		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1862}
1863
1864static void *listening_get_next(struct seq_file *seq, void *cur)
1865{
1866	struct inet_connection_sock *icsk;
1867	struct hlist_nulls_node *node;
1868	struct sock *sk = cur;
1869	struct inet_listen_hashbucket *ilb;
1870	struct tcp_iter_state *st = seq->private;
1871	struct net *net = seq_file_net(seq);
1872
1873	if (!sk) {
1874		st->bucket = 0;
1875		ilb = &tcp_hashinfo.listening_hash[0];
1876		spin_lock_bh(&ilb->lock);
1877		sk = sk_nulls_head(&ilb->head);
1878		goto get_sk;
1879	}
1880	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881	++st->num;
1882
1883	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1884		struct request_sock *req = cur;
1885
1886		icsk = inet_csk(st->syn_wait_sk);
1887		req = req->dl_next;
1888		while (1) {
1889			while (req) {
1890				if (req->rsk_ops->family == st->family) {
1891					cur = req;
1892					goto out;
1893				}
1894				req = req->dl_next;
1895			}
1896			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1897				break;
1898get_req:
1899			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1900		}
1901		sk	  = sk_next(st->syn_wait_sk);
1902		st->state = TCP_SEQ_STATE_LISTENING;
1903		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1904	} else {
1905		icsk = inet_csk(sk);
1906		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1908			goto start_req;
1909		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1910		sk = sk_next(sk);
1911	}
1912get_sk:
1913	sk_nulls_for_each_from(sk, node) {
1914		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1915			cur = sk;
1916			goto out;
1917		}
1918		icsk = inet_csk(sk);
1919		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1920		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1921start_req:
1922			st->uid		= sock_i_uid(sk);
1923			st->syn_wait_sk = sk;
1924			st->state	= TCP_SEQ_STATE_OPENREQ;
1925			st->sbucket	= 0;
1926			goto get_req;
1927		}
1928		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1929	}
1930	spin_unlock_bh(&ilb->lock);
1931	if (++st->bucket < INET_LHTABLE_SIZE) {
1932		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1933		spin_lock_bh(&ilb->lock);
1934		sk = sk_nulls_head(&ilb->head);
1935		goto get_sk;
1936	}
1937	cur = NULL;
1938out:
1939	return cur;
1940}
1941
1942static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1943{
1944	void *rc = listening_get_next(seq, NULL);
1945
1946	while (rc && *pos) {
1947		rc = listening_get_next(seq, rc);
1948		--*pos;
1949	}
1950	return rc;
1951}
1952
1953static inline int empty_bucket(struct tcp_iter_state *st)
1954{
1955	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1956		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1957}
1958
1959static void *established_get_first(struct seq_file *seq)
1960{
1961	struct tcp_iter_state *st = seq->private;
1962	struct net *net = seq_file_net(seq);
1963	void *rc = NULL;
1964
1965	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1966		struct sock *sk;
1967		struct hlist_nulls_node *node;
1968		struct inet_timewait_sock *tw;
1969		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1970
1971		/* Lockless fast path for the common case of empty buckets */
1972		if (empty_bucket(st))
1973			continue;
1974
1975		spin_lock_bh(lock);
1976		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1977			if (sk->sk_family != st->family ||
1978			    !net_eq(sock_net(sk), net)) {
1979				continue;
1980			}
1981			rc = sk;
1982			goto out;
1983		}
1984		st->state = TCP_SEQ_STATE_TIME_WAIT;
1985		inet_twsk_for_each(tw, node,
1986				   &tcp_hashinfo.ehash[st->bucket].twchain) {
1987			if (tw->tw_family != st->family ||
1988			    !net_eq(twsk_net(tw), net)) {
1989				continue;
1990			}
1991			rc = tw;
1992			goto out;
1993		}
1994		spin_unlock_bh(lock);
1995		st->state = TCP_SEQ_STATE_ESTABLISHED;
1996	}
1997out:
1998	return rc;
1999}
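/*
 * Illustrative sketch, not part of the original file: the established-hash
 * walk used by established_get_first()/_next(), reduced to counting IPv4
 * sockets.  Empty buckets are skipped locklessly, exactly as above; the
 * function name is hypothetical.
 */
#if 0	/* example only -- not compiled */
static unsigned int example_count_established_ipv4(void)
{
	unsigned int bucket, count = 0;

	for (bucket = 0; bucket < tcp_hashinfo.ehash_size; bucket++) {
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
		struct hlist_nulls_node *node;
		struct sock *sk;

		if (hlist_nulls_empty(&tcp_hashinfo.ehash[bucket].chain))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain)
			if (sk->sk_family == AF_INET)
				count++;
		spin_unlock_bh(lock);
	}
	return count;
}
#endif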
2000
2001static void *established_get_next(struct seq_file *seq, void *cur)
2002{
2003	struct sock *sk = cur;
2004	struct inet_timewait_sock *tw;
2005	struct hlist_nulls_node *node;
2006	struct tcp_iter_state *st = seq->private;
2007	struct net *net = seq_file_net(seq);
2008
2009	++st->num;
2010
2011	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2012		tw = cur;
2013		tw = tw_next(tw);
2014get_tw:
2015		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2016			tw = tw_next(tw);
2017		}
2018		if (tw) {
2019			cur = tw;
2020			goto out;
2021		}
2022		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2023		st->state = TCP_SEQ_STATE_ESTABLISHED;
2024
2025		/* Look for the next non-empty bucket */
2026		while (++st->bucket < tcp_hashinfo.ehash_size &&
2027				empty_bucket(st))
2028			;
2029		if (st->bucket >= tcp_hashinfo.ehash_size)
2030			return NULL;
2031
2032		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2033		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2034	} else
2035		sk = sk_nulls_next(sk);
2036
2037	sk_nulls_for_each_from(sk, node) {
2038		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2039			goto found;
2040	}
2041
2042	st->state = TCP_SEQ_STATE_TIME_WAIT;
2043	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2044	goto get_tw;
2045found:
2046	cur = sk;
2047out:
2048	return cur;
2049}
2050
2051static void *established_get_idx(struct seq_file *seq, loff_t pos)
2052{
2053	void *rc = established_get_first(seq);
2054
2055	while (rc && pos) {
2056		rc = established_get_next(seq, rc);
2057		--pos;
2058	}
2059	return rc;
2060}
2061
2062static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2063{
2064	void *rc;
2065	struct tcp_iter_state *st = seq->private;
2066
2067	st->state = TCP_SEQ_STATE_LISTENING;
2068	rc	  = listening_get_idx(seq, &pos);
2069
2070	if (!rc) {
2071		st->state = TCP_SEQ_STATE_ESTABLISHED;
2072		rc	  = established_get_idx(seq, pos);
2073	}
2074
2075	return rc;
2076}
2077
2078static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2079{
2080	struct tcp_iter_state *st = seq->private;
2081	st->state = TCP_SEQ_STATE_LISTENING;
2082	st->num = 0;
2083	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2084}
2085
2086static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2087{
2088	void *rc = NULL;
2089	struct tcp_iter_state *st;
2090
2091	if (v == SEQ_START_TOKEN) {
2092		rc = tcp_get_idx(seq, 0);
2093		goto out;
2094	}
2095	st = seq->private;
2096
2097	switch (st->state) {
2098	case TCP_SEQ_STATE_OPENREQ:
2099	case TCP_SEQ_STATE_LISTENING:
2100		rc = listening_get_next(seq, v);
2101		if (!rc) {
2102			st->state = TCP_SEQ_STATE_ESTABLISHED;
2103			rc	  = established_get_first(seq);
2104		}
2105		break;
2106	case TCP_SEQ_STATE_ESTABLISHED:
2107	case TCP_SEQ_STATE_TIME_WAIT:
2108		rc = established_get_next(seq, v);
2109		break;
2110	}
2111out:
2112	++*pos;
2113	return rc;
2114}
2115
2116static void tcp_seq_stop(struct seq_file *seq, void *v)
2117{
2118	struct tcp_iter_state *st = seq->private;
2119
2120	switch (st->state) {
2121	case TCP_SEQ_STATE_OPENREQ:
2122		if (v) {
2123			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2124			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2125		}
		/* fall through */
2126	case TCP_SEQ_STATE_LISTENING:
2127		if (v != SEQ_START_TOKEN)
2128			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2129		break;
2130	case TCP_SEQ_STATE_TIME_WAIT:
2131	case TCP_SEQ_STATE_ESTABLISHED:
2132		if (v)
2133			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2134		break;
2135	}
2136}
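/*
 * Illustrative sketch, not part of the original file: the seq_file
 * iterator contract that tcp_seq_start/next/stop implement above --
 * ->start() positions the cursor (SEQ_START_TOKEN selects the header
 * line), ->next() advances it and bumps *pos, ->stop() releases whatever
 * lock the walk left held.  Shown here for a trivial, hypothetical fixed
 * array with nothing to lock.
 */
#if 0	/* example only -- not compiled */
static const char *example_names[] = { "alpha", "beta", "gamma" };

static void *example_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos < (loff_t)ARRAY_SIZE(example_names) ?
		&example_names[*pos] : NULL;
}

static void *example_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return *pos < (loff_t)ARRAY_SIZE(example_names) ?
		&example_names[*pos] : NULL;
}

static void example_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to unlock in this trivial example */
}

static int example_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%s\n", *(const char **)v);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_seq_start,
	.next	= example_seq_next,
	.stop	= example_seq_stop,
	.show	= example_seq_show,
};
#endif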
2137
2138static int tcp_seq_open(struct inode *inode, struct file *file)
2139{
2140	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2141	struct tcp_iter_state *s;
2142	int err;
2143
2144	err = seq_open_net(inode, file, &afinfo->seq_ops,
2145			  sizeof(struct tcp_iter_state));
2146	if (err < 0)
2147		return err;
2148
2149	s = ((struct seq_file *)file->private_data)->private;
2150	s->family		= afinfo->family;
2151	return 0;
2152}
2153
2154int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2155{
2156	int rc = 0;
2157	struct proc_dir_entry *p;
2158
2159	afinfo->seq_fops.open		= tcp_seq_open;
2160	afinfo->seq_fops.read		= seq_read;
2161	afinfo->seq_fops.llseek		= seq_lseek;
2162	afinfo->seq_fops.release	= seq_release_net;
2163
2164	afinfo->seq_ops.start		= tcp_seq_start;
2165	afinfo->seq_ops.next		= tcp_seq_next;
2166	afinfo->seq_ops.stop		= tcp_seq_stop;
2167
2168	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2169			     &afinfo->seq_fops, afinfo);
2170	if (!p)
2171		rc = -ENOMEM;
2172	return rc;
2173}
2174
2175void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2176{
2177	proc_net_remove(net, afinfo->name);
2178}
2179
2180static void get_openreq4(struct sock *sk, struct request_sock *req,
2181			 struct seq_file *f, int i, int uid, int *len)
2182{
2183	const struct inet_request_sock *ireq = inet_rsk(req);
2184	int ttd = req->expires - jiffies;
2185
2186	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2187		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2188		i,
2189		ireq->loc_addr,
2190		ntohs(inet_sk(sk)->sport),
2191		ireq->rmt_addr,
2192		ntohs(ireq->rmt_port),
2193		TCP_SYN_RECV,
2194		0, 0, /* could print option size, but that is af dependent. */
2195		1,    /* timers active (only the expire timer) */
2196		jiffies_to_clock_t(ttd),
2197		req->retrans,
2198		uid,
2199		0,  /* non standard timer */
2200		0, /* open_requests have no inode */
2201		atomic_read(&sk->sk_refcnt),
2202		req,
2203		len);
2204}
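/*
 * Illustrative sketch, not part of the original file: the "tm->when"
 * column emitted above is jiffies_to_clock_t(ttd), i.e. the remaining
 * timer time in USER_HZ ticks.  A userspace reader converts it to
 * seconds with sysconf(_SC_CLK_TCK); this is plain userland C, shown
 * here only for illustration, and 'when_ticks' is hypothetical input.
 */
#if 0	/* userspace example only -- not kernel code */
#include <unistd.h>

static double example_when_to_seconds(unsigned long when_ticks)
{
	return (double)when_ticks / sysconf(_SC_CLK_TCK);
}
#endif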
2205
2206static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2207{
2208	int timer_active;
2209	unsigned long timer_expires;
2210	struct tcp_sock *tp = tcp_sk(sk);
2211	const struct inet_connection_sock *icsk = inet_csk(sk);
2212	struct inet_sock *inet = inet_sk(sk);
2213	__be32 dest = inet->daddr;
2214	__be32 src = inet->rcv_saddr;
2215	__u16 destp = ntohs(inet->dport);
2216	__u16 srcp = ntohs(inet->sport);
2217
2218	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2219		timer_active	= 1;
2220		timer_expires	= icsk->icsk_timeout;
2221	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2222		timer_active	= 4;
2223		timer_expires	= icsk->icsk_timeout;
2224	} else if (timer_pending(&sk->sk_timer)) {
2225		timer_active	= 2;
2226		timer_expires	= sk->sk_timer.expires;
2227	} else {
2228		timer_active	= 0;
2229		timer_expires = jiffies;
2230	}
2231
2232	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2233			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2234		i, src, srcp, dest, destp, sk->sk_state,
2235		tp->write_seq - tp->snd_una,
2236		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2237					     (tp->rcv_nxt - tp->copied_seq),
2238		timer_active,
2239		jiffies_to_clock_t(timer_expires - jiffies),
2240		icsk->icsk_retransmits,
2241		sock_i_uid(sk),
2242		icsk->icsk_probes_out,
2243		sock_i_ino(sk),
2244		atomic_read(&sk->sk_refcnt), sk,
2245		jiffies_to_clock_t(icsk->icsk_rto),
2246		jiffies_to_clock_t(icsk->icsk_ack.ato),
2247		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2248		tp->snd_cwnd,
2249		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2250		len);
2251}
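/*
 * Illustrative sketch, not part of the original file: a minimal userland
 * reader for the rows produced by get_tcp4_sock() and friends.  The
 * address field is the raw __be32 printed with %08X, so its hex digits
 * follow the host byte order (on little-endian x86, 127.0.0.1 appears as
 * "0100007F"); the port was converted with ntohs() and is plain hex.
 * Plain userspace C, shown only for illustration.
 */
#if 0	/* userspace example only -- not kernel code */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	FILE *f = fopen("/proc/net/tcp", "r");
	char line[256];

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header row */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, st;

		if (sscanf(line, " %*d: %8X:%4X %*8X:%*4X %2X",
			   &laddr, &lport, &st) == 3) {
			struct in_addr in = { .s_addr = laddr };

			printf("%s:%u state %02X\n", inet_ntoa(in), lport, st);
		}
	}
	fclose(f);
	return 0;
}
#endif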
2252
2253static void get_timewait4_sock(struct inet_timewait_sock *tw,
2254			       struct seq_file *f, int i, int *len)
2255{
2256	__be32 dest, src;
2257	__u16 destp, srcp;
2258	int ttd = tw->tw_ttd - jiffies;
2259
2260	if (ttd < 0)
2261		ttd = 0;
2262
2263	dest  = tw->tw_daddr;
2264	src   = tw->tw_rcv_saddr;
2265	destp = ntohs(tw->tw_dport);
2266	srcp  = ntohs(tw->tw_sport);
2267
2268	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2269		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2270		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2271		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2272		atomic_read(&tw->tw_refcnt), tw, len);
2273}
2274
2275#define TMPSZ 150
2276
2277static int tcp4_seq_show(struct seq_file *seq, void *v)
2278{
2279	struct tcp_iter_state *st;
2280	int len;
2281
2282	if (v == SEQ_START_TOKEN) {
2283		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2284			   "  sl  local_address rem_address   st tx_queue "
2285			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2286			   "inode");
2287		goto out;
2288	}
2289	st = seq->private;
2290
2291	switch (st->state) {
2292	case TCP_SEQ_STATE_LISTENING:
2293	case TCP_SEQ_STATE_ESTABLISHED:
2294		get_tcp4_sock(v, seq, st->num, &len);
2295		break;
2296	case TCP_SEQ_STATE_OPENREQ:
2297		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2298		break;
2299	case TCP_SEQ_STATE_TIME_WAIT:
2300		get_timewait4_sock(v, seq, st->num, &len);
2301		break;
2302	}
2303	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2304out:
2305	return 0;
2306}
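/*
 * Illustrative sketch, not part of the original file: the fixed-width
 * trick used above.  Each row printer reports how many characters it
 * wrote through the "%n" conversion (still honoured by the kernel's
 * vsnprintf at this revision), and tcp4_seq_show() pads the row with
 * spaces up to TMPSZ - 1 so every /proc/net/tcp line has equal length.
 * The row content below is hypothetical.
 */
#if 0	/* example only -- not compiled */
static void example_fixed_width_row(struct seq_file *seq, int value)
{
	int len;

	seq_printf(seq, "%4d: example%n", value, &len);
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
}
#endif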
2307
2308static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2309	.name		= "tcp",
2310	.family		= AF_INET,
2311	.seq_fops	= {
2312		.owner		= THIS_MODULE,
2313	},
2314	.seq_ops	= {
2315		.show		= tcp4_seq_show,
2316	},
2317};
2318
2319static int tcp4_proc_init_net(struct net *net)
2320{
2321	return tcp_proc_register(net, &tcp4_seq_afinfo);
2322}
2323
2324static void tcp4_proc_exit_net(struct net *net)
2325{
2326	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2327}
2328
2329static struct pernet_operations tcp4_net_ops = {
2330	.init = tcp4_proc_init_net,
2331	.exit = tcp4_proc_exit_net,
2332};
2333
2334int __init tcp4_proc_init(void)
2335{
2336	return register_pernet_subsys(&tcp4_net_ops);
2337}
2338
2339void tcp4_proc_exit(void)
2340{
2341	unregister_pernet_subsys(&tcp4_net_ops);
2342}
2343#endif /* CONFIG_PROC_FS */
2344
2345struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2346{
2347	struct iphdr *iph = skb_gro_network_header(skb);
2348
2349	switch (skb->ip_summed) {
2350	case CHECKSUM_COMPLETE:
		/* The device summed the whole segment; folding in the
		 * pseudo-header must yield zero if the checksum is good. */
2351		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2352				  skb->csum)) {
2353			skb->ip_summed = CHECKSUM_UNNECESSARY;
2354			break;
2355		}
2356
2357		/* fall through */
2358	case CHECKSUM_NONE:
		/* No usable hardware checksum: skip aggregation for this skb. */
2359		NAPI_GRO_CB(skb)->flush = 1;
2360		return NULL;
2361	}
2362
2363	return tcp_gro_receive(head, skb);
2364}
2365EXPORT_SYMBOL(tcp4_gro_receive);
2366
2367int tcp4_gro_complete(struct sk_buff *skb)
2368{
2369	struct iphdr *iph = ip_hdr(skb);
2370	struct tcphdr *th = tcp_hdr(skb);
2371
	/* Seed th->check with the complemented pseudo-header sum so the
	 * merged packet can later be (re)segmented with CHECKSUM_PARTIAL
	 * and the per-segment checksums completed in hardware or software. */
2372	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2373				  iph->saddr, iph->daddr, 0);
2374	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2375
2376	return tcp_gro_complete(skb);
2377}
2378EXPORT_SYMBOL(tcp4_gro_complete);
2379
2380struct proto tcp_prot = {
2381	.name			= "TCP",
2382	.owner			= THIS_MODULE,
2383	.close			= tcp_close,
2384	.connect		= tcp_v4_connect,
2385	.disconnect		= tcp_disconnect,
2386	.accept			= inet_csk_accept,
2387	.ioctl			= tcp_ioctl,
2388	.init			= tcp_v4_init_sock,
2389	.destroy		= tcp_v4_destroy_sock,
2390	.shutdown		= tcp_shutdown,
2391	.setsockopt		= tcp_setsockopt,
2392	.getsockopt		= tcp_getsockopt,
2393	.recvmsg		= tcp_recvmsg,
2394	.backlog_rcv		= tcp_v4_do_rcv,
2395	.hash			= inet_hash,
2396	.unhash			= inet_unhash,
2397	.get_port		= inet_csk_get_port,
2398	.enter_memory_pressure	= tcp_enter_memory_pressure,
2399	.sockets_allocated	= &tcp_sockets_allocated,
2400	.orphan_count		= &tcp_orphan_count,
2401	.memory_allocated	= &tcp_memory_allocated,
2402	.memory_pressure	= &tcp_memory_pressure,
2403	.sysctl_mem		= sysctl_tcp_mem,
2404	.sysctl_wmem		= sysctl_tcp_wmem,
2405	.sysctl_rmem		= sysctl_tcp_rmem,
2406	.max_header		= MAX_TCP_HEADER,
2407	.obj_size		= sizeof(struct tcp_sock),
2408	.slab_flags		= SLAB_DESTROY_BY_RCU,
2409	.twsk_prot		= &tcp_timewait_sock_ops,
2410	.rsk_prot		= &tcp_request_sock_ops,
2411	.h.hashinfo		= &tcp_hashinfo,
2412#ifdef CONFIG_COMPAT
2413	.compat_setsockopt	= compat_tcp_setsockopt,
2414	.compat_getsockopt	= compat_tcp_getsockopt,
2415#endif
2416};
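/*
 * Illustrative sketch, not part of the original file: tcp_prot is not
 * registered here; net/ipv4/af_inet.c hands it to the socket layer
 * roughly as below.  proto_register() creates the "TCP" slab cache
 * (->obj_size bytes per socket) plus the timewait and request_sock
 * caches described by ->twsk_prot and ->rsk_prot.  The function name
 * is hypothetical.
 */
#if 0	/* example only -- not compiled */
static int __init example_register_tcp_proto(void)
{
	return proto_register(&tcp_prot, 1);
}
#endif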
2417
2418
2419static int __net_init tcp_sk_init(struct net *net)
2420{
2421	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2422				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2423}
2424
2425static void __net_exit tcp_sk_exit(struct net *net)
2426{
2427	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2428	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2429}
2430
2431static struct pernet_operations __net_initdata tcp_sk_ops = {
2432	.init = tcp_sk_init,
2433	.exit = tcp_sk_exit,
2434};
2435
2436void __init tcp_v4_init(void)
2437{
2438	inet_hashinfo_init(&tcp_hashinfo);
2439	if (register_pernet_subsys(&tcp_sk_ops))
2440		panic("Failed to create the TCP control socket.\n");
2441}
2442
2443EXPORT_SYMBOL(ipv4_specific);
2444EXPORT_SYMBOL(tcp_hashinfo);
2445EXPORT_SYMBOL(tcp_prot);
2446EXPORT_SYMBOL(tcp_v4_conn_request);
2447EXPORT_SYMBOL(tcp_v4_connect);
2448EXPORT_SYMBOL(tcp_v4_do_rcv);
2449EXPORT_SYMBOL(tcp_v4_remember_stamp);
2450EXPORT_SYMBOL(tcp_v4_send_check);
2451EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2452
2453#ifdef CONFIG_PROC_FS
2454EXPORT_SYMBOL(tcp_proc_register);
2455EXPORT_SYMBOL(tcp_proc_unregister);
2456#endif
2457EXPORT_SYMBOL(sysctl_tcp_low_latency);
2458
2459