tcp_ipv4.c revision 3b401a81c0d50ea9c718cf837f62cc2e6e79cc30
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45 *					year-long coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63
64#include <net/net_namespace.h>
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/transp_v6.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/timewait_sock.h>
72#include <net/xfrm.h>
73#include <net/netdma.h>
74
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
81#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
84int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly;
86
87
88#ifdef CONFIG_TCP_MD5SIG
89static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90						   __be32 addr);
91static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92			       __be32 daddr, __be32 saddr, struct tcphdr *th);
93#else
94static inline
95struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96{
97	return NULL;
98}
99#endif
100
101struct inet_hashinfo tcp_hashinfo;
102
103static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104{
105	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106					  ip_hdr(skb)->saddr,
107					  tcp_hdr(skb)->dest,
108					  tcp_hdr(skb)->source);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
113	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114	struct tcp_sock *tp = tcp_sk(sk);
115
116	/* With PAWS, it is safe from the viewpoint
117	   of data integrity. Even without PAWS it is safe provided the sequence
118	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
119
120	   Actually, the idea is close to VJ's: the only difference is that the
121	   timestamp cache is held per port pair rather than per host, and the
122	   TW bucket is used as the state holder.
123
124	   If the TW bucket has already been destroyed we fall back to VJ's scheme
125	   and use the initial timestamp retrieved from the peer table.
126	 */
127	if (tcptw->tw_ts_recent_stamp &&
128	    (twp == NULL || (sysctl_tcp_tw_reuse &&
129			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
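		/* Step the new initial sequence number well past the old
		 * connection's, so that the two sequence spaces cannot overlap.
		 */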
130		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131		if (tp->write_seq == 0)
132			tp->write_seq = 1;
133		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135		sock_hold(sktw);
136		return 1;
137	}
138
139	return 0;
140}
141
142EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143
144/* This will initiate an outgoing connection. */
145int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146{
147	struct inet_sock *inet = inet_sk(sk);
148	struct tcp_sock *tp = tcp_sk(sk);
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct rtable *rt;
151	__be32 daddr, nexthop;
152	int tmp;
153	int err;
154
155	if (addr_len < sizeof(struct sockaddr_in))
156		return -EINVAL;
157
158	if (usin->sin_family != AF_INET)
159		return -EAFNOSUPPORT;
160
161	nexthop = daddr = usin->sin_addr.s_addr;
162	if (inet->opt && inet->opt->srr) {
163		if (!daddr)
164			return -EINVAL;
165		nexthop = inet->opt->faddr;
166	}
167
168	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
169			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170			       IPPROTO_TCP,
171			       inet->sport, usin->sin_port, sk, 1);
172	if (tmp < 0) {
173		if (tmp == -ENETUNREACH)
174			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175		return tmp;
176	}
177
178	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179		ip_rt_put(rt);
180		return -ENETUNREACH;
181	}
182
183	if (!inet->opt || !inet->opt->srr)
184		daddr = rt->rt_dst;
185
186	if (!inet->saddr)
187		inet->saddr = rt->rt_src;
188	inet->rcv_saddr = inet->saddr;
189
190	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
191		/* Reset inherited state */
192		tp->rx_opt.ts_recent	   = 0;
193		tp->rx_opt.ts_recent_stamp = 0;
194		tp->write_seq		   = 0;
195	}
196
197	if (tcp_death_row.sysctl_tw_recycle &&
198	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199		struct inet_peer *peer = rt_get_peer(rt);
200		/*
201		 * VJ's idea. We save the last timestamp seen from
202		 * the destination in the peer table when entering
203		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
204		 * when trying a new connection.
205		 */
206		if (peer != NULL &&
207		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
208			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209			tp->rx_opt.ts_recent = peer->tcp_ts;
210		}
211	}
212
213	inet->dport = usin->sin_port;
214	inet->daddr = daddr;
215
216	inet_csk(sk)->icsk_ext_hdr_len = 0;
217	if (inet->opt)
218		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220	tp->rx_opt.mss_clamp = 536;
221
222	/* Socket identity is still unknown (sport may be zero).
223	 * However we set state to SYN-SENT and, without releasing the socket
224	 * lock, select a source port, enter ourselves into the hash tables and
225	 * complete initialization after this.
226	 */
227	tcp_set_state(sk, TCP_SYN_SENT);
228	err = inet_hash_connect(&tcp_death_row, sk);
229	if (err)
230		goto failure;
231
232	err = ip_route_newports(&rt, IPPROTO_TCP,
233				inet->sport, inet->dport, sk);
234	if (err)
235		goto failure;
236
237	/* OK, now commit destination to socket.  */
238	sk->sk_gso_type = SKB_GSO_TCPV4;
239	sk_setup_caps(sk, &rt->u.dst);
240
241	if (!tp->write_seq)
242		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
243							   inet->daddr,
244							   inet->sport,
245							   usin->sin_port);
246
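	/* Seed the IP ID counter from the initial sequence number and jiffies. */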
247	inet->id = tp->write_seq ^ jiffies;
248
249	err = tcp_connect(sk);
250	rt = NULL;
251	if (err)
252		goto failure;
253
254	return 0;
255
256failure:
257	/*
258	 * This unhashes the socket and releases the local port,
259	 * if necessary.
260	 */
261	tcp_set_state(sk, TCP_CLOSE);
262	ip_rt_put(rt);
263	sk->sk_route_caps = 0;
264	inet->dport = 0;
265	return err;
266}
267
268/*
269 * This routine does path mtu discovery as defined in RFC1191.
270 */
271static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272{
273	struct dst_entry *dst;
274	struct inet_sock *inet = inet_sk(sk);
275
276	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277	 * sent out by Linux are always < 576 bytes, so they should go through
278	 * unfragmented).
279	 */
280	if (sk->sk_state == TCP_LISTEN)
281		return;
282
283	/* We don't check in the dst entry whether pmtu discovery is forbidden
284	 * on this route. We just assume that no packet-too-big packets
285	 * are sent back when pmtu discovery is not active.
286	 * There is a small race when the user changes this flag in the
287	 * route, but I think that's acceptable.
288	 */
289	if ((dst = __sk_dst_check(sk, 0)) == NULL)
290		return;
291
292	dst->ops->update_pmtu(dst, mtu);
293
294	/* Something is about to go wrong... Remember the soft error
295	 * in case this connection is not able to recover.
296	 */
297	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298		sk->sk_err_soft = EMSGSIZE;
299
300	mtu = dst_mtu(dst);
301
302	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304		tcp_sync_mss(sk, mtu);
305
306		/* Resend the TCP packet because it's
307		 * clear that the old packet has been
308		 * dropped. This is the new "fast" path mtu
309		 * discovery.
310		 */
311		tcp_simple_retransmit(sk);
312	} /* else let the usual retransmit timer handle it */
313}
314
315/*
316 * This routine is called by the ICMP module when it gets some
317 * sort of error condition.  If err < 0 then the socket should
318 * be closed and the error returned to the user.  If err > 0
319 * it's just the icmp type << 8 | icmp code.  After adjustment
320 * header points to the first 8 bytes of the tcp header.  We need
321 * to find the appropriate port.
322 *
323 * The locking strategy used here is very "optimistic". When
324 * someone else accesses the socket the ICMP is just dropped
325 * and for some paths there is no check at all.
326 * A more general error queue to queue errors for later handling
327 * is probably better.
328 *
329 */
330
331void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332{
333	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335	struct inet_connection_sock *icsk;
336	struct tcp_sock *tp;
337	struct inet_sock *inet;
338	const int type = icmp_hdr(icmp_skb)->type;
339	const int code = icmp_hdr(icmp_skb)->code;
340	struct sock *sk;
341	struct sk_buff *skb;
342	__u32 seq;
343	__u32 remaining;
344	int err;
345	struct net *net = dev_net(icmp_skb->dev);
346
347	if (icmp_skb->len < (iph->ihl << 2) + 8) {
348		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349		return;
350	}
351
352	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353			iph->saddr, th->source, inet_iif(icmp_skb));
354	if (!sk) {
355		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356		return;
357	}
358	if (sk->sk_state == TCP_TIME_WAIT) {
359		inet_twsk_put(inet_twsk(sk));
360		return;
361	}
362
363	bh_lock_sock(sk);
364	/* If too many ICMPs get dropped on busy
365	 * servers this needs to be solved differently.
366	 */
367	if (sock_owned_by_user(sk))
368		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370	if (sk->sk_state == TCP_CLOSE)
371		goto out;
372
373	icsk = inet_csk(sk);
374	tp = tcp_sk(sk);
375	seq = ntohl(th->seq);
376	if (sk->sk_state != TCP_LISTEN &&
377	    !between(seq, tp->snd_una, tp->snd_nxt)) {
378		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379		goto out;
380	}
381
382	switch (type) {
383	case ICMP_SOURCE_QUENCH:
384		/* Just silently ignore these. */
385		goto out;
386	case ICMP_PARAMETERPROB:
387		err = EPROTO;
388		break;
389	case ICMP_DEST_UNREACH:
390		if (code > NR_ICMP_UNREACH)
391			goto out;
392
393		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394			if (!sock_owned_by_user(sk))
395				do_pmtu_discovery(sk, iph, info);
396			goto out;
397		}
398
399		err = icmp_err_convert[code].errno;
400		/* check if icmp_skb allows revert of backoff
401		 * (see draft-zimmermann-tcp-lcd) */
402		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
403			break;
404		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
405		    !icsk->icsk_backoff)
406			break;
407
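		/* Undo one step of exponential backoff and recompute the RTO. */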
408		icsk->icsk_backoff--;
409		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
410					 icsk->icsk_backoff;
411		tcp_bound_rto(sk);
412
413		skb = tcp_write_queue_head(sk);
414		BUG_ON(!skb);
415
416		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
417				tcp_time_stamp - TCP_SKB_CB(skb)->when);
418
419		if (remaining) {
420			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
421						  remaining, TCP_RTO_MAX);
422		} else if (sock_owned_by_user(sk)) {
423			/* RTO revert clocked out retransmission,
424			 * but socket is locked. Will defer. */
425			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426						  HZ/20, TCP_RTO_MAX);
427		} else {
428			/* RTO revert clocked out retransmission.
429			 * Will retransmit now */
430			tcp_retransmit_timer(sk);
431		}
432
433		break;
434	case ICMP_TIME_EXCEEDED:
435		err = EHOSTUNREACH;
436		break;
437	default:
438		goto out;
439	}
440
441	switch (sk->sk_state) {
442		struct request_sock *req, **prev;
443	case TCP_LISTEN:
444		if (sock_owned_by_user(sk))
445			goto out;
446
447		req = inet_csk_search_req(sk, &prev, th->dest,
448					  iph->daddr, iph->saddr);
449		if (!req)
450			goto out;
451
452		/* ICMPs are not backlogged, hence we cannot get
453		   an established socket here.
454		 */
455		WARN_ON(req->sk);
456
457		if (seq != tcp_rsk(req)->snt_isn) {
458			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
459			goto out;
460		}
461
462		/*
463		 * Still in SYN_RECV, just remove it silently.
464		 * There is no good way to pass the error to the newly
465		 * created socket, and POSIX does not want network
466		 * errors returned from accept().
467		 */
468		inet_csk_reqsk_queue_drop(sk, req, prev);
469		goto out;
470
471	case TCP_SYN_SENT:
472	case TCP_SYN_RECV:  /* Cannot normally happen;
473			       it can, f.e., if SYNs crossed.
474			     */
475		if (!sock_owned_by_user(sk)) {
476			sk->sk_err = err;
477
478			sk->sk_error_report(sk);
479
480			tcp_done(sk);
481		} else {
482			sk->sk_err_soft = err;
483		}
484		goto out;
485	}
486
487	/* If we've already connected we will keep trying
488	 * until we time out, or the user gives up.
489	 *
490	 * rfc1122 4.2.3.9 allows us to consider as hard errors
491	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
492	 * but it is obsoleted by pmtu discovery).
493	 *
494	 * Note that in the modern internet, where routing is unreliable
495	 * and broken firewalls sit in every dark corner sending random
496	 * errors as ordered by their masters, even these two messages finally
497	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
498	 *
499	 * Now we are in compliance with RFCs.
500	 *							--ANK (980905)
501	 */
502
503	inet = inet_sk(sk);
504	if (!sock_owned_by_user(sk) && inet->recverr) {
505		sk->sk_err = err;
506		sk->sk_error_report(sk);
507	} else	{ /* Only an error on timeout */
508		sk->sk_err_soft = err;
509	}
510
511out:
512	bh_unlock_sock(sk);
513	sock_put(sk);
514}
515
516/* This routine computes an IPv4 TCP checksum. */
517void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
518{
519	struct inet_sock *inet = inet_sk(sk);
520	struct tcphdr *th = tcp_hdr(skb);
521
522	if (skb->ip_summed == CHECKSUM_PARTIAL) {
523		th->check = ~tcp_v4_check(len, inet->saddr,
524					  inet->daddr, 0);
525		skb->csum_start = skb_transport_header(skb) - skb->head;
526		skb->csum_offset = offsetof(struct tcphdr, check);
527	} else {
528		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
529					 csum_partial(th,
530						      th->doff << 2,
531						      skb->csum));
532	}
533}
534
535int tcp_v4_gso_send_check(struct sk_buff *skb)
536{
537	const struct iphdr *iph;
538	struct tcphdr *th;
539
540	if (!pskb_may_pull(skb, sizeof(*th)))
541		return -EINVAL;
542
543	iph = ip_hdr(skb);
544	th = tcp_hdr(skb);
545
546	th->check = 0;
547	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
548	skb->csum_start = skb_transport_header(skb) - skb->head;
549	skb->csum_offset = offsetof(struct tcphdr, check);
550	skb->ip_summed = CHECKSUM_PARTIAL;
551	return 0;
552}
553
554/*
555 *	This routine will send an RST to the other tcp.
556 *
557 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
558 *		      for the reset?
559 *	Answer: if a packet caused the RST, it is not for a socket
560 *		existing in our system; if it is matched to a socket,
561 *		it is just a duplicate segment or a bug in the other side's TCP.
562 *		So we build the reply based only on the parameters
563 *		that arrived with the segment.
564 *	Exception: precedence violation. We do not implement it in any case.
565 */
566
567static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
568{
569	struct tcphdr *th = tcp_hdr(skb);
570	struct {
571		struct tcphdr th;
572#ifdef CONFIG_TCP_MD5SIG
573		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
574#endif
575	} rep;
576	struct ip_reply_arg arg;
577#ifdef CONFIG_TCP_MD5SIG
578	struct tcp_md5sig_key *key;
579#endif
580	struct net *net;
581
582	/* Never send a reset in response to a reset. */
583	if (th->rst)
584		return;
585
586	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
587		return;
588
589	/* Swap the send and the receive. */
590	memset(&rep, 0, sizeof(rep));
591	rep.th.dest   = th->source;
592	rep.th.source = th->dest;
593	rep.th.doff   = sizeof(struct tcphdr) / 4;
594	rep.th.rst    = 1;
595
596	if (th->ack) {
597		rep.th.seq = th->ack_seq;
598	} else {
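		/* No ACK in the offending segment: acknowledge everything it
		 * carried (SYN and FIN each count as one sequence number).
		 */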
599		rep.th.ack = 1;
600		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
601				       skb->len - (th->doff << 2));
602	}
603
604	memset(&arg, 0, sizeof(arg));
605	arg.iov[0].iov_base = (unsigned char *)&rep;
606	arg.iov[0].iov_len  = sizeof(rep.th);
607
608#ifdef CONFIG_TCP_MD5SIG
609	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
610	if (key) {
611		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
612				   (TCPOPT_NOP << 16) |
613				   (TCPOPT_MD5SIG << 8) |
614				   TCPOLEN_MD5SIG);
615		/* Update length and the length the header thinks exists */
616		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
617		rep.th.doff = arg.iov[0].iov_len / 4;
618
619		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
620				     key, ip_hdr(skb)->saddr,
621				     ip_hdr(skb)->daddr, &rep.th);
622	}
623#endif
624	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
625				      ip_hdr(skb)->saddr, /* XXX */
626				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
627	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
628	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
629
630	net = dev_net(skb_dst(skb)->dev);
631	ip_send_reply(net->ipv4.tcp_sock, skb,
632		      &arg, arg.iov[0].iov_len);
633
634	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
635	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
636}
637
638/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
639   outside of socket context, is certainly ugly. What can I do?
640 */
641
642static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
643			    u32 win, u32 ts, int oif,
644			    struct tcp_md5sig_key *key,
645			    int reply_flags)
646{
647	struct tcphdr *th = tcp_hdr(skb);
648	struct {
649		struct tcphdr th;
650		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
651#ifdef CONFIG_TCP_MD5SIG
652			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
653#endif
654			];
655	} rep;
656	struct ip_reply_arg arg;
657	struct net *net = dev_net(skb_dst(skb)->dev);
658
659	memset(&rep.th, 0, sizeof(struct tcphdr));
660	memset(&arg, 0, sizeof(arg));
661
662	arg.iov[0].iov_base = (unsigned char *)&rep;
663	arg.iov[0].iov_len  = sizeof(rep.th);
664	if (ts) {
665		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
666				   (TCPOPT_TIMESTAMP << 8) |
667				   TCPOLEN_TIMESTAMP);
668		rep.opt[1] = htonl(tcp_time_stamp);
669		rep.opt[2] = htonl(ts);
670		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
671	}
672
673	/* Swap the send and the receive. */
674	rep.th.dest    = th->source;
675	rep.th.source  = th->dest;
676	rep.th.doff    = arg.iov[0].iov_len / 4;
677	rep.th.seq     = htonl(seq);
678	rep.th.ack_seq = htonl(ack);
679	rep.th.ack     = 1;
680	rep.th.window  = htons(win);
681
682#ifdef CONFIG_TCP_MD5SIG
683	if (key) {
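		/* When timestamps are in use, the MD5 option is placed after
		 * the three 32-bit words of the timestamp option.
		 */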
684		int offset = (ts) ? 3 : 0;
685
686		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
687					  (TCPOPT_NOP << 16) |
688					  (TCPOPT_MD5SIG << 8) |
689					  TCPOLEN_MD5SIG);
690		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
691		rep.th.doff = arg.iov[0].iov_len/4;
692
693		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
694				    key, ip_hdr(skb)->saddr,
695				    ip_hdr(skb)->daddr, &rep.th);
696	}
697#endif
698	arg.flags = reply_flags;
699	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
700				      ip_hdr(skb)->saddr, /* XXX */
701				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
702	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703	if (oif)
704		arg.bound_dev_if = oif;
705
706	ip_send_reply(net->ipv4.tcp_sock, skb,
707		      &arg, arg.iov[0].iov_len);
708
709	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
710}
711
712static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
713{
714	struct inet_timewait_sock *tw = inet_twsk(sk);
715	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
716
717	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
718			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
719			tcptw->tw_ts_recent,
720			tw->tw_bound_dev_if,
721			tcp_twsk_md5_key(tcptw),
722			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
723			);
724
725	inet_twsk_put(tw);
726}
727
728static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
729				  struct request_sock *req)
730{
731	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
732			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
733			req->ts_recent,
734			0,
735			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
736			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
737}
738
739/*
740 *	Send a SYN-ACK after having received a SYN.
741 *	This still operates on a request_sock only, not on a big
742 *	socket.
743 */
744static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
745				struct dst_entry *dst)
746{
747	const struct inet_request_sock *ireq = inet_rsk(req);
748	int err = -1;
749	struct sk_buff * skb;
750
751	/* First, grab a route. */
752	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
753		return -1;
754
755	skb = tcp_make_synack(sk, dst, req);
756
757	if (skb) {
758		struct tcphdr *th = tcp_hdr(skb);
759
760		th->check = tcp_v4_check(skb->len,
761					 ireq->loc_addr,
762					 ireq->rmt_addr,
763					 csum_partial(th, skb->len,
764						      skb->csum));
765
766		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
767					    ireq->rmt_addr,
768					    ireq->opt);
769		err = net_xmit_eval(err);
770	}
771
772	dst_release(dst);
773	return err;
774}
775
776static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
777{
778	return __tcp_v4_send_synack(sk, req, NULL);
779}
780
781/*
782 *	IPv4 request_sock destructor.
783 */
784static void tcp_v4_reqsk_destructor(struct request_sock *req)
785{
786	kfree(inet_rsk(req)->opt);
787}
788
789#ifdef CONFIG_SYN_COOKIES
790static void syn_flood_warning(struct sk_buff *skb)
791{
792	static unsigned long warntime;
793
794	if (time_after(jiffies, (warntime + HZ * 60))) {
795		warntime = jiffies;
796		printk(KERN_INFO
797		       "possible SYN flooding on port %d. Sending cookies.\n",
798		       ntohs(tcp_hdr(skb)->dest));
799	}
800}
801#endif
802
803/*
804 * Save and compile IPv4 options into the request_sock if needed.
805 */
806static struct ip_options *tcp_v4_save_options(struct sock *sk,
807					      struct sk_buff *skb)
808{
809	struct ip_options *opt = &(IPCB(skb)->opt);
810	struct ip_options *dopt = NULL;
811
812	if (opt && opt->optlen) {
813		int opt_size = optlength(opt);
814		dopt = kmalloc(opt_size, GFP_ATOMIC);
815		if (dopt) {
816			if (ip_options_echo(dopt, skb)) {
817				kfree(dopt);
818				dopt = NULL;
819			}
820		}
821	}
822	return dopt;
823}
824
825#ifdef CONFIG_TCP_MD5SIG
826/*
827 * RFC2385 MD5 checksumming requires a mapping of
828 * IP address->MD5 Key.
829 * We need to maintain these in the sk structure.
830 */
831
832/* Find the Key structure for an address.  */
833static struct tcp_md5sig_key *
834			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
835{
836	struct tcp_sock *tp = tcp_sk(sk);
837	int i;
838
839	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
840		return NULL;
841	for (i = 0; i < tp->md5sig_info->entries4; i++) {
842		if (tp->md5sig_info->keys4[i].addr == addr)
843			return &tp->md5sig_info->keys4[i].base;
844	}
845	return NULL;
846}
847
848struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
849					 struct sock *addr_sk)
850{
851	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
852}
853
854EXPORT_SYMBOL(tcp_v4_md5_lookup);
855
856static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
857						      struct request_sock *req)
858{
859	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
860}
861
862/* This can be called on a newly created socket, from other files */
863int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
864		      u8 *newkey, u8 newkeylen)
865{
866	/* Add Key to the list */
867	struct tcp_md5sig_key *key;
868	struct tcp_sock *tp = tcp_sk(sk);
869	struct tcp4_md5sig_key *keys;
870
871	key = tcp_v4_md5_do_lookup(sk, addr);
872	if (key) {
873		/* Pre-existing entry - just update that one. */
874		kfree(key->key);
875		key->key = newkey;
876		key->keylen = newkeylen;
877	} else {
878		struct tcp_md5sig_info *md5sig;
879
880		if (!tp->md5sig_info) {
881			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
882						  GFP_ATOMIC);
883			if (!tp->md5sig_info) {
884				kfree(newkey);
885				return -ENOMEM;
886			}
887			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
888		}
889		if (tcp_alloc_md5sig_pool() == NULL) {
890			kfree(newkey);
891			return -ENOMEM;
892		}
893		md5sig = tp->md5sig_info;
894
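		/* The key array is full; grow it by one entry before adding. */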
895		if (md5sig->alloced4 == md5sig->entries4) {
896			keys = kmalloc((sizeof(*keys) *
897					(md5sig->entries4 + 1)), GFP_ATOMIC);
898			if (!keys) {
899				kfree(newkey);
900				tcp_free_md5sig_pool();
901				return -ENOMEM;
902			}
903
904			if (md5sig->entries4)
905				memcpy(keys, md5sig->keys4,
906				       sizeof(*keys) * md5sig->entries4);
907
908			/* Free old key list, and reference new one */
909			kfree(md5sig->keys4);
910			md5sig->keys4 = keys;
911			md5sig->alloced4++;
912		}
913		md5sig->entries4++;
914		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
915		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
916		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
917	}
918	return 0;
919}
920
921EXPORT_SYMBOL(tcp_v4_md5_do_add);
922
923static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
924			       u8 *newkey, u8 newkeylen)
925{
926	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
927				 newkey, newkeylen);
928}
929
930int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
931{
932	struct tcp_sock *tp = tcp_sk(sk);
933	int i;
934
935	for (i = 0; i < tp->md5sig_info->entries4; i++) {
936		if (tp->md5sig_info->keys4[i].addr == addr) {
937			/* Free the key */
938			kfree(tp->md5sig_info->keys4[i].base.key);
939			tp->md5sig_info->entries4--;
940
941			if (tp->md5sig_info->entries4 == 0) {
942				kfree(tp->md5sig_info->keys4);
943				tp->md5sig_info->keys4 = NULL;
944				tp->md5sig_info->alloced4 = 0;
945			} else if (tp->md5sig_info->entries4 != i) {
946				/* Shift the remaining keys down to fill the hole. */
947				memmove(&tp->md5sig_info->keys4[i],
948					&tp->md5sig_info->keys4[i+1],
949					(tp->md5sig_info->entries4 - i) *
950					 sizeof(struct tcp4_md5sig_key));
951			}
952			tcp_free_md5sig_pool();
953			return 0;
954		}
955	}
956	return -ENOENT;
957}
958
959EXPORT_SYMBOL(tcp_v4_md5_do_del);
960
961static void tcp_v4_clear_md5_list(struct sock *sk)
962{
963	struct tcp_sock *tp = tcp_sk(sk);
964
965	/* Free each key, then the key array itself,
966	 * and then drop our hold on the MD5
967	 * crypto pool.
968	 */
969	if (tp->md5sig_info->entries4) {
970		int i;
971		for (i = 0; i < tp->md5sig_info->entries4; i++)
972			kfree(tp->md5sig_info->keys4[i].base.key);
973		tp->md5sig_info->entries4 = 0;
974		tcp_free_md5sig_pool();
975	}
976	if (tp->md5sig_info->keys4) {
977		kfree(tp->md5sig_info->keys4);
978		tp->md5sig_info->keys4 = NULL;
979		tp->md5sig_info->alloced4  = 0;
980	}
981}
982
983static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
984				 int optlen)
985{
986	struct tcp_md5sig cmd;
987	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
988	u8 *newkey;
989
990	if (optlen < sizeof(cmd))
991		return -EINVAL;
992
993	if (copy_from_user(&cmd, optval, sizeof(cmd)))
994		return -EFAULT;
995
996	if (sin->sin_family != AF_INET)
997		return -EINVAL;
998
999	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1000		if (!tcp_sk(sk)->md5sig_info)
1001			return -ENOENT;
1002		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1003	}
1004
1005	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1006		return -EINVAL;
1007
1008	if (!tcp_sk(sk)->md5sig_info) {
1009		struct tcp_sock *tp = tcp_sk(sk);
1010		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1011
1012		if (!p)
1013			return -EINVAL;
1014
1015		tp->md5sig_info = p;
1016		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1017	}
1018
1019	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1020	if (!newkey)
1021		return -ENOMEM;
1022	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1023				 newkey, cmd.tcpm_keylen);
1024}
1025
1026static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1027					__be32 daddr, __be32 saddr, int nbytes)
1028{
1029	struct tcp4_pseudohdr *bp;
1030	struct scatterlist sg;
1031
1032	bp = &hp->md5_blk.ip4;
1033
1034	/*
1035	 * 1. the TCP pseudo-header (in the order: source IP address,
1036	 * destination IP address, zero-padded protocol number, and
1037	 * segment length)
1038	 */
1039	bp->saddr = saddr;
1040	bp->daddr = daddr;
1041	bp->pad = 0;
1042	bp->protocol = IPPROTO_TCP;
1043	bp->len = cpu_to_be16(nbytes);
1044
1045	sg_init_one(&sg, bp, sizeof(*bp));
1046	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1047}
1048
1049static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1050			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1051{
1052	struct tcp_md5sig_pool *hp;
1053	struct hash_desc *desc;
1054
1055	hp = tcp_get_md5sig_pool();
1056	if (!hp)
1057		goto clear_hash_noput;
1058	desc = &hp->md5_desc;
1059
1060	if (crypto_hash_init(desc))
1061		goto clear_hash;
1062	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1063		goto clear_hash;
1064	if (tcp_md5_hash_header(hp, th))
1065		goto clear_hash;
1066	if (tcp_md5_hash_key(hp, key))
1067		goto clear_hash;
1068	if (crypto_hash_final(desc, md5_hash))
1069		goto clear_hash;
1070
1071	tcp_put_md5sig_pool();
1072	return 0;
1073
1074clear_hash:
1075	tcp_put_md5sig_pool();
1076clear_hash_noput:
1077	memset(md5_hash, 0, 16);
1078	return 1;
1079}
1080
1081int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1082			struct sock *sk, struct request_sock *req,
1083			struct sk_buff *skb)
1084{
1085	struct tcp_md5sig_pool *hp;
1086	struct hash_desc *desc;
1087	struct tcphdr *th = tcp_hdr(skb);
1088	__be32 saddr, daddr;
1089
1090	if (sk) {
1091		saddr = inet_sk(sk)->saddr;
1092		daddr = inet_sk(sk)->daddr;
1093	} else if (req) {
1094		saddr = inet_rsk(req)->loc_addr;
1095		daddr = inet_rsk(req)->rmt_addr;
1096	} else {
1097		const struct iphdr *iph = ip_hdr(skb);
1098		saddr = iph->saddr;
1099		daddr = iph->daddr;
1100	}
1101
1102	hp = tcp_get_md5sig_pool();
1103	if (!hp)
1104		goto clear_hash_noput;
1105	desc = &hp->md5_desc;
1106
1107	if (crypto_hash_init(desc))
1108		goto clear_hash;
1109
1110	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1111		goto clear_hash;
1112	if (tcp_md5_hash_header(hp, th))
1113		goto clear_hash;
1114	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1115		goto clear_hash;
1116	if (tcp_md5_hash_key(hp, key))
1117		goto clear_hash;
1118	if (crypto_hash_final(desc, md5_hash))
1119		goto clear_hash;
1120
1121	tcp_put_md5sig_pool();
1122	return 0;
1123
1124clear_hash:
1125	tcp_put_md5sig_pool();
1126clear_hash_noput:
1127	memset(md5_hash, 0, 16);
1128	return 1;
1129}
1130
1131EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132
1133static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1134{
1135	/*
1136	 * This gets called for each TCP segment that arrives
1137	 * so we want to be efficient.
1138	 * We have 3 drop cases:
1139	 * o No MD5 hash and one expected.
1140	 * o MD5 hash and we're not expecting one.
1141	 * o MD5 hash and it's wrong.
1142	 */
1143	__u8 *hash_location = NULL;
1144	struct tcp_md5sig_key *hash_expected;
1145	const struct iphdr *iph = ip_hdr(skb);
1146	struct tcphdr *th = tcp_hdr(skb);
1147	int genhash;
1148	unsigned char newhash[16];
1149
1150	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1151	hash_location = tcp_parse_md5sig_option(th);
1152
1153	/* We've parsed the options - do we have a hash? */
1154	if (!hash_expected && !hash_location)
1155		return 0;
1156
1157	if (hash_expected && !hash_location) {
1158		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159		return 1;
1160	}
1161
1162	if (!hash_expected && hash_location) {
1163		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164		return 1;
1165	}
1166
1167	/* Okay, so we have both hash_expected and hash_location -
1168	 * we need to calculate the MD5 hash and compare.
1169	 */
1170	genhash = tcp_v4_md5_hash_skb(newhash,
1171				      hash_expected,
1172				      NULL, NULL, skb);
1173
1174	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175		if (net_ratelimit()) {
1176			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1177			       &iph->saddr, ntohs(th->source),
1178			       &iph->daddr, ntohs(th->dest),
1179			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1180		}
1181		return 1;
1182	}
1183	return 0;
1184}
1185
1186#endif
1187
1188struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1189	.family		=	PF_INET,
1190	.obj_size	=	sizeof(struct tcp_request_sock),
1191	.rtx_syn_ack	=	tcp_v4_send_synack,
1192	.send_ack	=	tcp_v4_reqsk_send_ack,
1193	.destructor	=	tcp_v4_reqsk_destructor,
1194	.send_reset	=	tcp_v4_send_reset,
1195};
1196
1197#ifdef CONFIG_TCP_MD5SIG
1198static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1199	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1200	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1201};
1202#endif
1203
1204static struct timewait_sock_ops tcp_timewait_sock_ops = {
1205	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1206	.twsk_unique	= tcp_twsk_unique,
1207	.twsk_destructor= tcp_twsk_destructor,
1208};
1209
1210int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1211{
1212	struct inet_request_sock *ireq;
1213	struct tcp_options_received tmp_opt;
1214	struct request_sock *req;
1215	__be32 saddr = ip_hdr(skb)->saddr;
1216	__be32 daddr = ip_hdr(skb)->daddr;
1217	__u32 isn = TCP_SKB_CB(skb)->when;
1218	struct dst_entry *dst = NULL;
1219#ifdef CONFIG_SYN_COOKIES
1220	int want_cookie = 0;
1221#else
1222#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1223#endif
1224
1225	/* Never answer SYNs sent to broadcast or multicast addresses. */
1226	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1227		goto drop;
1228
1229	/* TW buckets are converted to open requests without
1230	 * limitation; they conserve resources and the peer is
1231	 * evidently a real one.
1232	 */
1233	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1234#ifdef CONFIG_SYN_COOKIES
1235		if (sysctl_tcp_syncookies) {
1236			want_cookie = 1;
1237		} else
1238#endif
1239		goto drop;
1240	}
1241
1242	/* Accept backlog is full. If we have already queued enough
1243	 * warm entries in the syn queue, drop the request. It is better than
1244	 * clogging the syn queue with openreqs with exponentially increasing
1245	 * timeouts.
1246	 */
1247	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1248		goto drop;
1249
1250	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1251	if (!req)
1252		goto drop;
1253
1254#ifdef CONFIG_TCP_MD5SIG
1255	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1256#endif
1257
1258	tcp_clear_options(&tmp_opt);
1259	tmp_opt.mss_clamp = 536;
1260	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1261
1262	tcp_parse_options(skb, &tmp_opt, 0);
1263
1264	if (want_cookie && !tmp_opt.saw_tstamp)
1265		tcp_clear_options(&tmp_opt);
1266
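	/* Timestamps are only used on this connection if the peer's SYN carried them. */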
1267	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1268
1269	tcp_openreq_init(req, &tmp_opt, skb);
1270
1271	ireq = inet_rsk(req);
1272	ireq->loc_addr = daddr;
1273	ireq->rmt_addr = saddr;
1274	ireq->no_srccheck = inet_sk(sk)->transparent;
1275	ireq->opt = tcp_v4_save_options(sk, skb);
1276
1277	if (security_inet_conn_request(sk, skb, req))
1278		goto drop_and_free;
1279
1280	if (!want_cookie)
1281		TCP_ECN_create_request(req, tcp_hdr(skb));
1282
1283	if (want_cookie) {
1284#ifdef CONFIG_SYN_COOKIES
1285		syn_flood_warning(skb);
1286		req->cookie_ts = tmp_opt.tstamp_ok;
1287#endif
1288		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1289	} else if (!isn) {
1290		struct inet_peer *peer = NULL;
1291
1292		/* VJ's idea. We save the last timestamp seen
1293		 * from the destination in the peer table when entering
1294		 * TIME-WAIT state, and check against it before
1295		 * accepting a new connection request.
1296		 *
1297		 * If "isn" is not zero, this request hit an alive
1298		 * timewait bucket, so all the necessary checks
1299		 * are made in the function processing the timewait state.
1300		 */
1301		if (tmp_opt.saw_tstamp &&
1302		    tcp_death_row.sysctl_tw_recycle &&
1303		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1304		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1305		    peer->v4daddr == saddr) {
1306			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1307			    (s32)(peer->tcp_ts - req->ts_recent) >
1308							TCP_PAWS_WINDOW) {
1309				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1310				goto drop_and_release;
1311			}
1312		}
1313		/* Kill the following clause, if you dislike this way. */
1314		else if (!sysctl_tcp_syncookies &&
1315			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1316			  (sysctl_max_syn_backlog >> 2)) &&
1317			 (!peer || !peer->tcp_ts_stamp) &&
1318			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1319			/* Without syncookies the last quarter of the
1320			 * backlog is reserved for destinations
1321			 * proven to be alive.
1322			 * It means that we continue to communicate only
1323			 * with destinations already remembered
1324			 * at the moment the synflood started.
1325			 */
1326			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1327				       &saddr, ntohs(tcp_hdr(skb)->source));
1328			goto drop_and_release;
1329		}
1330
1331		isn = tcp_v4_init_sequence(skb);
1332	}
1333	tcp_rsk(req)->snt_isn = isn;
1334
1335	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1336		goto drop_and_free;
1337
1338	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1339	return 0;
1340
1341drop_and_release:
1342	dst_release(dst);
1343drop_and_free:
1344	reqsk_free(req);
1345drop:
1346	return 0;
1347}
1348
1349
1350/*
1351 * The three way handshake has completed - we got a valid synack -
1352 * now create the new socket.
1353 */
1354struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1355				  struct request_sock *req,
1356				  struct dst_entry *dst)
1357{
1358	struct inet_request_sock *ireq;
1359	struct inet_sock *newinet;
1360	struct tcp_sock *newtp;
1361	struct sock *newsk;
1362#ifdef CONFIG_TCP_MD5SIG
1363	struct tcp_md5sig_key *key;
1364#endif
1365
1366	if (sk_acceptq_is_full(sk))
1367		goto exit_overflow;
1368
1369	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1370		goto exit;
1371
1372	newsk = tcp_create_openreq_child(sk, req, skb);
1373	if (!newsk)
1374		goto exit;
1375
1376	newsk->sk_gso_type = SKB_GSO_TCPV4;
1377	sk_setup_caps(newsk, dst);
1378
1379	newtp		      = tcp_sk(newsk);
1380	newinet		      = inet_sk(newsk);
1381	ireq		      = inet_rsk(req);
1382	newinet->daddr	      = ireq->rmt_addr;
1383	newinet->rcv_saddr    = ireq->loc_addr;
1384	newinet->saddr	      = ireq->loc_addr;
1385	newinet->opt	      = ireq->opt;
1386	ireq->opt	      = NULL;
1387	newinet->mc_index     = inet_iif(skb);
1388	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1389	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1390	if (newinet->opt)
1391		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1392	newinet->id = newtp->write_seq ^ jiffies;
1393
1394	tcp_mtup_init(newsk);
1395	tcp_sync_mss(newsk, dst_mtu(dst));
1396	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1397	if (tcp_sk(sk)->rx_opt.user_mss &&
1398	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1399		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1400
1401	tcp_initialize_rcv_mss(newsk);
1402
1403#ifdef CONFIG_TCP_MD5SIG
1404	/* Copy over the MD5 key from the original socket */
1405	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1406		/*
1407		 * We're using one, so create a matching key
1408		 * on the newsk structure. If we fail to get
1409		 * memory, then we end up not copying the key
1410		 * across. Shucks.
1411		 */
1412		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1413		if (newkey != NULL)
1414			tcp_v4_md5_do_add(newsk, newinet->daddr,
1415					  newkey, key->keylen);
1416		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1417	}
1418#endif
1419
1420	__inet_hash_nolisten(newsk);
1421	__inet_inherit_port(sk, newsk);
1422
1423	return newsk;
1424
1425exit_overflow:
1426	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1427exit:
1428	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1429	dst_release(dst);
1430	return NULL;
1431}
1432
1433static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1434{
1435	struct tcphdr *th = tcp_hdr(skb);
1436	const struct iphdr *iph = ip_hdr(skb);
1437	struct sock *nsk;
1438	struct request_sock **prev;
1439	/* Find possible connection requests. */
1440	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1441						       iph->saddr, iph->daddr);
1442	if (req)
1443		return tcp_check_req(sk, skb, req, prev);
1444
1445	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1446			th->source, iph->daddr, th->dest, inet_iif(skb));
1447
1448	if (nsk) {
1449		if (nsk->sk_state != TCP_TIME_WAIT) {
1450			bh_lock_sock(nsk);
1451			return nsk;
1452		}
1453		inet_twsk_put(inet_twsk(nsk));
1454		return NULL;
1455	}
1456
1457#ifdef CONFIG_SYN_COOKIES
1458	if (!th->rst && !th->syn && th->ack)
1459		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1460#endif
1461	return sk;
1462}
1463
1464static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1465{
1466	const struct iphdr *iph = ip_hdr(skb);
1467
1468	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1469		if (!tcp_v4_check(skb->len, iph->saddr,
1470				  iph->daddr, skb->csum)) {
1471			skb->ip_summed = CHECKSUM_UNNECESSARY;
1472			return 0;
1473		}
1474	}
1475
1476	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1477				       skb->len, IPPROTO_TCP, 0);
1478
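	/* Short segments are cheap to verify in full right away; longer
	 * segments have their checksum verified later.
	 */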
1479	if (skb->len <= 76) {
1480		return __skb_checksum_complete(skb);
1481	}
1482	return 0;
1483}
1484
1485
1486/* The socket must have its spinlock held when we get
1487 * here.
1488 *
1489 * We have a potential double-lock case here, so even when
1490 * doing backlog processing we use the BH locking scheme.
1491 * This is because we cannot sleep with the original spinlock
1492 * held.
1493 */
1494int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1495{
1496	struct sock *rsk;
1497#ifdef CONFIG_TCP_MD5SIG
1498	/*
1499	 * We really want to reject the packet as early as possible
1500	 * if:
1501	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1502	 *  o There is an MD5 option and we're not expecting one
1503	 */
1504	if (tcp_v4_inbound_md5_hash(sk, skb))
1505		goto discard;
1506#endif
1507
1508	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1509		TCP_CHECK_TIMER(sk);
1510		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1511			rsk = sk;
1512			goto reset;
1513		}
1514		TCP_CHECK_TIMER(sk);
1515		return 0;
1516	}
1517
1518	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1519		goto csum_err;
1520
1521	if (sk->sk_state == TCP_LISTEN) {
1522		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1523		if (!nsk)
1524			goto discard;
1525
1526		if (nsk != sk) {
1527			if (tcp_child_process(sk, nsk, skb)) {
1528				rsk = nsk;
1529				goto reset;
1530			}
1531			return 0;
1532		}
1533	}
1534
1535	TCP_CHECK_TIMER(sk);
1536	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1537		rsk = sk;
1538		goto reset;
1539	}
1540	TCP_CHECK_TIMER(sk);
1541	return 0;
1542
1543reset:
1544	tcp_v4_send_reset(rsk, skb);
1545discard:
1546	kfree_skb(skb);
1547	/* Be careful here. If this function gets more complicated and
1548	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1549	 * might be destroyed here. This current version compiles correctly,
1550	 * but you have been warned.
1551	 */
1552	return 0;
1553
1554csum_err:
1555	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1556	goto discard;
1557}
1558
1559/*
1560 *	From tcp_input.c
1561 */
1562
1563int tcp_v4_rcv(struct sk_buff *skb)
1564{
1565	const struct iphdr *iph;
1566	struct tcphdr *th;
1567	struct sock *sk;
1568	int ret;
1569	struct net *net = dev_net(skb->dev);
1570
1571	if (skb->pkt_type != PACKET_HOST)
1572		goto discard_it;
1573
1574	/* Count it even if it's bad */
1575	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1576
1577	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1578		goto discard_it;
1579
1580	th = tcp_hdr(skb);
1581
1582	if (th->doff < sizeof(struct tcphdr) / 4)
1583		goto bad_packet;
1584	if (!pskb_may_pull(skb, th->doff * 4))
1585		goto discard_it;
1586
1587	/* An explanation is required here, I think.
1588	 * Packet length and doff are validated by header prediction,
1589	 * provided the case of th->doff==0 is eliminated.
1590	 * So, we defer the checks. */
1591	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1592		goto bad_packet;
1593
1594	th = tcp_hdr(skb);
1595	iph = ip_hdr(skb);
1596	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1597	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1598				    skb->len - th->doff * 4);
1599	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1600	TCP_SKB_CB(skb)->when	 = 0;
1601	TCP_SKB_CB(skb)->flags	 = iph->tos;
1602	TCP_SKB_CB(skb)->sacked	 = 0;
1603
1604	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1605	if (!sk)
1606		goto no_tcp_socket;
1607
1608process:
1609	if (sk->sk_state == TCP_TIME_WAIT)
1610		goto do_time_wait;
1611
1612	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1613		goto discard_and_relse;
1614	nf_reset(skb);
1615
1616	if (sk_filter(sk, skb))
1617		goto discard_and_relse;
1618
1619	skb->dev = NULL;
1620
1621	bh_lock_sock_nested(sk);
1622	ret = 0;
1623	if (!sock_owned_by_user(sk)) {
1624#ifdef CONFIG_NET_DMA
1625		struct tcp_sock *tp = tcp_sk(sk);
1626		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1627			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1628		if (tp->ucopy.dma_chan)
1629			ret = tcp_v4_do_rcv(sk, skb);
1630		else
1631#endif
1632		{
1633			if (!tcp_prequeue(sk, skb))
1634				ret = tcp_v4_do_rcv(sk, skb);
1635		}
1636	} else
1637		sk_add_backlog(sk, skb);
1638	bh_unlock_sock(sk);
1639
1640	sock_put(sk);
1641
1642	return ret;
1643
1644no_tcp_socket:
1645	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1646		goto discard_it;
1647
1648	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1649bad_packet:
1650		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1651	} else {
1652		tcp_v4_send_reset(NULL, skb);
1653	}
1654
1655discard_it:
1656	/* Discard frame. */
1657	kfree_skb(skb);
1658	return 0;
1659
1660discard_and_relse:
1661	sock_put(sk);
1662	goto discard_it;
1663
1664do_time_wait:
1665	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1666		inet_twsk_put(inet_twsk(sk));
1667		goto discard_it;
1668	}
1669
1670	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1671		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1672		inet_twsk_put(inet_twsk(sk));
1673		goto discard_it;
1674	}
1675	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1676	case TCP_TW_SYN: {
1677		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1678							&tcp_hashinfo,
1679							iph->daddr, th->dest,
1680							inet_iif(skb));
1681		if (sk2) {
1682			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1683			inet_twsk_put(inet_twsk(sk));
1684			sk = sk2;
1685			goto process;
1686		}
1687		/* Fall through to ACK */
1688	}
1689	case TCP_TW_ACK:
1690		tcp_v4_timewait_ack(sk, skb);
1691		break;
1692	case TCP_TW_RST:
1693		goto no_tcp_socket;
1694	case TCP_TW_SUCCESS:;
1695	}
1696	goto discard_it;
1697}
1698
1699/* VJ's idea. Save the last timestamp seen from this destination
1700 * and hold it at least for the normal timewait interval, to use for duplicate
1701 * segment detection in subsequent connections before they enter the
1702 * synchronized state.
1703 */
1704
1705int tcp_v4_remember_stamp(struct sock *sk)
1706{
1707	struct inet_sock *inet = inet_sk(sk);
1708	struct tcp_sock *tp = tcp_sk(sk);
1709	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1710	struct inet_peer *peer = NULL;
1711	int release_it = 0;
1712
1713	if (!rt || rt->rt_dst != inet->daddr) {
1714		peer = inet_getpeer(inet->daddr, 1);
1715		release_it = 1;
1716	} else {
1717		if (!rt->peer)
1718			rt_bind_peer(rt, 1);
1719		peer = rt->peer;
1720	}
1721
1722	if (peer) {
1723		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1724		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1725		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1726			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1727			peer->tcp_ts = tp->rx_opt.ts_recent;
1728		}
1729		if (release_it)
1730			inet_putpeer(peer);
1731		return 1;
1732	}
1733
1734	return 0;
1735}
1736
1737int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1738{
1739	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1740
1741	if (peer) {
1742		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1743
1744		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1745		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1746		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1747			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1748			peer->tcp_ts	   = tcptw->tw_ts_recent;
1749		}
1750		inet_putpeer(peer);
1751		return 1;
1752	}
1753
1754	return 0;
1755}
1756
1757const struct inet_connection_sock_af_ops ipv4_specific = {
1758	.queue_xmit	   = ip_queue_xmit,
1759	.send_check	   = tcp_v4_send_check,
1760	.rebuild_header	   = inet_sk_rebuild_header,
1761	.conn_request	   = tcp_v4_conn_request,
1762	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1763	.remember_stamp	   = tcp_v4_remember_stamp,
1764	.net_header_len	   = sizeof(struct iphdr),
1765	.setsockopt	   = ip_setsockopt,
1766	.getsockopt	   = ip_getsockopt,
1767	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1768	.sockaddr_len	   = sizeof(struct sockaddr_in),
1769	.bind_conflict	   = inet_csk_bind_conflict,
1770#ifdef CONFIG_COMPAT
1771	.compat_setsockopt = compat_ip_setsockopt,
1772	.compat_getsockopt = compat_ip_getsockopt,
1773#endif
1774};
1775
1776#ifdef CONFIG_TCP_MD5SIG
1777static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1778	.md5_lookup		= tcp_v4_md5_lookup,
1779	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1780	.md5_add		= tcp_v4_md5_add_func,
1781	.md5_parse		= tcp_v4_parse_md5_keys,
1782};
1783#endif
1784
1785/* NOTE: A lot of things are set to zero explicitly by the call to
1786 *       sk_alloc(), so they need not be done here.
1787 */
1788static int tcp_v4_init_sock(struct sock *sk)
1789{
1790	struct inet_connection_sock *icsk = inet_csk(sk);
1791	struct tcp_sock *tp = tcp_sk(sk);
1792
1793	skb_queue_head_init(&tp->out_of_order_queue);
1794	tcp_init_xmit_timers(sk);
1795	tcp_prequeue_init(tp);
1796
1797	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1798	tp->mdev = TCP_TIMEOUT_INIT;
1799
1800	/* So many TCP implementations out there (incorrectly) count the
1801	 * initial SYN frame in their delayed-ACK and congestion control
1802	 * algorithms that we must have the following bandaid to talk
1803	 * efficiently to them.  -DaveM
1804	 */
1805	tp->snd_cwnd = 2;
1806
1807	/* See draft-stevens-tcpca-spec-01 for discussion of the
1808	 * initialization of these values.
1809	 */
1810	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1811	tp->snd_cwnd_clamp = ~0;
1812	tp->mss_cache = 536;
1813
1814	tp->reordering = sysctl_tcp_reordering;
1815	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1816
1817	sk->sk_state = TCP_CLOSE;
1818
1819	sk->sk_write_space = sk_stream_write_space;
1820	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1821
1822	icsk->icsk_af_ops = &ipv4_specific;
1823	icsk->icsk_sync_mss = tcp_sync_mss;
1824#ifdef CONFIG_TCP_MD5SIG
1825	tp->af_specific = &tcp_sock_ipv4_specific;
1826#endif
1827
1828	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1829	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1830
1831	local_bh_disable();
1832	percpu_counter_inc(&tcp_sockets_allocated);
1833	local_bh_enable();
1834
1835	return 0;
1836}
1837
1838void tcp_v4_destroy_sock(struct sock *sk)
1839{
1840	struct tcp_sock *tp = tcp_sk(sk);
1841
1842	tcp_clear_xmit_timers(sk);
1843
1844	tcp_cleanup_congestion_control(sk);
1845
1846	/* Clean up the write buffer. */
1847	tcp_write_queue_purge(sk);
1848
1849	/* Cleans up our, hopefully empty, out_of_order_queue. */
1850	__skb_queue_purge(&tp->out_of_order_queue);
1851
1852#ifdef CONFIG_TCP_MD5SIG
1853	/* Clean up the MD5 key list, if any */
1854	if (tp->md5sig_info) {
1855		tcp_v4_clear_md5_list(sk);
1856		kfree(tp->md5sig_info);
1857		tp->md5sig_info = NULL;
1858	}
1859#endif
1860
1861#ifdef CONFIG_NET_DMA
1862	/* Cleans up our sk_async_wait_queue */
1863	__skb_queue_purge(&sk->sk_async_wait_queue);
1864#endif
1865
1866	/* Clean the prequeue; it really should be empty. */
1867	__skb_queue_purge(&tp->ucopy.prequeue);
1868
1869	/* Clean up a referenced TCP bind bucket. */
1870	if (inet_csk(sk)->icsk_bind_hash)
1871		inet_put_port(sk);
1872
1873	/*
1874	 * If sendmsg cached page exists, toss it.
1875	 */
1876	if (sk->sk_sndmsg_page) {
1877		__free_page(sk->sk_sndmsg_page);
1878		sk->sk_sndmsg_page = NULL;
1879	}
1880
1881	percpu_counter_dec(&tcp_sockets_allocated);
1882}
1883
1884EXPORT_SYMBOL(tcp_v4_destroy_sock);
1885
1886#ifdef CONFIG_PROC_FS
1887/* Proc filesystem TCP sock list dumping. */
1888
1889static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1890{
1891	return hlist_nulls_empty(head) ? NULL :
1892		list_entry(head->first, struct inet_timewait_sock, tw_node);
1893}
1894
1895static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1896{
1897	return !is_a_nulls(tw->tw_node.next) ?
1898		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1899}
1900
1901static void *listening_get_next(struct seq_file *seq, void *cur)
1902{
1903	struct inet_connection_sock *icsk;
1904	struct hlist_nulls_node *node;
1905	struct sock *sk = cur;
1906	struct inet_listen_hashbucket *ilb;
1907	struct tcp_iter_state *st = seq->private;
1908	struct net *net = seq_file_net(seq);
1909
1910	if (!sk) {
1911		st->bucket = 0;
1912		ilb = &tcp_hashinfo.listening_hash[0];
1913		spin_lock_bh(&ilb->lock);
1914		sk = sk_nulls_head(&ilb->head);
1915		goto get_sk;
1916	}
1917	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1918	++st->num;
1919
1920	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1921		struct request_sock *req = cur;
1922
1923		icsk = inet_csk(st->syn_wait_sk);
1924		req = req->dl_next;
1925		while (1) {
1926			while (req) {
1927				if (req->rsk_ops->family == st->family) {
1928					cur = req;
1929					goto out;
1930				}
1931				req = req->dl_next;
1932			}
1933			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1934				break;
1935get_req:
1936			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1937		}
1938		sk	  = sk_next(st->syn_wait_sk);
1939		st->state = TCP_SEQ_STATE_LISTENING;
1940		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1941	} else {
1942		icsk = inet_csk(sk);
1943		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1944		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1945			goto start_req;
1946		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1947		sk = sk_next(sk);
1948	}
1949get_sk:
1950	sk_nulls_for_each_from(sk, node) {
1951		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1952			cur = sk;
1953			goto out;
1954		}
1955		icsk = inet_csk(sk);
1956		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1957		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1958start_req:
1959			st->uid		= sock_i_uid(sk);
1960			st->syn_wait_sk = sk;
1961			st->state	= TCP_SEQ_STATE_OPENREQ;
1962			st->sbucket	= 0;
1963			goto get_req;
1964		}
1965		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1966	}
1967	spin_unlock_bh(&ilb->lock);
1968	if (++st->bucket < INET_LHTABLE_SIZE) {
1969		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1970		spin_lock_bh(&ilb->lock);
1971		sk = sk_nulls_head(&ilb->head);
1972		goto get_sk;
1973	}
1974	cur = NULL;
1975out:
1976	return cur;
1977}
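/*
 * Note that the listening-hash bucket lock (and, in the OPENREQ sub-state,
 * the listener's syn_wait_lock) is deliberately left held when an entry is
 * returned; it is dropped either when iteration moves on to the next bucket
 * or in tcp_seq_stop().
 */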
1978
1979static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1980{
1981	void *rc = listening_get_next(seq, NULL);
1982
1983	while (rc && *pos) {
1984		rc = listening_get_next(seq, rc);
1985		--*pos;
1986	}
1987	return rc;
1988}
1989
1990static inline int empty_bucket(struct tcp_iter_state *st)
1991{
1992	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1993		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1994}
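/*
 * Checking a bucket without taking its lock is racy, but for a /proc
 * snapshot it is acceptable to miss a socket that is being inserted
 * concurrently; non-empty buckets are still walked under the bucket lock.
 */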
1995
1996static void *established_get_first(struct seq_file *seq)
1997{
1998	struct tcp_iter_state *st = seq->private;
1999	struct net *net = seq_file_net(seq);
2000	void *rc = NULL;
2001
2002	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2003		struct sock *sk;
2004		struct hlist_nulls_node *node;
2005		struct inet_timewait_sock *tw;
2006		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2007
2008		/* Lockless fast path for the common case of empty buckets */
2009		if (empty_bucket(st))
2010			continue;
2011
2012		spin_lock_bh(lock);
2013		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2014			if (sk->sk_family != st->family ||
2015			    !net_eq(sock_net(sk), net)) {
2016				continue;
2017			}
2018			rc = sk;
2019			goto out;
2020		}
2021		st->state = TCP_SEQ_STATE_TIME_WAIT;
2022		inet_twsk_for_each(tw, node,
2023				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2024			if (tw->tw_family != st->family ||
2025			    !net_eq(twsk_net(tw), net)) {
2026				continue;
2027			}
2028			rc = tw;
2029			goto out;
2030		}
2031		spin_unlock_bh(lock);
2032		st->state = TCP_SEQ_STATE_ESTABLISHED;
2033	}
2034out:
2035	return rc;
2036}
2037
2038static void *established_get_next(struct seq_file *seq, void *cur)
2039{
2040	struct sock *sk = cur;
2041	struct inet_timewait_sock *tw;
2042	struct hlist_nulls_node *node;
2043	struct tcp_iter_state *st = seq->private;
2044	struct net *net = seq_file_net(seq);
2045
2046	++st->num;
2047
2048	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2049		tw = cur;
2050		tw = tw_next(tw);
2051get_tw:
2052		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2053			tw = tw_next(tw);
2054		}
2055		if (tw) {
2056			cur = tw;
2057			goto out;
2058		}
2059		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2060		st->state = TCP_SEQ_STATE_ESTABLISHED;
2061
2062		/* Look for the next non-empty bucket */
2063		while (++st->bucket < tcp_hashinfo.ehash_size &&
2064				empty_bucket(st))
2065			;
2066		if (st->bucket >= tcp_hashinfo.ehash_size)
2067			return NULL;
2068
2069		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2070		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2071	} else
2072		sk = sk_nulls_next(sk);
2073
2074	sk_nulls_for_each_from(sk, node) {
2075		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2076			goto found;
2077	}
2078
2079	st->state = TCP_SEQ_STATE_TIME_WAIT;
2080	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2081	goto get_tw;
2082found:
2083	cur = sk;
2084out:
2085	return cur;
2086}
2087
2088static void *established_get_idx(struct seq_file *seq, loff_t pos)
2089{
2090	void *rc = established_get_first(seq);
2091
2092	while (rc && pos) {
2093		rc = established_get_next(seq, rc);
2094		--pos;
2095	}
2096	return rc;
2097}
2098
2099static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2100{
2101	void *rc;
2102	struct tcp_iter_state *st = seq->private;
2103
2104	st->state = TCP_SEQ_STATE_LISTENING;
2105	rc	  = listening_get_idx(seq, &pos);
2106
2107	if (!rc) {
2108		st->state = TCP_SEQ_STATE_ESTABLISHED;
2109		rc	  = established_get_idx(seq, pos);
2110	}
2111
2112	return rc;
2113}
2114
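/*
 * Standard seq_file iterator protocol: ->start returns SEQ_START_TOKEN for
 * the header line when *pos is zero (otherwise it re-walks to the requested
 * position), ->next advances through the listening, established and
 * time-wait populations, and ->stop drops whatever bucket lock is still
 * held for the current entry.
 */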
2115static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2116{
2117	struct tcp_iter_state *st = seq->private;
2118	st->state = TCP_SEQ_STATE_LISTENING;
2119	st->num = 0;
2120	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2121}
2122
2123static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2124{
2125	void *rc = NULL;
2126	struct tcp_iter_state *st;
2127
2128	if (v == SEQ_START_TOKEN) {
2129		rc = tcp_get_idx(seq, 0);
2130		goto out;
2131	}
2132	st = seq->private;
2133
2134	switch (st->state) {
2135	case TCP_SEQ_STATE_OPENREQ:
2136	case TCP_SEQ_STATE_LISTENING:
2137		rc = listening_get_next(seq, v);
2138		if (!rc) {
2139			st->state = TCP_SEQ_STATE_ESTABLISHED;
2140			rc	  = established_get_first(seq);
2141		}
2142		break;
2143	case TCP_SEQ_STATE_ESTABLISHED:
2144	case TCP_SEQ_STATE_TIME_WAIT:
2145		rc = established_get_next(seq, v);
2146		break;
2147	}
2148out:
2149	++*pos;
2150	return rc;
2151}
2152
2153static void tcp_seq_stop(struct seq_file *seq, void *v)
2154{
2155	struct tcp_iter_state *st = seq->private;
2156
2157	switch (st->state) {
2158	case TCP_SEQ_STATE_OPENREQ:
2159		if (v) {
2160			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2161			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2162		}
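		/* Fall through: the listening bucket lock is released below. */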
2163	case TCP_SEQ_STATE_LISTENING:
2164		if (v != SEQ_START_TOKEN)
2165			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2166		break;
2167	case TCP_SEQ_STATE_TIME_WAIT:
2168	case TCP_SEQ_STATE_ESTABLISHED:
2169		if (v)
2170			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2171		break;
2172	}
2173}
2174
2175static int tcp_seq_open(struct inode *inode, struct file *file)
2176{
2177	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2178	struct tcp_iter_state *s;
2179	int err;
2180
2181	err = seq_open_net(inode, file, &afinfo->seq_ops,
2182			  sizeof(struct tcp_iter_state));
2183	if (err < 0)
2184		return err;
2185
2186	s = ((struct seq_file *)file->private_data)->private;
2187	s->family		= afinfo->family;
2188	return 0;
2189}
2190
2191int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2192{
2193	int rc = 0;
2194	struct proc_dir_entry *p;
2195
2196	afinfo->seq_fops.open		= tcp_seq_open;
2197	afinfo->seq_fops.read		= seq_read;
2198	afinfo->seq_fops.llseek		= seq_lseek;
2199	afinfo->seq_fops.release	= seq_release_net;
2200
2201	afinfo->seq_ops.start		= tcp_seq_start;
2202	afinfo->seq_ops.next		= tcp_seq_next;
2203	afinfo->seq_ops.stop		= tcp_seq_stop;
2204
2205	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2206			     &afinfo->seq_fops, afinfo);
2207	if (!p)
2208		rc = -ENOMEM;
2209	return rc;
2210}
2211
2212void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2213{
2214	proc_net_remove(net, afinfo->name);
2215}
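/*
 * A minimal, hypothetical user of the two helpers above (names are
 * illustrative): an address family supplies .name, .family, .seq_fops.owner
 * and .seq_ops.show, and tcp_proc_register() fills in the rest of the
 * seq_file plumbing.  See tcp4_seq_afinfo below for the real IPv4 instance.
 *
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.name		= "tcpX",
 *		.family		= AF_INET,
 *		.seq_fops	= { .owner = THIS_MODULE, },
 *		.seq_ops	= { .show  = tcpX_seq_show, },
 *	};
 *
 *	err = tcp_proc_register(net, &tcpX_seq_afinfo);
 */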
2216
2217static void get_openreq4(struct sock *sk, struct request_sock *req,
2218			 struct seq_file *f, int i, int uid, int *len)
2219{
2220	const struct inet_request_sock *ireq = inet_rsk(req);
2221	int ttd = req->expires - jiffies;
2222
2223	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2224		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2225		i,
2226		ireq->loc_addr,
2227		ntohs(inet_sk(sk)->sport),
2228		ireq->rmt_addr,
2229		ntohs(ireq->rmt_port),
2230		TCP_SYN_RECV,
2231		0, 0, /* could print option size, but that is af dependent. */
2232		1,    /* timers active (only the expire timer) */
2233		jiffies_to_clock_t(ttd),
2234		req->retrans,
2235		uid,
2236		0,  /* non standard timer */
2237		0, /* open_requests have no inode */
2238		atomic_read(&sk->sk_refcnt),
2239		req,
2240		len);
2241}
2242
2243static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2244{
2245	int timer_active;
2246	unsigned long timer_expires;
2247	struct tcp_sock *tp = tcp_sk(sk);
2248	const struct inet_connection_sock *icsk = inet_csk(sk);
2249	struct inet_sock *inet = inet_sk(sk);
2250	__be32 dest = inet->daddr;
2251	__be32 src = inet->rcv_saddr;
2252	__u16 destp = ntohs(inet->dport);
2253	__u16 srcp = ntohs(inet->sport);
2254
2255	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2256		timer_active	= 1;
2257		timer_expires	= icsk->icsk_timeout;
2258	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2259		timer_active	= 4;
2260		timer_expires	= icsk->icsk_timeout;
2261	} else if (timer_pending(&sk->sk_timer)) {
2262		timer_active	= 2;
2263		timer_expires	= sk->sk_timer.expires;
2264	} else {
2265		timer_active	= 0;
2266		timer_expires	= jiffies;
2267	}
2268
2269	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2270			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2271		i, src, srcp, dest, destp, sk->sk_state,
2272		tp->write_seq - tp->snd_una,
2273		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2274					     (tp->rcv_nxt - tp->copied_seq),
2275		timer_active,
2276		jiffies_to_clock_t(timer_expires - jiffies),
2277		icsk->icsk_retransmits,
2278		sock_i_uid(sk),
2279		icsk->icsk_probes_out,
2280		sock_i_ino(sk),
2281		atomic_read(&sk->sk_refcnt), sk,
2282		jiffies_to_clock_t(icsk->icsk_rto),
2283		jiffies_to_clock_t(icsk->icsk_ack.ato),
2284		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2285		tp->snd_cwnd,
2286		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2287		len);
2288}
2289
2290static void get_timewait4_sock(struct inet_timewait_sock *tw,
2291			       struct seq_file *f, int i, int *len)
2292{
2293	__be32 dest, src;
2294	__u16 destp, srcp;
2295	int ttd = tw->tw_ttd - jiffies;
2296
2297	if (ttd < 0)
2298		ttd = 0;
2299
2300	dest  = tw->tw_daddr;
2301	src   = tw->tw_rcv_saddr;
2302	destp = ntohs(tw->tw_dport);
2303	srcp  = ntohs(tw->tw_sport);
2304
2305	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2306		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2307		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2308		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2309		atomic_read(&tw->tw_refcnt), tw, len);
2310}
2311
2312#define TMPSZ 150
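/*
 * Every record in /proc/net/tcp is padded out to TMPSZ - 1 characters plus a
 * newline, giving fixed-width lines; the per-record helpers above report the
 * number of characters they wrote via the trailing %n so tcp4_seq_show() can
 * compute the padding.
 */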
2313
2314static int tcp4_seq_show(struct seq_file *seq, void *v)
2315{
2316	struct tcp_iter_state *st;
2317	int len;
2318
2319	if (v == SEQ_START_TOKEN) {
2320		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2321			   "  sl  local_address rem_address   st tx_queue "
2322			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2323			   "inode");
2324		goto out;
2325	}
2326	st = seq->private;
2327
2328	switch (st->state) {
2329	case TCP_SEQ_STATE_LISTENING:
2330	case TCP_SEQ_STATE_ESTABLISHED:
2331		get_tcp4_sock(v, seq, st->num, &len);
2332		break;
2333	case TCP_SEQ_STATE_OPENREQ:
2334		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2335		break;
2336	case TCP_SEQ_STATE_TIME_WAIT:
2337		get_timewait4_sock(v, seq, st->num, &len);
2338		break;
2339	}
2340	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2341out:
2342	return 0;
2343}
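/*
 * An illustrative /proc/net/tcp record for a listener on 127.0.0.1:22
 * (values are made up, trailing fields abbreviated; columns match the header
 * printed above):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ...
 */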
2344
2345static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2346	.name		= "tcp",
2347	.family		= AF_INET,
2348	.seq_fops	= {
2349		.owner		= THIS_MODULE,
2350	},
2351	.seq_ops	= {
2352		.show		= tcp4_seq_show,
2353	},
2354};
2355
2356static int tcp4_proc_init_net(struct net *net)
2357{
2358	return tcp_proc_register(net, &tcp4_seq_afinfo);
2359}
2360
2361static void tcp4_proc_exit_net(struct net *net)
2362{
2363	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2364}
2365
2366static struct pernet_operations tcp4_net_ops = {
2367	.init = tcp4_proc_init_net,
2368	.exit = tcp4_proc_exit_net,
2369};
2370
2371int __init tcp4_proc_init(void)
2372{
2373	return register_pernet_subsys(&tcp4_net_ops);
2374}
2375
2376void tcp4_proc_exit(void)
2377{
2378	unregister_pernet_subsys(&tcp4_net_ops);
2379}
2380#endif /* CONFIG_PROC_FS */
2381
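/*
 * GRO receive hook for TCP over IPv4: if the device supplied a full packet
 * checksum (CHECKSUM_COMPLETE) it is verified against the pseudo-header and
 * converted to CHECKSUM_UNNECESSARY; packets with no checksum information
 * (CHECKSUM_NONE) are flagged for flushing and bypass aggregation; everything
 * else is handed to the generic tcp_gro_receive().
 */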
2382struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2383{
2384	struct iphdr *iph = skb_gro_network_header(skb);
2385
2386	switch (skb->ip_summed) {
2387	case CHECKSUM_COMPLETE:
2388		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2389				  skb->csum)) {
2390			skb->ip_summed = CHECKSUM_UNNECESSARY;
2391			break;
2392		}
2393
2394		/* fall through */
2395	case CHECKSUM_NONE:
2396		NAPI_GRO_CB(skb)->flush = 1;
2397		return NULL;
2398	}
2399
2400	return tcp_gro_receive(head, skb);
2401}
2402EXPORT_SYMBOL(tcp4_gro_receive);
2403
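/*
 * GRO completion: the merged super-packet gets a fresh pseudo-header checksum
 * and is marked SKB_GSO_TCPV4 so it can be resegmented later if necessary.
 */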
2404int tcp4_gro_complete(struct sk_buff *skb)
2405{
2406	struct iphdr *iph = ip_hdr(skb);
2407	struct tcphdr *th = tcp_hdr(skb);
2408
2409	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2410				  iph->saddr, iph->daddr, 0);
2411	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2412
2413	return tcp_gro_complete(skb);
2414}
2415EXPORT_SYMBOL(tcp4_gro_complete);
2416
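/*
 * tcp_prot is the protocol descriptor handed to the generic socket layer for
 * IPPROTO_TCP stream sockets; its hooks point back at the IPv4-specific
 * implementations in this file and at the address-family independent TCP
 * code.
 */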
2417struct proto tcp_prot = {
2418	.name			= "TCP",
2419	.owner			= THIS_MODULE,
2420	.close			= tcp_close,
2421	.connect		= tcp_v4_connect,
2422	.disconnect		= tcp_disconnect,
2423	.accept			= inet_csk_accept,
2424	.ioctl			= tcp_ioctl,
2425	.init			= tcp_v4_init_sock,
2426	.destroy		= tcp_v4_destroy_sock,
2427	.shutdown		= tcp_shutdown,
2428	.setsockopt		= tcp_setsockopt,
2429	.getsockopt		= tcp_getsockopt,
2430	.recvmsg		= tcp_recvmsg,
2431	.backlog_rcv		= tcp_v4_do_rcv,
2432	.hash			= inet_hash,
2433	.unhash			= inet_unhash,
2434	.get_port		= inet_csk_get_port,
2435	.enter_memory_pressure	= tcp_enter_memory_pressure,
2436	.sockets_allocated	= &tcp_sockets_allocated,
2437	.orphan_count		= &tcp_orphan_count,
2438	.memory_allocated	= &tcp_memory_allocated,
2439	.memory_pressure	= &tcp_memory_pressure,
2440	.sysctl_mem		= sysctl_tcp_mem,
2441	.sysctl_wmem		= sysctl_tcp_wmem,
2442	.sysctl_rmem		= sysctl_tcp_rmem,
2443	.max_header		= MAX_TCP_HEADER,
2444	.obj_size		= sizeof(struct tcp_sock),
2445	.slab_flags		= SLAB_DESTROY_BY_RCU,
2446	.twsk_prot		= &tcp_timewait_sock_ops,
2447	.rsk_prot		= &tcp_request_sock_ops,
2448	.h.hashinfo		= &tcp_hashinfo,
2449#ifdef CONFIG_COMPAT
2450	.compat_setsockopt	= compat_tcp_setsockopt,
2451	.compat_getsockopt	= compat_tcp_getsockopt,
2452#endif
2453};
2454
2455
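/*
 * Per-namespace setup: tcp_sk_init() creates the kernel control socket used
 * to transmit RSTs and ACKs that are not associated with any local socket;
 * tcp_sk_exit() destroys it and purges the namespace's TIME_WAIT sockets.
 */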
2456static int __net_init tcp_sk_init(struct net *net)
2457{
2458	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2459				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2460}
2461
2462static void __net_exit tcp_sk_exit(struct net *net)
2463{
2464	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2465	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2466}
2467
2468static struct pernet_operations __net_initdata tcp_sk_ops = {
2469	.init = tcp_sk_init,
2470	.exit = tcp_sk_exit,
2471};
2472
2473void __init tcp_v4_init(void)
2474{
2475	inet_hashinfo_init(&tcp_hashinfo);
2476	if (register_pernet_subsys(&tcp_sk_ops))
2477		panic("Failed to create the TCP control socket.\n");
2478}
2479
2480EXPORT_SYMBOL(ipv4_specific);
2481EXPORT_SYMBOL(tcp_hashinfo);
2482EXPORT_SYMBOL(tcp_prot);
2483EXPORT_SYMBOL(tcp_v4_conn_request);
2484EXPORT_SYMBOL(tcp_v4_connect);
2485EXPORT_SYMBOL(tcp_v4_do_rcv);
2486EXPORT_SYMBOL(tcp_v4_remember_stamp);
2487EXPORT_SYMBOL(tcp_v4_send_check);
2488EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2489
2490#ifdef CONFIG_PROC_FS
2491EXPORT_SYMBOL(tcp_proc_register);
2492EXPORT_SYMBOL(tcp_proc_unregister);
2493#endif
2494EXPORT_SYMBOL(sysctl_tcp_low_latency);
2495
2496