tcp_ipv4.c revision bdeab991918663aed38757904219e8398214334c
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45 *					year-long coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75#include <net/secure_seq.h>
76
77#include <linux/inet.h>
78#include <linux/ipv6.h>
79#include <linux/stddef.h>
80#include <linux/proc_fs.h>
81#include <linux/seq_file.h>
82
83#include <linux/crypto.h>
84#include <linux/scatterlist.h>
85
86int sysctl_tcp_tw_reuse __read_mostly;
87int sysctl_tcp_low_latency __read_mostly;
88EXPORT_SYMBOL(sysctl_tcp_low_latency);
89
90
91#ifdef CONFIG_TCP_MD5SIG
92static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93						   __be32 addr);
94static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
95			       __be32 daddr, __be32 saddr, struct tcphdr *th);
96#else
97static inline
98struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99{
100	return NULL;
101}
102#endif
103
104struct inet_hashinfo tcp_hashinfo;
105EXPORT_SYMBOL(tcp_hashinfo);
106
107static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
108{
109	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
110					  ip_hdr(skb)->saddr,
111					  tcp_hdr(skb)->dest,
112					  tcp_hdr(skb)->source);
113}
114
115int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116{
117	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118	struct tcp_sock *tp = tcp_sk(sk);
119
120	/* With PAWS, it is safe from the viewpoint
121	   of data integrity. Even without PAWS it is safe provided sequence
122	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
123
124	   Actually, the idea is close to VJ's: only the timestamp cache is
125	   held not per host but per port pair, and the TW bucket is used as
126	   the state holder.
127
128	   If the TW bucket has already been destroyed, we fall back to VJ's
129	   scheme and use the initial timestamp retrieved from the peer table.
130	 */
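	/* Concretely: the TIME-WAIT socket can be reused when it has a
	 * cached timestamp and either the caller does not need the bucket
	 * back (twp == NULL) or tcp_tw_reuse is set and more than a second
	 * has passed since that timestamp. The new connection then starts
	 * its sequence space beyond the old window (tw_snd_nxt + 65535 + 2)
	 * and inherits ts_recent, so PAWS keeps stray old segments out.
	 */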
131	if (tcptw->tw_ts_recent_stamp &&
132	    (twp == NULL || (sysctl_tcp_tw_reuse &&
133			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
134		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
135		if (tp->write_seq == 0)
136			tp->write_seq = 1;
137		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
138		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139		sock_hold(sktw);
140		return 1;
141	}
142
143	return 0;
144}
145EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
147/* This will initiate an outgoing connection. */
148int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149{
150	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
151	struct inet_sock *inet = inet_sk(sk);
152	struct tcp_sock *tp = tcp_sk(sk);
153	__be16 orig_sport, orig_dport;
154	__be32 daddr, nexthop;
155	struct flowi4 *fl4;
156	struct rtable *rt;
157	int err;
158	struct ip_options_rcu *inet_opt;
159
160	if (addr_len < sizeof(struct sockaddr_in))
161		return -EINVAL;
162
163	if (usin->sin_family != AF_INET)
164		return -EAFNOSUPPORT;
165
166	nexthop = daddr = usin->sin_addr.s_addr;
167	inet_opt = rcu_dereference_protected(inet->inet_opt,
168					     sock_owned_by_user(sk));
169	if (inet_opt && inet_opt->opt.srr) {
170		if (!daddr)
171			return -EINVAL;
172		nexthop = inet_opt->opt.faddr;
173	}
174
175	orig_sport = inet->inet_sport;
176	orig_dport = usin->sin_port;
177	fl4 = &inet->cork.fl.u.ip4;
178	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
179			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
180			      IPPROTO_TCP,
181			      orig_sport, orig_dport, sk, true);
182	if (IS_ERR(rt)) {
183		err = PTR_ERR(rt);
184		if (err == -ENETUNREACH)
185			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
186		return err;
187	}
188
189	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
190		ip_rt_put(rt);
191		return -ENETUNREACH;
192	}
193
194	if (!inet_opt || !inet_opt->opt.srr)
195		daddr = fl4->daddr;
196
197	if (!inet->inet_saddr)
198		inet->inet_saddr = fl4->saddr;
199	inet->inet_rcv_saddr = inet->inet_saddr;
200
201	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
202		/* Reset inherited state */
203		tp->rx_opt.ts_recent	   = 0;
204		tp->rx_opt.ts_recent_stamp = 0;
205		tp->write_seq		   = 0;
206	}
207
208	if (tcp_death_row.sysctl_tw_recycle &&
209	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
210		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
211		/*
212		 * VJ's idea. We save the last timestamp seen from the
213		 * destination in the peer table when entering state
214		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
215		 * when trying a new connection.
216		 */
217		if (peer) {
218			inet_peer_refcheck(peer);
219			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
220				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
221				tp->rx_opt.ts_recent = peer->tcp_ts;
222			}
223		}
224	}
225
226	inet->inet_dport = usin->sin_port;
227	inet->inet_daddr = daddr;
228
229	inet_csk(sk)->icsk_ext_hdr_len = 0;
230	if (inet_opt)
231		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
232
233	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
234
235	/* Socket identity is still unknown (sport may be zero).
236	 * However we set state to SYN-SENT and, without releasing the socket
237	 * lock, select a source port, enter ourselves into the hash tables
238	 * and complete initialization after this.
239	 */
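	/* inet_hash_connect() picks an ephemeral source port if none is
	 * bound and inserts the socket into the established hash; the route
	 * is then re-validated for the finally chosen ports with
	 * ip_route_newports() before being committed via sk_setup_caps().
	 */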
240	tcp_set_state(sk, TCP_SYN_SENT);
241	err = inet_hash_connect(&tcp_death_row, sk);
242	if (err)
243		goto failure;
244
245	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
246			       inet->inet_sport, inet->inet_dport, sk);
247	if (IS_ERR(rt)) {
248		err = PTR_ERR(rt);
249		rt = NULL;
250		goto failure;
251	}
252	/* OK, now commit destination to socket.  */
253	sk->sk_gso_type = SKB_GSO_TCPV4;
254	sk_setup_caps(sk, &rt->dst);
255
256	if (!tp->write_seq)
257		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
258							   inet->inet_daddr,
259							   inet->inet_sport,
260							   usin->sin_port);
261
262	inet->inet_id = tp->write_seq ^ jiffies;
263
264	err = tcp_connect(sk);
265	rt = NULL;
266	if (err)
267		goto failure;
268
269	return 0;
270
271failure:
272	/*
273	 * This unhashes the socket and releases the local port,
274	 * if necessary.
275	 */
276	tcp_set_state(sk, TCP_CLOSE);
277	ip_rt_put(rt);
278	sk->sk_route_caps = 0;
279	inet->inet_dport = 0;
280	return err;
281}
282EXPORT_SYMBOL(tcp_v4_connect);
283
284/*
285 * This routine does path mtu discovery as defined in RFC1191.
286 */
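/*
 * Roughly: ignore LISTEN sockets, push the reported MTU into the cached
 * route via update_pmtu(), and, unless PMTU discovery is disabled on the
 * socket, shrink the MSS with tcp_sync_mss() and retransmit immediately
 * (tcp_simple_retransmit()) when the path MTU we were using
 * (icsk_pmtu_cookie) turns out to be too large.
 */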
287static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
288{
289	struct dst_entry *dst;
290	struct inet_sock *inet = inet_sk(sk);
291
292	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
293	 * sent out by Linux are always < 576 bytes, so they should go through
294	 * unfragmented).
295	 */
296	if (sk->sk_state == TCP_LISTEN)
297		return;
298
299	/* We don't check in the dst entry if pmtu discovery is forbidden
300	 * on this route. We just assume that no packet-too-big packets
301	 * are sent back when pmtu discovery is not active.
302	 * There is a small race when the user changes this flag in the
303	 * route, but I think that's acceptable.
304	 */
305	if ((dst = __sk_dst_check(sk, 0)) == NULL)
306		return;
307
308	dst->ops->update_pmtu(dst, mtu);
309
310	/* Something is about to go wrong... Remember the soft error
311	 * for the case that this connection is not able to recover.
312	 */
313	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314		sk->sk_err_soft = EMSGSIZE;
315
316	mtu = dst_mtu(dst);
317
318	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
320		tcp_sync_mss(sk, mtu);
321
322		/* Resend the TCP packet because it's
323		 * clear that the old packet has been
324		 * dropped. This is the new "fast" path mtu
325		 * discovery.
326		 */
327		tcp_simple_retransmit(sk);
328	} /* else let the usual retransmit timer handle it */
329}
330
331/*
332 * This routine is called by the ICMP module when it gets some
333 * sort of error condition.  If err < 0 then the socket should
334 * be closed and the error returned to the user.  If err > 0
335 * it's just the icmp type << 8 | icmp code.  After adjustment
336 * header points to the first 8 bytes of the tcp header.  We need
337 * to find the appropriate port.
338 *
339 * The locking strategy used here is very "optimistic". When
340 * someone else accesses the socket the ICMP is just dropped
341 * and for some paths there is no check at all.
342 * A more general error queue to queue errors for later handling
343 * is probably better.
344 *
345 */
346
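/*
 * Flow: map the ICMP payload back to a TCP socket, check that the quoted
 * sequence number is actually in flight, then act on the type:
 * ICMP_FRAG_NEEDED feeds PMTU discovery, NET/HOST unreachable may revert
 * one RTO backoff step (see below), and errors are reported to the user
 * only for SYN_SENT/SYN_RECV or when IP_RECVERR is enabled; otherwise
 * they are only recorded as a soft error.
 */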
347void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
348{
349	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
350	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
351	struct inet_connection_sock *icsk;
352	struct tcp_sock *tp;
353	struct inet_sock *inet;
354	const int type = icmp_hdr(icmp_skb)->type;
355	const int code = icmp_hdr(icmp_skb)->code;
356	struct sock *sk;
357	struct sk_buff *skb;
358	__u32 seq;
359	__u32 remaining;
360	int err;
361	struct net *net = dev_net(icmp_skb->dev);
362
363	if (icmp_skb->len < (iph->ihl << 2) + 8) {
364		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365		return;
366	}
367
368	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
369			iph->saddr, th->source, inet_iif(icmp_skb));
370	if (!sk) {
371		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
372		return;
373	}
374	if (sk->sk_state == TCP_TIME_WAIT) {
375		inet_twsk_put(inet_twsk(sk));
376		return;
377	}
378
379	bh_lock_sock(sk);
380	/* If too many ICMPs get dropped on busy
381	 * servers this needs to be solved differently.
382	 */
383	if (sock_owned_by_user(sk))
384		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
385
386	if (sk->sk_state == TCP_CLOSE)
387		goto out;
388
389	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
390		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
391		goto out;
392	}
393
394	icsk = inet_csk(sk);
395	tp = tcp_sk(sk);
396	seq = ntohl(th->seq);
397	if (sk->sk_state != TCP_LISTEN &&
398	    !between(seq, tp->snd_una, tp->snd_nxt)) {
399		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
400		goto out;
401	}
402
403	switch (type) {
404	case ICMP_SOURCE_QUENCH:
405		/* Just silently ignore these. */
406		goto out;
407	case ICMP_PARAMETERPROB:
408		err = EPROTO;
409		break;
410	case ICMP_DEST_UNREACH:
411		if (code > NR_ICMP_UNREACH)
412			goto out;
413
414		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
415			if (!sock_owned_by_user(sk))
416				do_pmtu_discovery(sk, iph, info);
417			goto out;
418		}
419
420		err = icmp_err_convert[code].errno;
421		/* check if icmp_skb allows revert of backoff
422		 * (see draft-zimmermann-tcp-lcd) */
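		/* The idea: this ICMP means the retransmitted segment was
		 * lost for a reachability reason rather than congestion, so
		 * (if seq == snd_una and we have backed off) undo one backoff
		 * step, recompute the RTO, and rearm the retransmit timer for
		 * the time remaining, or retransmit right away if that time
		 * has already run out.
		 */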
423		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
424			break;
425		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
426		    !icsk->icsk_backoff)
427			break;
428
429		if (sock_owned_by_user(sk))
430			break;
431
432		icsk->icsk_backoff--;
433		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
434			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
435		tcp_bound_rto(sk);
436
437		skb = tcp_write_queue_head(sk);
438		BUG_ON(!skb);
439
440		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
441				tcp_time_stamp - TCP_SKB_CB(skb)->when);
442
443		if (remaining) {
444			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445						  remaining, TCP_RTO_MAX);
446		} else {
447			/* RTO revert clocked out retransmission.
448			 * Will retransmit now */
449			tcp_retransmit_timer(sk);
450		}
451
452		break;
453	case ICMP_TIME_EXCEEDED:
454		err = EHOSTUNREACH;
455		break;
456	default:
457		goto out;
458	}
459
460	switch (sk->sk_state) {
461		struct request_sock *req, **prev;
462	case TCP_LISTEN:
463		if (sock_owned_by_user(sk))
464			goto out;
465
466		req = inet_csk_search_req(sk, &prev, th->dest,
467					  iph->daddr, iph->saddr);
468		if (!req)
469			goto out;
470
471		/* ICMPs are not backlogged, hence we cannot get
472		   an established socket here.
473		 */
474		WARN_ON(req->sk);
475
476		if (seq != tcp_rsk(req)->snt_isn) {
477			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478			goto out;
479		}
480
481		/*
482		 * Still in SYN_RECV, just remove it silently.
483		 * There is no good way to pass the error to the newly
484		 * created socket, and POSIX does not want network
485		 * errors returned from accept().
486		 */
487		inet_csk_reqsk_queue_drop(sk, req, prev);
488		goto out;
489
490	case TCP_SYN_SENT:
491	case TCP_SYN_RECV:  /* Cannot happen normally.
492			       It can, e.g., if SYNs crossed.
493			     */
494		if (!sock_owned_by_user(sk)) {
495			sk->sk_err = err;
496
497			sk->sk_error_report(sk);
498
499			tcp_done(sk);
500		} else {
501			sk->sk_err_soft = err;
502		}
503		goto out;
504	}
505
506	/* If we've already connected we will keep trying
507	 * until we time out, or the user gives up.
508	 *
509	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
510	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
511	 * by pmtu discovery).
512	 *
513	 * Note that in the modern internet, where routing is unreliable and
514	 * broken firewalls sit in every dark corner sending random errors as
515	 * ordered by their masters, even these two messages have finally lost
516	 * their original sense (even Linux sends invalid PORT_UNREACHs).
517	 *
518	 * Now we are in compliance with RFCs.
519	 *							--ANK (980905)
520	 */
521
522	inet = inet_sk(sk);
523	if (!sock_owned_by_user(sk) && inet->recverr) {
524		sk->sk_err = err;
525		sk->sk_error_report(sk);
526	} else	{ /* Only an error on timeout */
527		sk->sk_err_soft = err;
528	}
529
530out:
531	bh_unlock_sock(sk);
532	sock_put(sk);
533}
534
535static void __tcp_v4_send_check(struct sk_buff *skb,
536				__be32 saddr, __be32 daddr)
537{
538	struct tcphdr *th = tcp_hdr(skb);
539
540	if (skb->ip_summed == CHECKSUM_PARTIAL) {
541		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
542		skb->csum_start = skb_transport_header(skb) - skb->head;
543		skb->csum_offset = offsetof(struct tcphdr, check);
544	} else {
545		th->check = tcp_v4_check(skb->len, saddr, daddr,
546					 csum_partial(th,
547						      th->doff << 2,
548						      skb->csum));
549	}
550}
551
552/* This routine computes an IPv4 TCP checksum. */
553void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
554{
555	struct inet_sock *inet = inet_sk(sk);
556
557	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
558}
559EXPORT_SYMBOL(tcp_v4_send_check);
560
561int tcp_v4_gso_send_check(struct sk_buff *skb)
562{
563	const struct iphdr *iph;
564	struct tcphdr *th;
565
566	if (!pskb_may_pull(skb, sizeof(*th)))
567		return -EINVAL;
568
569	iph = ip_hdr(skb);
570	th = tcp_hdr(skb);
571
572	th->check = 0;
573	skb->ip_summed = CHECKSUM_PARTIAL;
574	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
575	return 0;
576}
577
578/*
579 *	This routine will send an RST to the other tcp.
580 *
581 *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
582 *		      for the reset?
583 *	Answer: if a packet caused an RST, it is not for a socket
584 *		existing in our system; if it is matched to a socket,
585 *		it is just a duplicate segment or a bug in the other
586 *		side's TCP. So we build the reply based only on the
587 *		parameters that arrived with the segment.
588 *	Exception: precedence violation. We do not implement it in any case.
589 */
590
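/*
 * The reply reuses the offending segment's addressing: ports are swapped,
 * and if that segment carried an ACK we send a bare RST with seq equal to
 * its ack_seq; otherwise we send RST|ACK acknowledging everything the
 * segment occupied (seq + syn + fin + payload). An MD5 signature option
 * is appended when a matching key is configured, and the packet goes out
 * through ip_send_reply() on the per-namespace control socket.
 */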
591static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
592{
593	struct tcphdr *th = tcp_hdr(skb);
594	struct {
595		struct tcphdr th;
596#ifdef CONFIG_TCP_MD5SIG
597		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
598#endif
599	} rep;
600	struct ip_reply_arg arg;
601#ifdef CONFIG_TCP_MD5SIG
602	struct tcp_md5sig_key *key;
603#endif
604	struct net *net;
605
606	/* Never send a reset in response to a reset. */
607	if (th->rst)
608		return;
609
610	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
611		return;
612
613	/* Swap the send and the receive. */
614	memset(&rep, 0, sizeof(rep));
615	rep.th.dest   = th->source;
616	rep.th.source = th->dest;
617	rep.th.doff   = sizeof(struct tcphdr) / 4;
618	rep.th.rst    = 1;
619
620	if (th->ack) {
621		rep.th.seq = th->ack_seq;
622	} else {
623		rep.th.ack = 1;
624		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625				       skb->len - (th->doff << 2));
626	}
627
628	memset(&arg, 0, sizeof(arg));
629	arg.iov[0].iov_base = (unsigned char *)&rep;
630	arg.iov[0].iov_len  = sizeof(rep.th);
631
632#ifdef CONFIG_TCP_MD5SIG
633	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
634	if (key) {
635		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
636				   (TCPOPT_NOP << 16) |
637				   (TCPOPT_MD5SIG << 8) |
638				   TCPOLEN_MD5SIG);
639		/* Update length and the length the header thinks exists */
640		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
641		rep.th.doff = arg.iov[0].iov_len / 4;
642
643		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
644				     key, ip_hdr(skb)->saddr,
645				     ip_hdr(skb)->daddr, &rep.th);
646	}
647#endif
648	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
649				      ip_hdr(skb)->saddr, /* XXX */
650				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
651	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
652	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
653
654	net = dev_net(skb_dst(skb)->dev);
655	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
656		      &arg, arg.iov[0].iov_len);
657
658	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
659	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
660}
661
662/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
663   outside socket context, is certainly ugly. What can I do?
664 */
665
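/*
 * tcp_v4_send_ack() builds a bare ACK from scratch: optional timestamp
 * and MD5 options are packed into rep.opt[], doff is derived from the
 * final iovec length, and the segment is emitted with ip_send_reply()
 * just like the RSTs above. It serves both TIME-WAIT ACKs and ACKs for
 * requests still sitting in the SYN queue.
 */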
666static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
667			    u32 win, u32 ts, int oif,
668			    struct tcp_md5sig_key *key,
669			    int reply_flags)
670{
671	struct tcphdr *th = tcp_hdr(skb);
672	struct {
673		struct tcphdr th;
674		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
675#ifdef CONFIG_TCP_MD5SIG
676			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
677#endif
678			];
679	} rep;
680	struct ip_reply_arg arg;
681	struct net *net = dev_net(skb_dst(skb)->dev);
682
683	memset(&rep.th, 0, sizeof(struct tcphdr));
684	memset(&arg, 0, sizeof(arg));
685
686	arg.iov[0].iov_base = (unsigned char *)&rep;
687	arg.iov[0].iov_len  = sizeof(rep.th);
688	if (ts) {
689		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
690				   (TCPOPT_TIMESTAMP << 8) |
691				   TCPOLEN_TIMESTAMP);
692		rep.opt[1] = htonl(tcp_time_stamp);
693		rep.opt[2] = htonl(ts);
694		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
695	}
696
697	/* Swap the send and the receive. */
698	rep.th.dest    = th->source;
699	rep.th.source  = th->dest;
700	rep.th.doff    = arg.iov[0].iov_len / 4;
701	rep.th.seq     = htonl(seq);
702	rep.th.ack_seq = htonl(ack);
703	rep.th.ack     = 1;
704	rep.th.window  = htons(win);
705
706#ifdef CONFIG_TCP_MD5SIG
707	if (key) {
708		int offset = (ts) ? 3 : 0;
709
710		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
711					  (TCPOPT_NOP << 16) |
712					  (TCPOPT_MD5SIG << 8) |
713					  TCPOLEN_MD5SIG);
714		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
715		rep.th.doff = arg.iov[0].iov_len/4;
716
717		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
718				    key, ip_hdr(skb)->saddr,
719				    ip_hdr(skb)->daddr, &rep.th);
720	}
721#endif
722	arg.flags = reply_flags;
723	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
724				      ip_hdr(skb)->saddr, /* XXX */
725				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
726	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
727	if (oif)
728		arg.bound_dev_if = oif;
729
730	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
731		      &arg, arg.iov[0].iov_len);
732
733	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
734}
735
736static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
737{
738	struct inet_timewait_sock *tw = inet_twsk(sk);
739	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
740
741	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
742			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
743			tcptw->tw_ts_recent,
744			tw->tw_bound_dev_if,
745			tcp_twsk_md5_key(tcptw),
746			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
747			);
748
749	inet_twsk_put(tw);
750}
751
752static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
753				  struct request_sock *req)
754{
755	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
756			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
757			req->ts_recent,
758			0,
759			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
760			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
761}
762
763/*
764 *	Send a SYN-ACK after having received a SYN.
765 *	This still operates on a request_sock only, not on a big
766 *	socket.
767 */
768static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
769			      struct request_sock *req,
770			      struct request_values *rvp)
771{
772	const struct inet_request_sock *ireq = inet_rsk(req);
773	struct flowi4 fl4;
774	int err = -1;
775	struct sk_buff * skb;
776
777	/* First, grab a route. */
778	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
779		return -1;
780
781	skb = tcp_make_synack(sk, dst, req, rvp);
782
783	if (skb) {
784		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
785
786		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
787					    ireq->rmt_addr,
788					    ireq->opt);
789		err = net_xmit_eval(err);
790	}
791
792	dst_release(dst);
793	return err;
794}
795
796static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
797			      struct request_values *rvp)
798{
799	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
800	return tcp_v4_send_synack(sk, NULL, req, rvp);
801}
802
803/*
804 *	IPv4 request_sock destructor.
805 */
806static void tcp_v4_reqsk_destructor(struct request_sock *req)
807{
808	kfree(inet_rsk(req)->opt);
809}
810
811static void syn_flood_warning(const struct sk_buff *skb)
812{
813	const char *msg;
814
815#ifdef CONFIG_SYN_COOKIES
816	if (sysctl_tcp_syncookies)
817		msg = "Sending cookies";
818	else
819#endif
820		msg = "Dropping request";
821
822	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
823				ntohs(tcp_hdr(skb)->dest), msg);
824}
825
826/*
827 * Save and compile IPv4 options into the request_sock if needed.
828 */
829static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
830						  struct sk_buff *skb)
831{
832	const struct ip_options *opt = &(IPCB(skb)->opt);
833	struct ip_options_rcu *dopt = NULL;
834
835	if (opt && opt->optlen) {
836		int opt_size = sizeof(*dopt) + opt->optlen;
837
838		dopt = kmalloc(opt_size, GFP_ATOMIC);
839		if (dopt) {
840			if (ip_options_echo(&dopt->opt, skb)) {
841				kfree(dopt);
842				dopt = NULL;
843			}
844		}
845	}
846	return dopt;
847}
848
849#ifdef CONFIG_TCP_MD5SIG
850/*
851 * RFC2385 MD5 checksumming requires a mapping of
852 * IP address->MD5 Key.
853 * We need to maintain these in the sk structure.
854 */
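/*
 * Here that mapping is a flat array hanging off the tcp_sock:
 * md5sig_info holds keys4[] entries of {peer address, key, keylen}
 * together with entries4/alloced4 counters, and lookups are a simple
 * linear scan.
 */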
855
856/* Find the Key structure for an address.  */
857static struct tcp_md5sig_key *
858			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
859{
860	struct tcp_sock *tp = tcp_sk(sk);
861	int i;
862
863	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
864		return NULL;
865	for (i = 0; i < tp->md5sig_info->entries4; i++) {
866		if (tp->md5sig_info->keys4[i].addr == addr)
867			return &tp->md5sig_info->keys4[i].base;
868	}
869	return NULL;
870}
871
872struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
873					 struct sock *addr_sk)
874{
875	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
876}
877EXPORT_SYMBOL(tcp_v4_md5_lookup);
878
879static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
880						      struct request_sock *req)
881{
882	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
883}
884
885/* This can be called on a newly created socket, from other files */
886int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
887		      u8 *newkey, u8 newkeylen)
888{
889	/* Add Key to the list */
890	struct tcp_md5sig_key *key;
891	struct tcp_sock *tp = tcp_sk(sk);
892	struct tcp4_md5sig_key *keys;
893
894	key = tcp_v4_md5_do_lookup(sk, addr);
895	if (key) {
896		/* Pre-existing entry - just update that one. */
897		kfree(key->key);
898		key->key = newkey;
899		key->keylen = newkeylen;
900	} else {
901		struct tcp_md5sig_info *md5sig;
902
903		if (!tp->md5sig_info) {
904			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
905						  GFP_ATOMIC);
906			if (!tp->md5sig_info) {
907				kfree(newkey);
908				return -ENOMEM;
909			}
910			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
911		}
912		if (tcp_alloc_md5sig_pool(sk) == NULL) {
913			kfree(newkey);
914			return -ENOMEM;
915		}
916		md5sig = tp->md5sig_info;
917
918		if (md5sig->alloced4 == md5sig->entries4) {
919			keys = kmalloc((sizeof(*keys) *
920					(md5sig->entries4 + 1)), GFP_ATOMIC);
921			if (!keys) {
922				kfree(newkey);
923				tcp_free_md5sig_pool();
924				return -ENOMEM;
925			}
926
927			if (md5sig->entries4)
928				memcpy(keys, md5sig->keys4,
929				       sizeof(*keys) * md5sig->entries4);
930
931			/* Free old key list, and reference new one */
932			kfree(md5sig->keys4);
933			md5sig->keys4 = keys;
934			md5sig->alloced4++;
935		}
936		md5sig->entries4++;
937		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
938		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
939		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
940	}
941	return 0;
942}
943EXPORT_SYMBOL(tcp_v4_md5_do_add);
944
945static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
946			       u8 *newkey, u8 newkeylen)
947{
948	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
949				 newkey, newkeylen);
950}
951
952int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
953{
954	struct tcp_sock *tp = tcp_sk(sk);
955	int i;
956
957	for (i = 0; i < tp->md5sig_info->entries4; i++) {
958		if (tp->md5sig_info->keys4[i].addr == addr) {
959			/* Free the key */
960			kfree(tp->md5sig_info->keys4[i].base.key);
961			tp->md5sig_info->entries4--;
962
963			if (tp->md5sig_info->entries4 == 0) {
964				kfree(tp->md5sig_info->keys4);
965				tp->md5sig_info->keys4 = NULL;
966				tp->md5sig_info->alloced4 = 0;
967			} else if (tp->md5sig_info->entries4 != i) {
968				/* Close the gap left by the deleted entry */
969				memmove(&tp->md5sig_info->keys4[i],
970					&tp->md5sig_info->keys4[i+1],
971					(tp->md5sig_info->entries4 - i) *
972					 sizeof(struct tcp4_md5sig_key));
973			}
974			tcp_free_md5sig_pool();
975			return 0;
976		}
977	}
978	return -ENOENT;
979}
980EXPORT_SYMBOL(tcp_v4_md5_do_del);
981
982static void tcp_v4_clear_md5_list(struct sock *sk)
983{
984	struct tcp_sock *tp = tcp_sk(sk);
985
986	/* Free each key, then the key array itself,
987	 * and then drop our hold on the
988	 * md5sig crypto pool.
989	 */
990	if (tp->md5sig_info->entries4) {
991		int i;
992		for (i = 0; i < tp->md5sig_info->entries4; i++)
993			kfree(tp->md5sig_info->keys4[i].base.key);
994		tp->md5sig_info->entries4 = 0;
995		tcp_free_md5sig_pool();
996	}
997	if (tp->md5sig_info->keys4) {
998		kfree(tp->md5sig_info->keys4);
999		tp->md5sig_info->keys4 = NULL;
1000		tp->md5sig_info->alloced4  = 0;
1001	}
1002}
1003
1004static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1005				 int optlen)
1006{
1007	struct tcp_md5sig cmd;
1008	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1009	u8 *newkey;
1010
1011	if (optlen < sizeof(cmd))
1012		return -EINVAL;
1013
1014	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1015		return -EFAULT;
1016
1017	if (sin->sin_family != AF_INET)
1018		return -EINVAL;
1019
1020	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1021		if (!tcp_sk(sk)->md5sig_info)
1022			return -ENOENT;
1023		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1024	}
1025
1026	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1027		return -EINVAL;
1028
1029	if (!tcp_sk(sk)->md5sig_info) {
1030		struct tcp_sock *tp = tcp_sk(sk);
1031		struct tcp_md5sig_info *p;
1032
1033		p = kzalloc(sizeof(*p), sk->sk_allocation);
1034		if (!p)
1035			return -EINVAL;
1036
1037		tp->md5sig_info = p;
1038		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1039	}
1040
1041	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1042	if (!newkey)
1043		return -ENOMEM;
1044	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1045				 newkey, cmd.tcpm_keylen);
1046}
1047
1048static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1049					__be32 daddr, __be32 saddr, int nbytes)
1050{
1051	struct tcp4_pseudohdr *bp;
1052	struct scatterlist sg;
1053
1054	bp = &hp->md5_blk.ip4;
1055
1056	/*
1057	 * 1. the TCP pseudo-header (in the order: source IP address,
1058	 * destination IP address, zero-padded protocol number, and
1059	 * segment length)
1060	 */
1061	bp->saddr = saddr;
1062	bp->daddr = daddr;
1063	bp->pad = 0;
1064	bp->protocol = IPPROTO_TCP;
1065	bp->len = cpu_to_be16(nbytes);
1066
1067	sg_init_one(&sg, bp, sizeof(*bp));
1068	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1069}
1070
1071static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1072			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1073{
1074	struct tcp_md5sig_pool *hp;
1075	struct hash_desc *desc;
1076
1077	hp = tcp_get_md5sig_pool();
1078	if (!hp)
1079		goto clear_hash_noput;
1080	desc = &hp->md5_desc;
1081
1082	if (crypto_hash_init(desc))
1083		goto clear_hash;
1084	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1085		goto clear_hash;
1086	if (tcp_md5_hash_header(hp, th))
1087		goto clear_hash;
1088	if (tcp_md5_hash_key(hp, key))
1089		goto clear_hash;
1090	if (crypto_hash_final(desc, md5_hash))
1091		goto clear_hash;
1092
1093	tcp_put_md5sig_pool();
1094	return 0;
1095
1096clear_hash:
1097	tcp_put_md5sig_pool();
1098clear_hash_noput:
1099	memset(md5_hash, 0, 16);
1100	return 1;
1101}
1102
1103int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1104			struct sock *sk, struct request_sock *req,
1105			struct sk_buff *skb)
1106{
1107	struct tcp_md5sig_pool *hp;
1108	struct hash_desc *desc;
1109	struct tcphdr *th = tcp_hdr(skb);
1110	__be32 saddr, daddr;
1111
1112	if (sk) {
1113		saddr = inet_sk(sk)->inet_saddr;
1114		daddr = inet_sk(sk)->inet_daddr;
1115	} else if (req) {
1116		saddr = inet_rsk(req)->loc_addr;
1117		daddr = inet_rsk(req)->rmt_addr;
1118	} else {
1119		const struct iphdr *iph = ip_hdr(skb);
1120		saddr = iph->saddr;
1121		daddr = iph->daddr;
1122	}
1123
1124	hp = tcp_get_md5sig_pool();
1125	if (!hp)
1126		goto clear_hash_noput;
1127	desc = &hp->md5_desc;
1128
1129	if (crypto_hash_init(desc))
1130		goto clear_hash;
1131
1132	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1133		goto clear_hash;
1134	if (tcp_md5_hash_header(hp, th))
1135		goto clear_hash;
1136	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1137		goto clear_hash;
1138	if (tcp_md5_hash_key(hp, key))
1139		goto clear_hash;
1140	if (crypto_hash_final(desc, md5_hash))
1141		goto clear_hash;
1142
1143	tcp_put_md5sig_pool();
1144	return 0;
1145
1146clear_hash:
1147	tcp_put_md5sig_pool();
1148clear_hash_noput:
1149	memset(md5_hash, 0, 16);
1150	return 1;
1151}
1152EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1153
1154static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1155{
1156	/*
1157	 * This gets called for each TCP segment that arrives
1158	 * so we want to be efficient.
1159	 * We have 3 drop cases:
1160	 * o No MD5 hash and one expected.
1161	 * o MD5 hash and we're not expecting one.
1162	 * o MD5 hash and it's wrong.
1163	 */
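	/* The expected key is looked up by the segment's source address and
	 * the signature is recomputed over the pseudo-header, TCP header and
	 * payload via tcp_v4_md5_hash_skb(); a missing, unexpected or
	 * mismatching hash drops the segment and bumps the corresponding
	 * MIB counter.
	 */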
1164	__u8 *hash_location = NULL;
1165	struct tcp_md5sig_key *hash_expected;
1166	const struct iphdr *iph = ip_hdr(skb);
1167	struct tcphdr *th = tcp_hdr(skb);
1168	int genhash;
1169	unsigned char newhash[16];
1170
1171	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1172	hash_location = tcp_parse_md5sig_option(th);
1173
1174	/* We've parsed the options - do we have a hash? */
1175	if (!hash_expected && !hash_location)
1176		return 0;
1177
1178	if (hash_expected && !hash_location) {
1179		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1180		return 1;
1181	}
1182
1183	if (!hash_expected && hash_location) {
1184		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1185		return 1;
1186	}
1187
1188	/* Okay, so this is hash_expected and hash_location -
1189	 * so we need to calculate the hash.
1190	 */
1191	genhash = tcp_v4_md5_hash_skb(newhash,
1192				      hash_expected,
1193				      NULL, NULL, skb);
1194
1195	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1196		if (net_ratelimit()) {
1197			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1198			       &iph->saddr, ntohs(th->source),
1199			       &iph->daddr, ntohs(th->dest),
1200			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1201		}
1202		return 1;
1203	}
1204	return 0;
1205}
1206
1207#endif
1208
1209struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1210	.family		=	PF_INET,
1211	.obj_size	=	sizeof(struct tcp_request_sock),
1212	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1213	.send_ack	=	tcp_v4_reqsk_send_ack,
1214	.destructor	=	tcp_v4_reqsk_destructor,
1215	.send_reset	=	tcp_v4_send_reset,
1216	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1217};
1218
1219#ifdef CONFIG_TCP_MD5SIG
1220static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1221	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1222	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1223};
1224#endif
1225
1226int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1227{
1228	struct tcp_extend_values tmp_ext;
1229	struct tcp_options_received tmp_opt;
1230	u8 *hash_location;
1231	struct request_sock *req;
1232	struct inet_request_sock *ireq;
1233	struct tcp_sock *tp = tcp_sk(sk);
1234	struct dst_entry *dst = NULL;
1235	__be32 saddr = ip_hdr(skb)->saddr;
1236	__be32 daddr = ip_hdr(skb)->daddr;
1237	__u32 isn = TCP_SKB_CB(skb)->when;
1238#ifdef CONFIG_SYN_COOKIES
1239	int want_cookie = 0;
1240#else
1241#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1242#endif
1243
1244	/* Never answer SYNs sent to broadcast or multicast addresses */
1245	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1246		goto drop;
1247
1248	/* TW buckets are converted to open requests without
1249	 * limitation: they conserve resources, and the peer is
1250	 * evidently a real one.
1251	 */
1252	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1253		if (net_ratelimit())
1254			syn_flood_warning(skb);
1255#ifdef CONFIG_SYN_COOKIES
1256		if (sysctl_tcp_syncookies) {
1257			want_cookie = 1;
1258		} else
1259#endif
1260		goto drop;
1261	}
1262
1263	/* The accept backlog is full. If we have already queued enough
1264	 * warm entries in the syn queue, drop the request. That is better
1265	 * than clogging the syn queue with openreqs with exponentially
1266	 * increasing timeouts.
1267	 */
1268	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1269		goto drop;
1270
1271	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1272	if (!req)
1273		goto drop;
1274
1275#ifdef CONFIG_TCP_MD5SIG
1276	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1277#endif
1278
1279	tcp_clear_options(&tmp_opt);
1280	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1281	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1282	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1283
1284	if (tmp_opt.cookie_plus > 0 &&
1285	    tmp_opt.saw_tstamp &&
1286	    !tp->rx_opt.cookie_out_never &&
1287	    (sysctl_tcp_cookie_size > 0 ||
1288	     (tp->cookie_values != NULL &&
1289	      tp->cookie_values->cookie_desired > 0))) {
1290		u8 *c;
1291		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1292		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1293
1294		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1295			goto drop_and_release;
1296
1297		/* Secret recipe starts with IP addresses */
1298		*mess++ ^= (__force u32)daddr;
1299		*mess++ ^= (__force u32)saddr;
1300
1301		/* plus variable length Initiator Cookie */
1302		c = (u8 *)mess;
1303		while (l-- > 0)
1304			*c++ ^= *hash_location++;
1305
1306#ifdef CONFIG_SYN_COOKIES
1307		want_cookie = 0;	/* not our kind of cookie */
1308#endif
1309		tmp_ext.cookie_out_never = 0; /* false */
1310		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1311	} else if (!tp->rx_opt.cookie_in_always) {
1312		/* redundant indications, but ensure initialization. */
1313		tmp_ext.cookie_out_never = 1; /* true */
1314		tmp_ext.cookie_plus = 0;
1315	} else {
1316		goto drop_and_release;
1317	}
1318	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1319
1320	if (want_cookie && !tmp_opt.saw_tstamp)
1321		tcp_clear_options(&tmp_opt);
1322
1323	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1324	tcp_openreq_init(req, &tmp_opt, skb);
1325
1326	ireq = inet_rsk(req);
1327	ireq->loc_addr = daddr;
1328	ireq->rmt_addr = saddr;
1329	ireq->no_srccheck = inet_sk(sk)->transparent;
1330	ireq->opt = tcp_v4_save_options(sk, skb);
1331
1332	if (security_inet_conn_request(sk, skb, req))
1333		goto drop_and_free;
1334
1335	if (!want_cookie || tmp_opt.tstamp_ok)
1336		TCP_ECN_create_request(req, tcp_hdr(skb));
1337
1338	if (want_cookie) {
1339		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1340		req->cookie_ts = tmp_opt.tstamp_ok;
1341	} else if (!isn) {
1342		struct inet_peer *peer = NULL;
1343		struct flowi4 fl4;
1344
1345		/* VJ's idea. We save the last timestamp seen
1346		 * from the destination in the peer table when entering
1347		 * state TIME-WAIT, and check against it before
1348		 * accepting a new connection request.
1349		 *
1350		 * If "isn" is not zero, this request hit an alive
1351		 * timewait bucket, so all the necessary checks
1352		 * are made in the function processing the timewait state.
1353		 */
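		/* I.e. a PAWS check against the peer cache: if the cached
		 * timestamp is fresh (within TCP_PAWS_MSL seconds) and runs
		 * ahead of the timestamp carried in this SYN by more than
		 * TCP_PAWS_WINDOW, treat the SYN as an old duplicate and
		 * reject it.
		 */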
1354		if (tmp_opt.saw_tstamp &&
1355		    tcp_death_row.sysctl_tw_recycle &&
1356		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1357		    fl4.daddr == saddr &&
1358		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1359			inet_peer_refcheck(peer);
1360			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1361			    (s32)(peer->tcp_ts - req->ts_recent) >
1362							TCP_PAWS_WINDOW) {
1363				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1364				goto drop_and_release;
1365			}
1366		}
1367		/* Kill the following clause, if you dislike this way. */
1368		else if (!sysctl_tcp_syncookies &&
1369			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1370			  (sysctl_max_syn_backlog >> 2)) &&
1371			 (!peer || !peer->tcp_ts_stamp) &&
1372			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1373			/* Without syncookies, the last quarter of the
1374			 * backlog is reserved for destinations proven
1375			 * to be alive.
1376			 * It means that we keep communicating with
1377			 * destinations that were already remembered
1378			 * by the moment of the synflood.
1379			 */
1380			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1381				       &saddr, ntohs(tcp_hdr(skb)->source));
1382			goto drop_and_release;
1383		}
1384
1385		isn = tcp_v4_init_sequence(skb);
1386	}
1387	tcp_rsk(req)->snt_isn = isn;
1388	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1389
1390	if (tcp_v4_send_synack(sk, dst, req,
1391			       (struct request_values *)&tmp_ext) ||
1392	    want_cookie)
1393		goto drop_and_free;
1394
1395	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1396	return 0;
1397
1398drop_and_release:
1399	dst_release(dst);
1400drop_and_free:
1401	reqsk_free(req);
1402drop:
1403	return 0;
1404}
1405EXPORT_SYMBOL(tcp_v4_conn_request);
1406
1407
1408/*
1409 * The three way handshake has completed - we got a valid synack -
1410 * now create the new socket.
1411 */
1412struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1413				  struct request_sock *req,
1414				  struct dst_entry *dst)
1415{
1416	struct inet_request_sock *ireq;
1417	struct inet_sock *newinet;
1418	struct tcp_sock *newtp;
1419	struct sock *newsk;
1420#ifdef CONFIG_TCP_MD5SIG
1421	struct tcp_md5sig_key *key;
1422#endif
1423	struct ip_options_rcu *inet_opt;
1424
1425	if (sk_acceptq_is_full(sk))
1426		goto exit_overflow;
1427
1428	newsk = tcp_create_openreq_child(sk, req, skb);
1429	if (!newsk)
1430		goto exit_nonewsk;
1431
1432	newsk->sk_gso_type = SKB_GSO_TCPV4;
1433
1434	newtp		      = tcp_sk(newsk);
1435	newinet		      = inet_sk(newsk);
1436	ireq		      = inet_rsk(req);
1437	newinet->inet_daddr   = ireq->rmt_addr;
1438	newinet->inet_rcv_saddr = ireq->loc_addr;
1439	newinet->inet_saddr	      = ireq->loc_addr;
1440	inet_opt	      = ireq->opt;
1441	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1442	ireq->opt	      = NULL;
1443	newinet->mc_index     = inet_iif(skb);
1444	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1445	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1446	if (inet_opt)
1447		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1448	newinet->inet_id = newtp->write_seq ^ jiffies;
1449
1450	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1451		goto put_and_exit;
1452
1453	sk_setup_caps(newsk, dst);
1454
1455	tcp_mtup_init(newsk);
1456	tcp_sync_mss(newsk, dst_mtu(dst));
1457	newtp->advmss = dst_metric_advmss(dst);
1458	if (tcp_sk(sk)->rx_opt.user_mss &&
1459	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1460		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1461
1462	tcp_initialize_rcv_mss(newsk);
1463	if (tcp_rsk(req)->snt_synack)
1464		tcp_valid_rtt_meas(newsk,
1465		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1466	newtp->total_retrans = req->retrans;
1467
1468#ifdef CONFIG_TCP_MD5SIG
1469	/* Copy over the MD5 key from the original socket */
1470	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1471	if (key != NULL) {
1472		/*
1473		 * We're using one, so create a matching key
1474		 * on the newsk structure. If we fail to get
1475		 * memory, then we end up not copying the key
1476		 * across. Shucks.
1477		 */
1478		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1479		if (newkey != NULL)
1480			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1481					  newkey, key->keylen);
1482		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483	}
1484#endif
1485
1486	if (__inet_inherit_port(sk, newsk) < 0)
1487		goto put_and_exit;
1488	__inet_hash_nolisten(newsk, NULL);
1489
1490	return newsk;
1491
1492exit_overflow:
1493	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1494exit_nonewsk:
1495	dst_release(dst);
1496exit:
1497	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1498	return NULL;
1499put_and_exit:
1500	sock_put(newsk);
1501	goto exit;
1502}
1503EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1504
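/*
 * Resolve a segment arriving on a listening socket: a matching pending
 * request is handed to tcp_check_req(), an already established child is
 * returned locked, and for non-SYN segments with no pending request we
 * fall back to SYN-cookie validation via cookie_v4_check() when enabled.
 */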
1505static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1506{
1507	struct tcphdr *th = tcp_hdr(skb);
1508	const struct iphdr *iph = ip_hdr(skb);
1509	struct sock *nsk;
1510	struct request_sock **prev;
1511	/* Find possible connection requests. */
1512	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1513						       iph->saddr, iph->daddr);
1514	if (req)
1515		return tcp_check_req(sk, skb, req, prev);
1516
1517	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1518			th->source, iph->daddr, th->dest, inet_iif(skb));
1519
1520	if (nsk) {
1521		if (nsk->sk_state != TCP_TIME_WAIT) {
1522			bh_lock_sock(nsk);
1523			return nsk;
1524		}
1525		inet_twsk_put(inet_twsk(nsk));
1526		return NULL;
1527	}
1528
1529#ifdef CONFIG_SYN_COOKIES
1530	if (!th->syn)
1531		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1532#endif
1533	return sk;
1534}
1535
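/*
 * Checksum strategy: trust CHECKSUM_COMPLETE from the driver if it
 * verifies against the pseudo-header; otherwise seed skb->csum with the
 * pseudo-header sum, verify short packets (<= 76 bytes) right away and
 * leave longer ones to be checked when their data is actually touched.
 */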
1536static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1537{
1538	const struct iphdr *iph = ip_hdr(skb);
1539
1540	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1541		if (!tcp_v4_check(skb->len, iph->saddr,
1542				  iph->daddr, skb->csum)) {
1543			skb->ip_summed = CHECKSUM_UNNECESSARY;
1544			return 0;
1545		}
1546	}
1547
1548	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1549				       skb->len, IPPROTO_TCP, 0);
1550
1551	if (skb->len <= 76) {
1552		return __skb_checksum_complete(skb);
1553	}
1554	return 0;
1555}
1556
1557
1558/* The socket must have its spinlock held when we get
1559 * here.
1560 *
1561 * We have a potential double-lock case here, so even when
1562 * doing backlog processing we use the BH locking scheme.
1563 * This is because we cannot sleep with the original spinlock
1564 * held.
1565 */
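/*
 * Dispatch: ESTABLISHED sockets take the tcp_rcv_established() fast path;
 * listeners go through tcp_v4_hnd_req(), with a freshly created child
 * processed via tcp_child_process(); everything else lands in
 * tcp_rcv_state_process(). Any of these can ask for a RST (rsk).
 */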
1566int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1567{
1568	struct sock *rsk;
1569#ifdef CONFIG_TCP_MD5SIG
1570	/*
1571	 * We really want to reject the packet as early as possible
1572	 * if:
1573	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1574	 *  o There is an MD5 option and we're not expecting one
1575	 */
1576	if (tcp_v4_inbound_md5_hash(sk, skb))
1577		goto discard;
1578#endif
1579
1580	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1581		sock_rps_save_rxhash(sk, skb);
1582		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1583			rsk = sk;
1584			goto reset;
1585		}
1586		return 0;
1587	}
1588
1589	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1590		goto csum_err;
1591
1592	if (sk->sk_state == TCP_LISTEN) {
1593		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1594		if (!nsk)
1595			goto discard;
1596
1597		if (nsk != sk) {
1598			sock_rps_save_rxhash(nsk, skb);
1599			if (tcp_child_process(sk, nsk, skb)) {
1600				rsk = nsk;
1601				goto reset;
1602			}
1603			return 0;
1604		}
1605	} else
1606		sock_rps_save_rxhash(sk, skb);
1607
1608	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1609		rsk = sk;
1610		goto reset;
1611	}
1612	return 0;
1613
1614reset:
1615	tcp_v4_send_reset(rsk, skb);
1616discard:
1617	kfree_skb(skb);
1618	/* Be careful here. If this function gets more complicated and
1619	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1620	 * might be destroyed here. This current version compiles correctly,
1621	 * but you have been warned.
1622	 */
1623	return 0;
1624
1625csum_err:
1626	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1627	goto discard;
1628}
1629EXPORT_SYMBOL(tcp_v4_do_rcv);
1630
1631/*
1632 *	From tcp_input.c
1633 */
1634
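/*
 * Main receive entry point: validate the header and checksum, fill in
 * TCP_SKB_CB(), look the socket up by the 4-tuple, and then either
 * process the segment under the socket lock (possibly via the prequeue
 * or NET_DMA paths) or append it to the backlog when the socket is owned
 * by user context. TIME-WAIT sockets are handled separately through
 * tcp_timewait_state_process().
 */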
1635int tcp_v4_rcv(struct sk_buff *skb)
1636{
1637	const struct iphdr *iph;
1638	struct tcphdr *th;
1639	struct sock *sk;
1640	int ret;
1641	struct net *net = dev_net(skb->dev);
1642
1643	if (skb->pkt_type != PACKET_HOST)
1644		goto discard_it;
1645
1646	/* Count it even if it's bad */
1647	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1648
1649	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1650		goto discard_it;
1651
1652	th = tcp_hdr(skb);
1653
1654	if (th->doff < sizeof(struct tcphdr) / 4)
1655		goto bad_packet;
1656	if (!pskb_may_pull(skb, th->doff * 4))
1657		goto discard_it;
1658
1659	/* An explanation is required here, I think.
1660	 * Packet length and doff are validated by header prediction,
1661	 * provided the case of th->doff == 0 is eliminated.
1662	 * So, we defer the checks. */
1663	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1664		goto bad_packet;
1665
1666	th = tcp_hdr(skb);
1667	iph = ip_hdr(skb);
1668	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1669	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1670				    skb->len - th->doff * 4);
1671	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1672	TCP_SKB_CB(skb)->when	 = 0;
1673	TCP_SKB_CB(skb)->flags	 = iph->tos;
1674	TCP_SKB_CB(skb)->sacked	 = 0;
1675
1676	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1677	if (!sk)
1678		goto no_tcp_socket;
1679
1680process:
1681	if (sk->sk_state == TCP_TIME_WAIT)
1682		goto do_time_wait;
1683
1684	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1685		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1686		goto discard_and_relse;
1687	}
1688
1689	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1690		goto discard_and_relse;
1691	nf_reset(skb);
1692
1693	if (sk_filter(sk, skb))
1694		goto discard_and_relse;
1695
1696	skb->dev = NULL;
1697
1698	bh_lock_sock_nested(sk);
1699	ret = 0;
1700	if (!sock_owned_by_user(sk)) {
1701#ifdef CONFIG_NET_DMA
1702		struct tcp_sock *tp = tcp_sk(sk);
1703		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1704			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1705		if (tp->ucopy.dma_chan)
1706			ret = tcp_v4_do_rcv(sk, skb);
1707		else
1708#endif
1709		{
1710			if (!tcp_prequeue(sk, skb))
1711				ret = tcp_v4_do_rcv(sk, skb);
1712		}
1713	} else if (unlikely(sk_add_backlog(sk, skb))) {
1714		bh_unlock_sock(sk);
1715		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1716		goto discard_and_relse;
1717	}
1718	bh_unlock_sock(sk);
1719
1720	sock_put(sk);
1721
1722	return ret;
1723
1724no_tcp_socket:
1725	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1726		goto discard_it;
1727
1728	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1729bad_packet:
1730		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1731	} else {
1732		tcp_v4_send_reset(NULL, skb);
1733	}
1734
1735discard_it:
1736	/* Discard frame. */
1737	kfree_skb(skb);
1738	return 0;
1739
1740discard_and_relse:
1741	sock_put(sk);
1742	goto discard_it;
1743
1744do_time_wait:
1745	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1746		inet_twsk_put(inet_twsk(sk));
1747		goto discard_it;
1748	}
1749
1750	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1751		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1752		inet_twsk_put(inet_twsk(sk));
1753		goto discard_it;
1754	}
1755	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1756	case TCP_TW_SYN: {
1757		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1758							&tcp_hashinfo,
1759							iph->daddr, th->dest,
1760							inet_iif(skb));
1761		if (sk2) {
1762			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1763			inet_twsk_put(inet_twsk(sk));
1764			sk = sk2;
1765			goto process;
1766		}
1767		/* Fall through to ACK */
1768	}
1769	case TCP_TW_ACK:
1770		tcp_v4_timewait_ack(sk, skb);
1771		break;
1772	case TCP_TW_RST:
1773		goto no_tcp_socket;
1774	case TCP_TW_SUCCESS:;
1775	}
1776	goto discard_it;
1777}
1778
1779struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1780{
1781	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1782	struct inet_sock *inet = inet_sk(sk);
1783	struct inet_peer *peer;
1784
1785	if (!rt ||
1786	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1787		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1788		*release_it = true;
1789	} else {
1790		if (!rt->peer)
1791			rt_bind_peer(rt, inet->inet_daddr, 1);
1792		peer = rt->peer;
1793		*release_it = false;
1794	}
1795
1796	return peer;
1797}
1798EXPORT_SYMBOL(tcp_v4_get_peer);
1799
1800void *tcp_v4_tw_get_peer(struct sock *sk)
1801{
1802	struct inet_timewait_sock *tw = inet_twsk(sk);
1803
1804	return inet_getpeer_v4(tw->tw_daddr, 1);
1805}
1806EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1807
1808static struct timewait_sock_ops tcp_timewait_sock_ops = {
1809	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1810	.twsk_unique	= tcp_twsk_unique,
1811	.twsk_destructor= tcp_twsk_destructor,
1812	.twsk_getpeer	= tcp_v4_tw_get_peer,
1813};
1814
1815const struct inet_connection_sock_af_ops ipv4_specific = {
1816	.queue_xmit	   = ip_queue_xmit,
1817	.send_check	   = tcp_v4_send_check,
1818	.rebuild_header	   = inet_sk_rebuild_header,
1819	.conn_request	   = tcp_v4_conn_request,
1820	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1821	.get_peer	   = tcp_v4_get_peer,
1822	.net_header_len	   = sizeof(struct iphdr),
1823	.setsockopt	   = ip_setsockopt,
1824	.getsockopt	   = ip_getsockopt,
1825	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1826	.sockaddr_len	   = sizeof(struct sockaddr_in),
1827	.bind_conflict	   = inet_csk_bind_conflict,
1828#ifdef CONFIG_COMPAT
1829	.compat_setsockopt = compat_ip_setsockopt,
1830	.compat_getsockopt = compat_ip_getsockopt,
1831#endif
1832};
1833EXPORT_SYMBOL(ipv4_specific);
1834
1835#ifdef CONFIG_TCP_MD5SIG
1836static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1837	.md5_lookup		= tcp_v4_md5_lookup,
1838	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1839	.md5_add		= tcp_v4_md5_add_func,
1840	.md5_parse		= tcp_v4_parse_md5_keys,
1841};
1842#endif
1843
1844/* NOTE: A lot of things are set to zero explicitly by the call to
1845 *       sk_alloc(), so they need not be done here.
1846 */
1847static int tcp_v4_init_sock(struct sock *sk)
1848{
1849	struct inet_connection_sock *icsk = inet_csk(sk);
1850	struct tcp_sock *tp = tcp_sk(sk);
1851
1852	skb_queue_head_init(&tp->out_of_order_queue);
1853	tcp_init_xmit_timers(sk);
1854	tcp_prequeue_init(tp);
1855
1856	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1857	tp->mdev = TCP_TIMEOUT_INIT;
1858
1859	/* So many TCP implementations out there (incorrectly) count the
1860	 * initial SYN frame in their delayed-ACK and congestion control
1861	 * algorithms that we must have the following bandaid to talk
1862	 * efficiently to them.  -DaveM
1863	 */
1864	tp->snd_cwnd = TCP_INIT_CWND;
1865
1866	/* See draft-stevens-tcpca-spec-01 for discussion of the
1867	 * initialization of these values.
1868	 */
1869	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1870	tp->snd_cwnd_clamp = ~0;
1871	tp->mss_cache = TCP_MSS_DEFAULT;
1872
1873	tp->reordering = sysctl_tcp_reordering;
1874	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1875
1876	sk->sk_state = TCP_CLOSE;
1877
1878	sk->sk_write_space = sk_stream_write_space;
1879	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1880
1881	icsk->icsk_af_ops = &ipv4_specific;
1882	icsk->icsk_sync_mss = tcp_sync_mss;
1883#ifdef CONFIG_TCP_MD5SIG
1884	tp->af_specific = &tcp_sock_ipv4_specific;
1885#endif
1886
1887	/* TCP Cookie Transactions */
1888	if (sysctl_tcp_cookie_size > 0) {
1889		/* Default, cookies without s_data_payload. */
1890		tp->cookie_values =
1891			kzalloc(sizeof(*tp->cookie_values),
1892				sk->sk_allocation);
1893		if (tp->cookie_values != NULL)
1894			kref_init(&tp->cookie_values->kref);
1895	}
1896	/* Presumed zeroed, in order of appearance:
1897	 *	cookie_in_always, cookie_out_never,
1898	 *	s_data_constant, s_data_in, s_data_out
1899	 */
1900	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1901	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1902
1903	local_bh_disable();
1904	percpu_counter_inc(&tcp_sockets_allocated);
1905	local_bh_enable();
1906
1907	return 0;
1908}
1909
1910void tcp_v4_destroy_sock(struct sock *sk)
1911{
1912	struct tcp_sock *tp = tcp_sk(sk);
1913
1914	tcp_clear_xmit_timers(sk);
1915
1916	tcp_cleanup_congestion_control(sk);
1917
1918	/* Clean up the write buffer. */
1919	tcp_write_queue_purge(sk);
1920
1921	/* Cleans up our, hopefully empty, out_of_order_queue. */
1922	__skb_queue_purge(&tp->out_of_order_queue);
1923
1924#ifdef CONFIG_TCP_MD5SIG
1925	/* Clean up the MD5 key list, if any */
1926	if (tp->md5sig_info) {
1927		tcp_v4_clear_md5_list(sk);
1928		kfree(tp->md5sig_info);
1929		tp->md5sig_info = NULL;
1930	}
1931#endif
1932
1933#ifdef CONFIG_NET_DMA
1934	/* Cleans up our sk_async_wait_queue */
1935	__skb_queue_purge(&sk->sk_async_wait_queue);
1936#endif
1937
1938	/* Clean up the prequeue; it really should be empty by now. */
1939	__skb_queue_purge(&tp->ucopy.prequeue);
1940
1941	/* Clean up a referenced TCP bind bucket. */
1942	if (inet_csk(sk)->icsk_bind_hash)
1943		inet_put_port(sk);
1944
1945	/*
1946	 * If a cached sendmsg page exists, free it.
1947	 */
1948	if (sk->sk_sndmsg_page) {
1949		__free_page(sk->sk_sndmsg_page);
1950		sk->sk_sndmsg_page = NULL;
1951	}
1952
1953	/* TCP Cookie Transactions */
1954	if (tp->cookie_values != NULL) {
1955		kref_put(&tp->cookie_values->kref,
1956			 tcp_cookie_values_release);
1957		tp->cookie_values = NULL;
1958	}
1959
1960	percpu_counter_dec(&tcp_sockets_allocated);
1961}
1962EXPORT_SYMBOL(tcp_v4_destroy_sock);
1963
1964#ifdef CONFIG_PROC_FS
1965/* Proc filesystem TCP sock list dumping. */
1966
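/* Helpers for walking a nulls-terminated chain of TIME_WAIT sockets. */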
1967static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1968{
1969	return hlist_nulls_empty(head) ? NULL :
1970		hlist_nulls_entry(head->first, struct inet_timewait_sock, tw_node);
1971}
1972
1973static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1974{
1975	return !is_a_nulls(tw->tw_node.next) ?
1976		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1977}
1978
1979/*
1980 * Get the next listening socket following cur.  If cur is NULL, get the
1981 * first socket starting from the bucket given in st->bucket; when
1982 * st->bucket is zero the very first socket in the hash table is returned.
1983 */
1984static void *listening_get_next(struct seq_file *seq, void *cur)
1985{
1986	struct inet_connection_sock *icsk;
1987	struct hlist_nulls_node *node;
1988	struct sock *sk = cur;
1989	struct inet_listen_hashbucket *ilb;
1990	struct tcp_iter_state *st = seq->private;
1991	struct net *net = seq_file_net(seq);
1992
1993	if (!sk) {
1994		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1995		spin_lock_bh(&ilb->lock);
1996		sk = sk_nulls_head(&ilb->head);
1997		st->offset = 0;
1998		goto get_sk;
1999	}
2000	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2001	++st->num;
2002	++st->offset;
2003
2004	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2005		struct request_sock *req = cur;
2006
2007		icsk = inet_csk(st->syn_wait_sk);
2008		req = req->dl_next;
2009		while (1) {
2010			while (req) {
2011				if (req->rsk_ops->family == st->family) {
2012					cur = req;
2013					goto out;
2014				}
2015				req = req->dl_next;
2016			}
2017			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2018				break;
2019get_req:
2020			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2021		}
2022		sk	  = sk_nulls_next(st->syn_wait_sk);
2023		st->state = TCP_SEQ_STATE_LISTENING;
2024		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2025	} else {
2026		icsk = inet_csk(sk);
2027		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2028		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2029			goto start_req;
2030		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031		sk = sk_nulls_next(sk);
2032	}
2033get_sk:
2034	sk_nulls_for_each_from(sk, node) {
2035		if (!net_eq(sock_net(sk), net))
2036			continue;
2037		if (sk->sk_family == st->family) {
2038			cur = sk;
2039			goto out;
2040		}
2041		icsk = inet_csk(sk);
2042		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2043		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2044start_req:
2045			st->uid		= sock_i_uid(sk);
2046			st->syn_wait_sk = sk;
2047			st->state	= TCP_SEQ_STATE_OPENREQ;
2048			st->sbucket	= 0;
2049			goto get_req;
2050		}
2051		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2052	}
2053	spin_unlock_bh(&ilb->lock);
2054	st->offset = 0;
2055	if (++st->bucket < INET_LHTABLE_SIZE) {
2056		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2057		spin_lock_bh(&ilb->lock);
2058		sk = sk_nulls_head(&ilb->head);
2059		goto get_sk;
2060	}
2061	cur = NULL;
2062out:
2063	return cur;
2064}
2065
2066static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2067{
2068	struct tcp_iter_state *st = seq->private;
2069	void *rc;
2070
2071	st->bucket = 0;
2072	st->offset = 0;
2073	rc = listening_get_next(seq, NULL);
2074
2075	while (rc && *pos) {
2076		rc = listening_get_next(seq, rc);
2077		--*pos;
2078	}
2079	return rc;
2080}
2081
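/* True if this ehash bucket holds neither established nor TIME_WAIT sockets. */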
2082static inline int empty_bucket(struct tcp_iter_state *st)
2083{
2084	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2085		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2086}
2087
2088/*
2089 * Get the first established socket, starting from the bucket given in
2090 * st->bucket.  If st->bucket is zero, the very first socket in the hash
2091 * is returned.
2092 */
2092static void *established_get_first(struct seq_file *seq)
2093{
2094	struct tcp_iter_state *st = seq->private;
2095	struct net *net = seq_file_net(seq);
2096	void *rc = NULL;
2097
2098	st->offset = 0;
2099	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2100		struct sock *sk;
2101		struct hlist_nulls_node *node;
2102		struct inet_timewait_sock *tw;
2103		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2104
2105		/* Lockless fast path for the common case of empty buckets */
2106		if (empty_bucket(st))
2107			continue;
2108
2109		spin_lock_bh(lock);
2110		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2111			if (sk->sk_family != st->family ||
2112			    !net_eq(sock_net(sk), net)) {
2113				continue;
2114			}
2115			rc = sk;
2116			goto out;
2117		}
2118		st->state = TCP_SEQ_STATE_TIME_WAIT;
2119		inet_twsk_for_each(tw, node,
2120				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2121			if (tw->tw_family != st->family ||
2122			    !net_eq(twsk_net(tw), net)) {
2123				continue;
2124			}
2125			rc = tw;
2126			goto out;
2127		}
2128		spin_unlock_bh(lock);
2129		st->state = TCP_SEQ_STATE_ESTABLISHED;
2130	}
2131out:
2132	return rc;
2133}
2134
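/*
 * Get the next established (or TIME_WAIT) socket following cur, moving on
 * to the next non-empty ehash bucket once the current chains are exhausted.
 */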
2135static void *established_get_next(struct seq_file *seq, void *cur)
2136{
2137	struct sock *sk = cur;
2138	struct inet_timewait_sock *tw;
2139	struct hlist_nulls_node *node;
2140	struct tcp_iter_state *st = seq->private;
2141	struct net *net = seq_file_net(seq);
2142
2143	++st->num;
2144	++st->offset;
2145
2146	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2147		tw = cur;
2148		tw = tw_next(tw);
2149get_tw:
2150		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2151			tw = tw_next(tw);
2152		}
2153		if (tw) {
2154			cur = tw;
2155			goto out;
2156		}
2157		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2158		st->state = TCP_SEQ_STATE_ESTABLISHED;
2159
2160		/* Look for the next non-empty bucket */
2161		st->offset = 0;
2162		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2163				empty_bucket(st))
2164			;
2165		if (st->bucket > tcp_hashinfo.ehash_mask)
2166			return NULL;
2167
2168		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2169		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2170	} else
2171		sk = sk_nulls_next(sk);
2172
2173	sk_nulls_for_each_from(sk, node) {
2174		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2175			goto found;
2176	}
2177
2178	st->state = TCP_SEQ_STATE_TIME_WAIT;
2179	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2180	goto get_tw;
2181found:
2182	cur = sk;
2183out:
2184	return cur;
2185}
2186
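/* Return the pos'th entry of the established hash, counting from bucket 0. */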
2187static void *established_get_idx(struct seq_file *seq, loff_t pos)
2188{
2189	struct tcp_iter_state *st = seq->private;
2190	void *rc;
2191
2192	st->bucket = 0;
2193	rc = established_get_first(seq);
2194
2195	while (rc && pos) {
2196		rc = established_get_next(seq, rc);
2197		--pos;
2198	}
2199	return rc;
2200}
2201
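/*
 * Return the pos'th entry overall: listening sockets first, then the
 * established and TIME_WAIT ones.
 */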
2202static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2203{
2204	void *rc;
2205	struct tcp_iter_state *st = seq->private;
2206
2207	st->state = TCP_SEQ_STATE_LISTENING;
2208	rc	  = listening_get_idx(seq, &pos);
2209
2210	if (!rc) {
2211		st->state = TCP_SEQ_STATE_ESTABLISHED;
2212		rc	  = established_get_idx(seq, pos);
2213	}
2214
2215	return rc;
2216}
2217
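/*
 * Resume the walk at the bucket/offset remembered from the previous
 * iteration instead of rescanning the hash tables from the beginning.
 */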
2218static void *tcp_seek_last_pos(struct seq_file *seq)
2219{
2220	struct tcp_iter_state *st = seq->private;
2221	int offset = st->offset;
2222	int orig_num = st->num;
2223	void *rc = NULL;
2224
2225	switch (st->state) {
2226	case TCP_SEQ_STATE_OPENREQ:
2227	case TCP_SEQ_STATE_LISTENING:
2228		if (st->bucket >= INET_LHTABLE_SIZE)
2229			break;
2230		st->state = TCP_SEQ_STATE_LISTENING;
2231		rc = listening_get_next(seq, NULL);
2232		while (offset-- && rc)
2233			rc = listening_get_next(seq, rc);
2234		if (rc)
2235			break;
2236		st->bucket = 0;
2237		/* Fallthrough */
2238	case TCP_SEQ_STATE_ESTABLISHED:
2239	case TCP_SEQ_STATE_TIME_WAIT:
2240		st->state = TCP_SEQ_STATE_ESTABLISHED;
2241		if (st->bucket > tcp_hashinfo.ehash_mask)
2242			break;
2243		rc = established_get_first(seq);
2244		while (offset-- && rc)
2245			rc = established_get_next(seq, rc);
2246	}
2247
2248	st->num = orig_num;
2249
2250	return rc;
2251}
2252
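/*
 * seq_file ->start(): reuse the remembered position when *pos matches
 * st->last_pos, otherwise restart the walk from the requested offset.
 */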
2253static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2254{
2255	struct tcp_iter_state *st = seq->private;
2256	void *rc;
2257
2258	if (*pos && *pos == st->last_pos) {
2259		rc = tcp_seek_last_pos(seq);
2260		if (rc)
2261			goto out;
2262	}
2263
2264	st->state = TCP_SEQ_STATE_LISTENING;
2265	st->num = 0;
2266	st->bucket = 0;
2267	st->offset = 0;
2268	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2269
2270out:
2271	st->last_pos = *pos;
2272	return rc;
2273}
2274
2275static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2276{
2277	struct tcp_iter_state *st = seq->private;
2278	void *rc = NULL;
2279
2280	if (v == SEQ_START_TOKEN) {
2281		rc = tcp_get_idx(seq, 0);
2282		goto out;
2283	}
2284
2285	switch (st->state) {
2286	case TCP_SEQ_STATE_OPENREQ:
2287	case TCP_SEQ_STATE_LISTENING:
2288		rc = listening_get_next(seq, v);
2289		if (!rc) {
2290			st->state = TCP_SEQ_STATE_ESTABLISHED;
2291			st->bucket = 0;
2292			st->offset = 0;
2293			rc	  = established_get_first(seq);
2294		}
2295		break;
2296	case TCP_SEQ_STATE_ESTABLISHED:
2297	case TCP_SEQ_STATE_TIME_WAIT:
2298		rc = established_get_next(seq, v);
2299		break;
2300	}
2301out:
2302	++*pos;
2303	st->last_pos = *pos;
2304	return rc;
2305}
2306
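/* seq_file ->stop(): drop whichever lock the walk is currently holding. */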
2307static void tcp_seq_stop(struct seq_file *seq, void *v)
2308{
2309	struct tcp_iter_state *st = seq->private;
2310
2311	switch (st->state) {
2312	case TCP_SEQ_STATE_OPENREQ:
2313		if (v) {
2314			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2315			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2316		}
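		/* Fallthrough: the listening bucket lock is released below */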
2317	case TCP_SEQ_STATE_LISTENING:
2318		if (v != SEQ_START_TOKEN)
2319			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2320		break;
2321	case TCP_SEQ_STATE_TIME_WAIT:
2322	case TCP_SEQ_STATE_ESTABLISHED:
2323		if (v)
2324			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2325		break;
2326	}
2327}
2328
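/* ->open() for /proc/net/tcp*: set up the per-file iterator state. */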
2329static int tcp_seq_open(struct inode *inode, struct file *file)
2330{
2331	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2332	struct tcp_iter_state *s;
2333	int err;
2334
2335	err = seq_open_net(inode, file, &afinfo->seq_ops,
2336			  sizeof(struct tcp_iter_state));
2337	if (err < 0)
2338		return err;
2339
2340	s = ((struct seq_file *)file->private_data)->private;
2341	s->family		= afinfo->family;
2342	s->last_pos 		= 0;
2343	return 0;
2344}
2345
2346int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2347{
2348	int rc = 0;
2349	struct proc_dir_entry *p;
2350
2351	afinfo->seq_fops.open		= tcp_seq_open;
2352	afinfo->seq_fops.read		= seq_read;
2353	afinfo->seq_fops.llseek		= seq_lseek;
2354	afinfo->seq_fops.release	= seq_release_net;
2355
2356	afinfo->seq_ops.start		= tcp_seq_start;
2357	afinfo->seq_ops.next		= tcp_seq_next;
2358	afinfo->seq_ops.stop		= tcp_seq_stop;
2359
2360	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2361			     &afinfo->seq_fops, afinfo);
2362	if (!p)
2363		rc = -ENOMEM;
2364	return rc;
2365}
2366EXPORT_SYMBOL(tcp_proc_register);
2367
2368void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2369{
2370	proc_net_remove(net, afinfo->name);
2371}
2372EXPORT_SYMBOL(tcp_proc_unregister);
2373
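/* Format one SYN_RECV request socket as a line of /proc/net/tcp. */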
2374static void get_openreq4(struct sock *sk, struct request_sock *req,
2375			 struct seq_file *f, int i, int uid, int *len)
2376{
2377	const struct inet_request_sock *ireq = inet_rsk(req);
2378	int ttd = req->expires - jiffies;
2379
2380	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2381		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2382		i,
2383		ireq->loc_addr,
2384		ntohs(inet_sk(sk)->inet_sport),
2385		ireq->rmt_addr,
2386		ntohs(ireq->rmt_port),
2387		TCP_SYN_RECV,
2388		0, 0, /* could print option size, but that is af dependent. */
2389		1,    /* timers active (only the expire timer) */
2390		jiffies_to_clock_t(ttd),
2391		req->retrans,
2392		uid,
2393		0,  /* non standard timer */
2394		0, /* open_requests have no inode */
2395		atomic_read(&sk->sk_refcnt),
2396		req,
2397		len);
2398}
2399
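/* Format one full TCP socket as a line of /proc/net/tcp. */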
2400static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2401{
2402	int timer_active;
2403	unsigned long timer_expires;
2404	struct tcp_sock *tp = tcp_sk(sk);
2405	const struct inet_connection_sock *icsk = inet_csk(sk);
2406	struct inet_sock *inet = inet_sk(sk);
2407	__be32 dest = inet->inet_daddr;
2408	__be32 src = inet->inet_rcv_saddr;
2409	__u16 destp = ntohs(inet->inet_dport);
2410	__u16 srcp = ntohs(inet->inet_sport);
2411	int rx_queue;
2412
2413	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2414		timer_active	= 1;
2415		timer_expires	= icsk->icsk_timeout;
2416	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2417		timer_active	= 4;
2418		timer_expires	= icsk->icsk_timeout;
2419	} else if (timer_pending(&sk->sk_timer)) {
2420		timer_active	= 2;
2421		timer_expires	= sk->sk_timer.expires;
2422	} else {
2423		timer_active	= 0;
2424		timer_expires	= jiffies;
2425	}
2426
2427	if (sk->sk_state == TCP_LISTEN)
2428		rx_queue = sk->sk_ack_backlog;
2429	else
2430		/*
2431		 * Because we don't lock the socket, we might find a transient negative value.
2432		 */
2433		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2434
2435	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2436			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2437		i, src, srcp, dest, destp, sk->sk_state,
2438		tp->write_seq - tp->snd_una,
2439		rx_queue,
2440		timer_active,
2441		jiffies_to_clock_t(timer_expires - jiffies),
2442		icsk->icsk_retransmits,
2443		sock_i_uid(sk),
2444		icsk->icsk_probes_out,
2445		sock_i_ino(sk),
2446		atomic_read(&sk->sk_refcnt), sk,
2447		jiffies_to_clock_t(icsk->icsk_rto),
2448		jiffies_to_clock_t(icsk->icsk_ack.ato),
2449		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2450		tp->snd_cwnd,
2451		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2452		len);
2453}
2454
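/* Format one TIME_WAIT socket as a line of /proc/net/tcp. */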
2455static void get_timewait4_sock(struct inet_timewait_sock *tw,
2456			       struct seq_file *f, int i, int *len)
2457{
2458	__be32 dest, src;
2459	__u16 destp, srcp;
2460	int ttd = tw->tw_ttd - jiffies;
2461
2462	if (ttd < 0)
2463		ttd = 0;
2464
2465	dest  = tw->tw_daddr;
2466	src   = tw->tw_rcv_saddr;
2467	destp = ntohs(tw->tw_dport);
2468	srcp  = ntohs(tw->tw_sport);
2469
2470	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2471		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2472		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2473		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2474		atomic_read(&tw->tw_refcnt), tw, len);
2475}
2476
2477#define TMPSZ 150
2478
2479static int tcp4_seq_show(struct seq_file *seq, void *v)
2480{
2481	struct tcp_iter_state *st;
2482	int len;
2483
2484	if (v == SEQ_START_TOKEN) {
2485		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2486			   "  sl  local_address rem_address   st tx_queue "
2487			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2488			   "inode");
2489		goto out;
2490	}
2491	st = seq->private;
2492
2493	switch (st->state) {
2494	case TCP_SEQ_STATE_LISTENING:
2495	case TCP_SEQ_STATE_ESTABLISHED:
2496		get_tcp4_sock(v, seq, st->num, &len);
2497		break;
2498	case TCP_SEQ_STATE_OPENREQ:
2499		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2500		break;
2501	case TCP_SEQ_STATE_TIME_WAIT:
2502		get_timewait4_sock(v, seq, st->num, &len);
2503		break;
2504	}
2505	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2506out:
2507	return 0;
2508}
2509
2510static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2511	.name		= "tcp",
2512	.family		= AF_INET,
2513	.seq_fops	= {
2514		.owner		= THIS_MODULE,
2515	},
2516	.seq_ops	= {
2517		.show		= tcp4_seq_show,
2518	},
2519};
2520
2521static int __net_init tcp4_proc_init_net(struct net *net)
2522{
2523	return tcp_proc_register(net, &tcp4_seq_afinfo);
2524}
2525
2526static void __net_exit tcp4_proc_exit_net(struct net *net)
2527{
2528	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2529}
2530
2531static struct pernet_operations tcp4_net_ops = {
2532	.init = tcp4_proc_init_net,
2533	.exit = tcp4_proc_exit_net,
2534};
2535
2536int __init tcp4_proc_init(void)
2537{
2538	return register_pernet_subsys(&tcp4_net_ops);
2539}
2540
2541void tcp4_proc_exit(void)
2542{
2543	unregister_pernet_subsys(&tcp4_net_ops);
2544}
2545#endif /* CONFIG_PROC_FS */
2546
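/*
 * GRO receive: verify (or flush on) the TCP checksum before handing the
 * segment to the generic tcp_gro_receive().
 */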
2547struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2548{
2549	const struct iphdr *iph = skb_gro_network_header(skb);
2550
2551	switch (skb->ip_summed) {
2552	case CHECKSUM_COMPLETE:
2553		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2554				  skb->csum)) {
2555			skb->ip_summed = CHECKSUM_UNNECESSARY;
2556			break;
2557		}
2558
2559		/* fall through */
2560	case CHECKSUM_NONE:
2561		NAPI_GRO_CB(skb)->flush = 1;
2562		return NULL;
2563	}
2564
2565	return tcp_gro_receive(head, skb);
2566}
2567
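/*
 * GRO complete: prime th->check with the pseudo-header checksum and mark
 * the skb as TCPv4 GSO before the generic tcp_gro_complete().
 */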
2568int tcp4_gro_complete(struct sk_buff *skb)
2569{
2570	const struct iphdr *iph = ip_hdr(skb);
2571	struct tcphdr *th = tcp_hdr(skb);
2572
2573	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2574				  iph->saddr, iph->daddr, 0);
2575	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2576
2577	return tcp_gro_complete(skb);
2578}
2579
2580struct proto tcp_prot = {
2581	.name			= "TCP",
2582	.owner			= THIS_MODULE,
2583	.close			= tcp_close,
2584	.connect		= tcp_v4_connect,
2585	.disconnect		= tcp_disconnect,
2586	.accept			= inet_csk_accept,
2587	.ioctl			= tcp_ioctl,
2588	.init			= tcp_v4_init_sock,
2589	.destroy		= tcp_v4_destroy_sock,
2590	.shutdown		= tcp_shutdown,
2591	.setsockopt		= tcp_setsockopt,
2592	.getsockopt		= tcp_getsockopt,
2593	.recvmsg		= tcp_recvmsg,
2594	.sendmsg		= tcp_sendmsg,
2595	.sendpage		= tcp_sendpage,
2596	.backlog_rcv		= tcp_v4_do_rcv,
2597	.hash			= inet_hash,
2598	.unhash			= inet_unhash,
2599	.get_port		= inet_csk_get_port,
2600	.enter_memory_pressure	= tcp_enter_memory_pressure,
2601	.sockets_allocated	= &tcp_sockets_allocated,
2602	.orphan_count		= &tcp_orphan_count,
2603	.memory_allocated	= &tcp_memory_allocated,
2604	.memory_pressure	= &tcp_memory_pressure,
2605	.sysctl_mem		= sysctl_tcp_mem,
2606	.sysctl_wmem		= sysctl_tcp_wmem,
2607	.sysctl_rmem		= sysctl_tcp_rmem,
2608	.max_header		= MAX_TCP_HEADER,
2609	.obj_size		= sizeof(struct tcp_sock),
2610	.slab_flags		= SLAB_DESTROY_BY_RCU,
2611	.twsk_prot		= &tcp_timewait_sock_ops,
2612	.rsk_prot		= &tcp_request_sock_ops,
2613	.h.hashinfo		= &tcp_hashinfo,
2614	.no_autobind		= true,
2615#ifdef CONFIG_COMPAT
2616	.compat_setsockopt	= compat_tcp_setsockopt,
2617	.compat_getsockopt	= compat_tcp_getsockopt,
2618#endif
2619};
2620EXPORT_SYMBOL(tcp_prot);
2621
2622
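/*
 * Per-namespace kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to send replies on behalf of sockets we do not own.
 */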
2623static int __net_init tcp_sk_init(struct net *net)
2624{
2625	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2626				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2627}
2628
2629static void __net_exit tcp_sk_exit(struct net *net)
2630{
2631	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2632}
2633
2634static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2635{
2636	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2637}
2638
2639static struct pernet_operations __net_initdata tcp_sk_ops = {
2640	.init	   = tcp_sk_init,
2641	.exit	   = tcp_sk_exit,
2642	.exit_batch = tcp_sk_exit_batch,
2643};
2644
2645void __init tcp_v4_init(void)
2646{
2647	inet_hashinfo_init(&tcp_hashinfo);
2648	if (register_pernet_subsys(&tcp_sk_ops))
2649		panic("Failed to create the TCP control socket.\n");
2650}
2651