tcp_ipv4.c revision 2d7192d6cbab20e153c47fa1559ffd41ceef0e79
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 *		IPv4 specific functions
9 *
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 *
18 *	This program is free software; you can redistribute it and/or
19 *      modify it under the terms of the GNU General Public License
20 *      as published by the Free Software Foundation; either version
21 *      2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 *		David S. Miller	:	New socket lookup architecture.
27 *					This code is dedicated to John Dyson.
28 *		David S. Miller :	Change semantics of established hash,
29 *					half is devoted to TIME_WAIT sockets
30 *					and the rest go in the other half.
31 *		Andi Kleen :		Add support for syncookies and fixed
32 *					some bugs: ip options weren't passed to
33 *					the TCP layer, missed a check for an
34 *					ACK bit.
35 *		Andi Kleen :		Implemented fast path mtu discovery.
36 *	     				Fixed many serious bugs in the
37 *					request_sock handling and moved
38 *					most of it into the af independent code.
39 *					Added tail drop and some other bugfixes.
40 *					Added new listen semantics.
41 *		Mike McLagan	:	Routing by source
42 *	Juan Jose Ciarlante:		ip_dynaddr bits
43 *		Andi Kleen:		various fixes.
44 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45 *					year-long coma.
46 *	Andi Kleen		:	Fix new listen.
47 *	Andi Kleen		:	Fix accept error reporting.
48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50 *					a single port at the same time.
51 */
52
53
54#include <linux/bottom_half.h>
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
63#include <linux/slab.h>
64
65#include <net/net_namespace.h>
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
73#include <net/xfrm.h>
74#include <net/netdma.h>
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
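/* Module-wide knobs behind the tcp_tw_reuse and tcp_low_latency sysctls. */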
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90#ifdef CONFIG_TCP_MD5SIG
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92						   __be32 addr);
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99	return NULL;
100}
101#endif
102
103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
105
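/* Derive the initial sequence number for a passively opened connection
 * from the 4-tuple of the incoming segment, via the secure ISN generator.
 */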
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{
108	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109					  ip_hdr(skb)->saddr,
110					  tcp_hdr(skb)->dest,
111					  tcp_hdr(skb)->source);
112}
113
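/* Decide whether a TIME-WAIT socket occupying the desired 4-tuple may be
 * reused for a new connection.  Returns 1 and takes a reference on the
 * timewait socket when reuse is safe, 0 otherwise; the rationale is in
 * the comment below.
 */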
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117	struct tcp_sock *tp = tcp_sk(sk);
118
119	/* With PAWS, this is safe from the viewpoint of data
120	   integrity. Even without PAWS it is safe provided the sequence
121	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
122	
123	   Actually, the idea is close to VJ's, except that the timestamp
124	   cache is held per port pair rather than per host, and the TW
125	   bucket is used as the state holder.
126	
127	   If the TW bucket has already been destroyed we fall back to VJ's
128	   scheme and use the initial timestamp retrieved from the peer table.
129	 */
130	if (tcptw->tw_ts_recent_stamp &&
131	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134		if (tp->write_seq == 0)
135			tp->write_seq = 1;
136		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138		sock_hold(sktw);
139		return 1;
140	}
141
142	return 0;
143}
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150	struct inet_sock *inet = inet_sk(sk);
151	struct tcp_sock *tp = tcp_sk(sk);
152	__be16 orig_sport, orig_dport;
153	__be32 daddr, nexthop;
154	struct flowi4 fl4;
155	struct rtable *rt;
156	int err;
157
158	if (addr_len < sizeof(struct sockaddr_in))
159		return -EINVAL;
160
161	if (usin->sin_family != AF_INET)
162		return -EAFNOSUPPORT;
163
164	nexthop = daddr = usin->sin_addr.s_addr;
165	if (inet->opt && inet->opt->srr) {
166		if (!daddr)
167			return -EINVAL;
168		nexthop = inet->opt->faddr;
169	}
170
171	orig_sport = inet->inet_sport;
172	orig_dport = usin->sin_port;
173	rt = ip_route_connect(&fl4, nexthop, inet->inet_saddr,
174			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175			      IPPROTO_TCP,
176			      orig_sport, orig_dport, sk, true);
177	if (IS_ERR(rt)) {
178		err = PTR_ERR(rt);
179		if (err == -ENETUNREACH)
180			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181		return err;
182	}
183
184	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185		ip_rt_put(rt);
186		return -ENETUNREACH;
187	}
188
189	if (!inet->opt || !inet->opt->srr)
190		daddr = rt->rt_dst;
191
192	if (!inet->inet_saddr)
193		inet->inet_saddr = rt->rt_src;
194	inet->inet_rcv_saddr = inet->inet_saddr;
195
196	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197		/* Reset inherited state */
198		tp->rx_opt.ts_recent	   = 0;
199		tp->rx_opt.ts_recent_stamp = 0;
200		tp->write_seq		   = 0;
201	}
202
203	if (tcp_death_row.sysctl_tw_recycle &&
204	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
205		struct inet_peer *peer = rt_get_peer(rt);
206		/*
207		 * VJ's idea. We save the last timestamp seen from
208		 * the destination in the peer table when entering state
209		 * TIME-WAIT and initialize rx_opt.ts_recent from it
210		 * when trying a new connection.
211		 */
212		if (peer) {
213			inet_peer_refcheck(peer);
214			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
215				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216				tp->rx_opt.ts_recent = peer->tcp_ts;
217			}
218		}
219	}
220
221	inet->inet_dport = usin->sin_port;
222	inet->inet_daddr = daddr;
223
224	inet_csk(sk)->icsk_ext_hdr_len = 0;
225	if (inet->opt)
226		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
227
228	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
229
230	/* Socket identity is still unknown (sport may be zero).
231	 * However we set state to SYN-SENT and, without releasing the socket
232	 * lock, select a source port, enter ourselves into the hash tables and
233	 * complete initialization after this.
234	 */
235	tcp_set_state(sk, TCP_SYN_SENT);
236	err = inet_hash_connect(&tcp_death_row, sk);
237	if (err)
238		goto failure;
239
240	rt = ip_route_newports(&fl4, rt, orig_sport, orig_dport,
241			       inet->inet_sport, inet->inet_dport, sk);
242	if (IS_ERR(rt)) {
243		err = PTR_ERR(rt);
244		rt = NULL;
245		goto failure;
246	}
247	/* OK, now commit destination to socket.  */
248	sk->sk_gso_type = SKB_GSO_TCPV4;
249	sk_setup_caps(sk, &rt->dst);
250
251	if (!tp->write_seq)
252		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
253							   inet->inet_daddr,
254							   inet->inet_sport,
255							   usin->sin_port);
256
257	inet->inet_id = tp->write_seq ^ jiffies;
258
259	err = tcp_connect(sk);
260	rt = NULL;
261	if (err)
262		goto failure;
263
264	return 0;
265
266failure:
267	/*
268	 * This unhashes the socket and releases the local port,
269	 * if necessary.
270	 */
271	tcp_set_state(sk, TCP_CLOSE);
272	ip_rt_put(rt);
273	sk->sk_route_caps = 0;
274	inet->inet_dport = 0;
275	return err;
276}
277EXPORT_SYMBOL(tcp_v4_connect);
278
279/*
280 * This routine does path mtu discovery as defined in RFC1191.
281 */
282static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
283{
284	struct dst_entry *dst;
285	struct inet_sock *inet = inet_sk(sk);
286
287	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
288	 * sent out by Linux are always < 576 bytes so they should go through
289	 * unfragmented).
290	 */
291	if (sk->sk_state == TCP_LISTEN)
292		return;
293
294	/* We don't check in the dst entry whether pmtu discovery is forbidden
295	 * on this route. We just assume that no packet-too-big packets
296	 * are sent back when pmtu discovery is not active.
297	 * There is a small race when the user changes this flag in the
298	 * route, but I think that's acceptable.
299	 */
300	if ((dst = __sk_dst_check(sk, 0)) == NULL)
301		return;
302
303	dst->ops->update_pmtu(dst, mtu);
304
305	/* Something is about to go wrong... Remember the soft error
306	 * in case this connection is not able to recover.
307	 */
308	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
309		sk->sk_err_soft = EMSGSIZE;
310
311	mtu = dst_mtu(dst);
312
313	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
314	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
315		tcp_sync_mss(sk, mtu);
316
317		/* Resend the TCP packet because it's
318		 * clear that the old packet has been
319		 * dropped. This is the new "fast" path mtu
320		 * discovery.
321		 */
322		tcp_simple_retransmit(sk);
323	} /* else let the usual retransmit timer handle it */
324}
325
326/*
327 * This routine is called by the ICMP module when it gets some
328 * sort of error condition.  If err < 0 then the socket should
329 * be closed and the error returned to the user.  If err > 0
330 * it's just the icmp type << 8 | icmp code.  After adjustment
331 * header points to the first 8 bytes of the tcp header.  We need
332 * to find the appropriate port.
333 *
334 * The locking strategy used here is very "optimistic". When
335 * someone else accesses the socket the ICMP is just dropped
336 * and for some paths there is no check at all.
337 * A more general error queue to queue errors for later handling
338 * is probably better.
339 *
340 */
341
342void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
343{
344	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
345	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
346	struct inet_connection_sock *icsk;
347	struct tcp_sock *tp;
348	struct inet_sock *inet;
349	const int type = icmp_hdr(icmp_skb)->type;
350	const int code = icmp_hdr(icmp_skb)->code;
351	struct sock *sk;
352	struct sk_buff *skb;
353	__u32 seq;
354	__u32 remaining;
355	int err;
356	struct net *net = dev_net(icmp_skb->dev);
357
358	if (icmp_skb->len < (iph->ihl << 2) + 8) {
359		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
360		return;
361	}
362
363	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
364			iph->saddr, th->source, inet_iif(icmp_skb));
365	if (!sk) {
366		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
367		return;
368	}
369	if (sk->sk_state == TCP_TIME_WAIT) {
370		inet_twsk_put(inet_twsk(sk));
371		return;
372	}
373
374	bh_lock_sock(sk);
375	/* If too many ICMPs get dropped on busy
376	 * servers this needs to be solved differently.
377	 */
378	if (sock_owned_by_user(sk))
379		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
380
381	if (sk->sk_state == TCP_CLOSE)
382		goto out;
383
384	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
385		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
386		goto out;
387	}
388
389	icsk = inet_csk(sk);
390	tp = tcp_sk(sk);
391	seq = ntohl(th->seq);
392	if (sk->sk_state != TCP_LISTEN &&
393	    !between(seq, tp->snd_una, tp->snd_nxt)) {
394		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
395		goto out;
396	}
397
398	switch (type) {
399	case ICMP_SOURCE_QUENCH:
400		/* Just silently ignore these. */
401		goto out;
402	case ICMP_PARAMETERPROB:
403		err = EPROTO;
404		break;
405	case ICMP_DEST_UNREACH:
406		if (code > NR_ICMP_UNREACH)
407			goto out;
408
409		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
410			if (!sock_owned_by_user(sk))
411				do_pmtu_discovery(sk, iph, info);
412			goto out;
413		}
414
415		err = icmp_err_convert[code].errno;
416		/* check if icmp_skb allows revert of backoff
417		 * (see draft-zimmermann-tcp-lcd) */
418		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
419			break;
420		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421		    !icsk->icsk_backoff)
422			break;
423
424		if (sock_owned_by_user(sk))
425			break;
426
427		icsk->icsk_backoff--;
428		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
429					 icsk->icsk_backoff;
430		tcp_bound_rto(sk);
431
432		skb = tcp_write_queue_head(sk);
433		BUG_ON(!skb);
434
435		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
436				tcp_time_stamp - TCP_SKB_CB(skb)->when);
437
438		if (remaining) {
439			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
440						  remaining, TCP_RTO_MAX);
441		} else {
442			/* RTO revert clocked out retransmission.
443			 * Will retransmit now */
444			tcp_retransmit_timer(sk);
445		}
446
447		break;
448	case ICMP_TIME_EXCEEDED:
449		err = EHOSTUNREACH;
450		break;
451	default:
452		goto out;
453	}
454
455	switch (sk->sk_state) {
456		struct request_sock *req, **prev;
457	case TCP_LISTEN:
458		if (sock_owned_by_user(sk))
459			goto out;
460
461		req = inet_csk_search_req(sk, &prev, th->dest,
462					  iph->daddr, iph->saddr);
463		if (!req)
464			goto out;
465
466		/* ICMPs are not backlogged, hence we cannot get
467		   an established socket here.
468		 */
469		WARN_ON(req->sk);
470
471		if (seq != tcp_rsk(req)->snt_isn) {
472			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
473			goto out;
474		}
475
476		/*
477		 * Still in SYN_RECV, just remove it silently.
478		 * There is no good way to pass the error to the newly
479		 * created socket, and POSIX does not want network
480		 * errors returned from accept().
481		 */
482		inet_csk_reqsk_queue_drop(sk, req, prev);
483		goto out;
484
485	case TCP_SYN_SENT:
486	case TCP_SYN_RECV:  /* Cannot happen normally.
487			       It can, for example, if SYNs crossed.
488			     */
489		if (!sock_owned_by_user(sk)) {
490			sk->sk_err = err;
491
492			sk->sk_error_report(sk);
493
494			tcp_done(sk);
495		} else {
496			sk->sk_err_soft = err;
497		}
498		goto out;
499	}
500
501	/* If we've already connected we will keep trying
502	 * until we time out, or the user gives up.
503	 *
504	 * rfc1122 4.2.3.9 allows us to consider as hard errors
505	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
506	 * but it is obsoleted by pmtu discovery).
507	 *
508	 * Note that on the modern internet, where routing is unreliable
509	 * and broken firewalls sit in every dark corner sending random
510	 * errors as ordered by their masters, even these two messages have
511	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
512	 *
513	 * Now we are in compliance with RFCs.
514	 *							--ANK (980905)
515	 */
516
517	inet = inet_sk(sk);
518	if (!sock_owned_by_user(sk) && inet->recverr) {
519		sk->sk_err = err;
520		sk->sk_error_report(sk);
521	} else	{ /* Only an error on timeout */
522		sk->sk_err_soft = err;
523	}
524
525out:
526	bh_unlock_sock(sk);
527	sock_put(sk);
528}
529
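/* Fill in the TCP checksum for an outgoing skb.  With CHECKSUM_PARTIAL we
 * only seed the pseudo-header sum and let the hardware finish it;
 * otherwise the full checksum is computed in software from skb->csum.
 */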
530static void __tcp_v4_send_check(struct sk_buff *skb,
531				__be32 saddr, __be32 daddr)
532{
533	struct tcphdr *th = tcp_hdr(skb);
534
535	if (skb->ip_summed == CHECKSUM_PARTIAL) {
536		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
537		skb->csum_start = skb_transport_header(skb) - skb->head;
538		skb->csum_offset = offsetof(struct tcphdr, check);
539	} else {
540		th->check = tcp_v4_check(skb->len, saddr, daddr,
541					 csum_partial(th,
542						      th->doff << 2,
543						      skb->csum));
544	}
545}
546
547/* This routine computes an IPv4 TCP checksum. */
548void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
549{
550	struct inet_sock *inet = inet_sk(sk);
551
552	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
553}
554EXPORT_SYMBOL(tcp_v4_send_check);
555
556int tcp_v4_gso_send_check(struct sk_buff *skb)
557{
558	const struct iphdr *iph;
559	struct tcphdr *th;
560
561	if (!pskb_may_pull(skb, sizeof(*th)))
562		return -EINVAL;
563
564	iph = ip_hdr(skb);
565	th = tcp_hdr(skb);
566
567	th->check = 0;
568	skb->ip_summed = CHECKSUM_PARTIAL;
569	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
570	return 0;
571}
572
573/*
574 *	This routine will send an RST to the other tcp.
575 *
576 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
577 *		      for the reset?
578 *	Answer: if a packet caused an RST, it is not for a socket
579 *		existing in our system; if it is matched to a socket,
580 *		it is just a duplicate segment or a bug in the other side's TCP.
581 *	So we build the reply based only on parameters
582 *		that arrived with the segment.
583 *	Exception: precedence violation. We do not implement it in any case.
584 */
585
586static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
587{
588	struct tcphdr *th = tcp_hdr(skb);
589	struct {
590		struct tcphdr th;
591#ifdef CONFIG_TCP_MD5SIG
592		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
593#endif
594	} rep;
595	struct ip_reply_arg arg;
596#ifdef CONFIG_TCP_MD5SIG
597	struct tcp_md5sig_key *key;
598#endif
599	struct net *net;
600
601	/* Never send a reset in response to a reset. */
602	if (th->rst)
603		return;
604
605	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
606		return;
607
608	/* Swap the send and the receive. */
609	memset(&rep, 0, sizeof(rep));
610	rep.th.dest   = th->source;
611	rep.th.source = th->dest;
612	rep.th.doff   = sizeof(struct tcphdr) / 4;
613	rep.th.rst    = 1;
614
615	if (th->ack) {
616		rep.th.seq = th->ack_seq;
617	} else {
618		rep.th.ack = 1;
619		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620				       skb->len - (th->doff << 2));
621	}
622
623	memset(&arg, 0, sizeof(arg));
624	arg.iov[0].iov_base = (unsigned char *)&rep;
625	arg.iov[0].iov_len  = sizeof(rep.th);
626
627#ifdef CONFIG_TCP_MD5SIG
628	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
629	if (key) {
630		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
631				   (TCPOPT_NOP << 16) |
632				   (TCPOPT_MD5SIG << 8) |
633				   TCPOLEN_MD5SIG);
634		/* Update length and the length the header thinks exists */
635		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
636		rep.th.doff = arg.iov[0].iov_len / 4;
637
638		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
639				     key, ip_hdr(skb)->saddr,
640				     ip_hdr(skb)->daddr, &rep.th);
641	}
642#endif
643	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
644				      ip_hdr(skb)->saddr, /* XXX */
645				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
646	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
647	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
648
649	net = dev_net(skb_dst(skb)->dev);
650	ip_send_reply(net->ipv4.tcp_sock, skb,
651		      &arg, arg.iov[0].iov_len);
652
653	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
654	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
655}
656
657/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
658   outside socket context, is ugly, certainly. What can I do?
659 */
660
661static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
662			    u32 win, u32 ts, int oif,
663			    struct tcp_md5sig_key *key,
664			    int reply_flags)
665{
666	struct tcphdr *th = tcp_hdr(skb);
667	struct {
668		struct tcphdr th;
669		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
670#ifdef CONFIG_TCP_MD5SIG
671			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
672#endif
673			];
674	} rep;
675	struct ip_reply_arg arg;
676	struct net *net = dev_net(skb_dst(skb)->dev);
677
678	memset(&rep.th, 0, sizeof(struct tcphdr));
679	memset(&arg, 0, sizeof(arg));
680
681	arg.iov[0].iov_base = (unsigned char *)&rep;
682	arg.iov[0].iov_len  = sizeof(rep.th);
683	if (ts) {
684		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
685				   (TCPOPT_TIMESTAMP << 8) |
686				   TCPOLEN_TIMESTAMP);
687		rep.opt[1] = htonl(tcp_time_stamp);
688		rep.opt[2] = htonl(ts);
689		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
690	}
691
692	/* Swap the send and the receive. */
693	rep.th.dest    = th->source;
694	rep.th.source  = th->dest;
695	rep.th.doff    = arg.iov[0].iov_len / 4;
696	rep.th.seq     = htonl(seq);
697	rep.th.ack_seq = htonl(ack);
698	rep.th.ack     = 1;
699	rep.th.window  = htons(win);
700
701#ifdef CONFIG_TCP_MD5SIG
702	if (key) {
703		int offset = (ts) ? 3 : 0;
704
705		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
706					  (TCPOPT_NOP << 16) |
707					  (TCPOPT_MD5SIG << 8) |
708					  TCPOLEN_MD5SIG);
709		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
710		rep.th.doff = arg.iov[0].iov_len/4;
711
712		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
713				    key, ip_hdr(skb)->saddr,
714				    ip_hdr(skb)->daddr, &rep.th);
715	}
716#endif
717	arg.flags = reply_flags;
718	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
719				      ip_hdr(skb)->saddr, /* XXX */
720				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
721	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
722	if (oif)
723		arg.bound_dev_if = oif;
724
725	ip_send_reply(net->ipv4.tcp_sock, skb,
726		      &arg, arg.iov[0].iov_len);
727
728	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
729}
730
731static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
732{
733	struct inet_timewait_sock *tw = inet_twsk(sk);
734	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
735
736	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
737			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
738			tcptw->tw_ts_recent,
739			tw->tw_bound_dev_if,
740			tcp_twsk_md5_key(tcptw),
741			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
742			);
743
744	inet_twsk_put(tw);
745}
746
747static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
748				  struct request_sock *req)
749{
750	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
751			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
752			req->ts_recent,
753			0,
754			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
755			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
756}
757
758/*
759 *	Send a SYN-ACK after having received a SYN.
760 *	This still operates on a request_sock only, not on a big
761 *	socket.
762 */
763static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
764			      struct request_sock *req,
765			      struct request_values *rvp)
766{
767	const struct inet_request_sock *ireq = inet_rsk(req);
768	int err = -1;
769	struct sk_buff * skb;
770
771	/* First, grab a route. */
772	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
773		return -1;
774
775	skb = tcp_make_synack(sk, dst, req, rvp);
776
777	if (skb) {
778		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
779
780		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
781					    ireq->rmt_addr,
782					    ireq->opt);
783		err = net_xmit_eval(err);
784	}
785
786	dst_release(dst);
787	return err;
788}
789
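/* Retransmit a SYN-ACK for a pending request_sock: count it as a
 * retransmission and reuse the normal SYN-ACK path (the route is looked
 * up again since none was passed in).
 */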
790static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
791			      struct request_values *rvp)
792{
793	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
794	return tcp_v4_send_synack(sk, NULL, req, rvp);
795}
796
797/*
798 *	IPv4 request_sock destructor.
799 */
800static void tcp_v4_reqsk_destructor(struct request_sock *req)
801{
802	kfree(inet_rsk(req)->opt);
803}
804
805static void syn_flood_warning(const struct sk_buff *skb)
806{
807	const char *msg;
808
809#ifdef CONFIG_SYN_COOKIES
810	if (sysctl_tcp_syncookies)
811		msg = "Sending cookies";
812	else
813#endif
814		msg = "Dropping request";
815
816	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
817				ntohs(tcp_hdr(skb)->dest), msg);
818}
819
820/*
821 * Save and compile IPv4 options into the request_sock if needed.
822 */
823static struct ip_options *tcp_v4_save_options(struct sock *sk,
824					      struct sk_buff *skb)
825{
826	struct ip_options *opt = &(IPCB(skb)->opt);
827	struct ip_options *dopt = NULL;
828
829	if (opt && opt->optlen) {
830		int opt_size = optlength(opt);
831		dopt = kmalloc(opt_size, GFP_ATOMIC);
832		if (dopt) {
833			if (ip_options_echo(dopt, skb)) {
834				kfree(dopt);
835				dopt = NULL;
836			}
837		}
838	}
839	return dopt;
840}
841
842#ifdef CONFIG_TCP_MD5SIG
843/*
844 * RFC2385 MD5 checksumming requires a mapping of
845 * IP address->MD5 Key.
846 * We need to maintain these in the sk structure.
847 */
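/*
 * Keys are configured from userspace with the TCP_MD5SIG socket option.
 * A minimal, illustrative sketch only ("fd", "peer" and "secret" are
 * assumed to exist; error handling omitted):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr = peer;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address (see
 * tcp_v4_parse_md5_keys() below).
 */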
848
849/* Find the Key structure for an address.  */
850static struct tcp_md5sig_key *
851			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
852{
853	struct tcp_sock *tp = tcp_sk(sk);
854	int i;
855
856	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
857		return NULL;
858	for (i = 0; i < tp->md5sig_info->entries4; i++) {
859		if (tp->md5sig_info->keys4[i].addr == addr)
860			return &tp->md5sig_info->keys4[i].base;
861	}
862	return NULL;
863}
864
865struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
866					 struct sock *addr_sk)
867{
868	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
869}
870EXPORT_SYMBOL(tcp_v4_md5_lookup);
871
872static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
873						      struct request_sock *req)
874{
875	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
876}
877
878/* This can be called on a newly created socket, from other files */
879int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
880		      u8 *newkey, u8 newkeylen)
881{
882	/* Add Key to the list */
883	struct tcp_md5sig_key *key;
884	struct tcp_sock *tp = tcp_sk(sk);
885	struct tcp4_md5sig_key *keys;
886
887	key = tcp_v4_md5_do_lookup(sk, addr);
888	if (key) {
889		/* Pre-existing entry - just update that one. */
890		kfree(key->key);
891		key->key = newkey;
892		key->keylen = newkeylen;
893	} else {
894		struct tcp_md5sig_info *md5sig;
895
896		if (!tp->md5sig_info) {
897			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
898						  GFP_ATOMIC);
899			if (!tp->md5sig_info) {
900				kfree(newkey);
901				return -ENOMEM;
902			}
903			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
904		}
905		if (tcp_alloc_md5sig_pool(sk) == NULL) {
906			kfree(newkey);
907			return -ENOMEM;
908		}
909		md5sig = tp->md5sig_info;
910
911		if (md5sig->alloced4 == md5sig->entries4) {
912			keys = kmalloc((sizeof(*keys) *
913					(md5sig->entries4 + 1)), GFP_ATOMIC);
914			if (!keys) {
915				kfree(newkey);
916				tcp_free_md5sig_pool();
917				return -ENOMEM;
918			}
919
920			if (md5sig->entries4)
921				memcpy(keys, md5sig->keys4,
922				       sizeof(*keys) * md5sig->entries4);
923
924			/* Free old key list, and reference new one */
925			kfree(md5sig->keys4);
926			md5sig->keys4 = keys;
927			md5sig->alloced4++;
928		}
929		md5sig->entries4++;
930		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
931		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
932		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
933	}
934	return 0;
935}
936EXPORT_SYMBOL(tcp_v4_md5_do_add);
937
938static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
939			       u8 *newkey, u8 newkeylen)
940{
941	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
942				 newkey, newkeylen);
943}
944
945int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
946{
947	struct tcp_sock *tp = tcp_sk(sk);
948	int i;
949
950	for (i = 0; i < tp->md5sig_info->entries4; i++) {
951		if (tp->md5sig_info->keys4[i].addr == addr) {
952			/* Free the key */
953			kfree(tp->md5sig_info->keys4[i].base.key);
954			tp->md5sig_info->entries4--;
955
956			if (tp->md5sig_info->entries4 == 0) {
957				kfree(tp->md5sig_info->keys4);
958				tp->md5sig_info->keys4 = NULL;
959				tp->md5sig_info->alloced4 = 0;
960			} else if (tp->md5sig_info->entries4 != i) {
961				/* Need to do some manipulation */
962				memmove(&tp->md5sig_info->keys4[i],
963					&tp->md5sig_info->keys4[i+1],
964					(tp->md5sig_info->entries4 - i) *
965					 sizeof(struct tcp4_md5sig_key));
966			}
967			tcp_free_md5sig_pool();
968			return 0;
969		}
970	}
971	return -ENOENT;
972}
973EXPORT_SYMBOL(tcp_v4_md5_do_del);
974
975static void tcp_v4_clear_md5_list(struct sock *sk)
976{
977	struct tcp_sock *tp = tcp_sk(sk);
978
979	/* Free each key, then the set of keys,
980	 * the crypto element, and then decrement our
981	 * hold on the last resort crypto.
982	 */
983	if (tp->md5sig_info->entries4) {
984		int i;
985		for (i = 0; i < tp->md5sig_info->entries4; i++)
986			kfree(tp->md5sig_info->keys4[i].base.key);
987		tp->md5sig_info->entries4 = 0;
988		tcp_free_md5sig_pool();
989	}
990	if (tp->md5sig_info->keys4) {
991		kfree(tp->md5sig_info->keys4);
992		tp->md5sig_info->keys4 = NULL;
993		tp->md5sig_info->alloced4  = 0;
994	}
995}
996
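/* setsockopt(TCP_MD5SIG) handler: copy the request from userspace,
 * validate it, and add, replace or delete the key for the given peer
 * address.
 */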
997static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
998				 int optlen)
999{
1000	struct tcp_md5sig cmd;
1001	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1002	u8 *newkey;
1003
1004	if (optlen < sizeof(cmd))
1005		return -EINVAL;
1006
1007	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1008		return -EFAULT;
1009
1010	if (sin->sin_family != AF_INET)
1011		return -EINVAL;
1012
1013	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1014		if (!tcp_sk(sk)->md5sig_info)
1015			return -ENOENT;
1016		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1017	}
1018
1019	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020		return -EINVAL;
1021
1022	if (!tcp_sk(sk)->md5sig_info) {
1023		struct tcp_sock *tp = tcp_sk(sk);
1024		struct tcp_md5sig_info *p;
1025
1026		p = kzalloc(sizeof(*p), sk->sk_allocation);
1027		if (!p)
1028			return -EINVAL;
1029
1030		tp->md5sig_info = p;
1031		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1032	}
1033
1034	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1035	if (!newkey)
1036		return -ENOMEM;
1037	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1038				 newkey, cmd.tcpm_keylen);
1039}
1040
1041static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1042					__be32 daddr, __be32 saddr, int nbytes)
1043{
1044	struct tcp4_pseudohdr *bp;
1045	struct scatterlist sg;
1046
1047	bp = &hp->md5_blk.ip4;
1048
1049	/*
1050	 * 1. the TCP pseudo-header (in the order: source IP address,
1051	 * destination IP address, zero-padded protocol number, and
1052	 * segment length)
1053	 */
1054	bp->saddr = saddr;
1055	bp->daddr = daddr;
1056	bp->pad = 0;
1057	bp->protocol = IPPROTO_TCP;
1058	bp->len = cpu_to_be16(nbytes);
1059
1060	sg_init_one(&sg, bp, sizeof(*bp));
1061	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1062}
1063
1064static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1065			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1066{
1067	struct tcp_md5sig_pool *hp;
1068	struct hash_desc *desc;
1069
1070	hp = tcp_get_md5sig_pool();
1071	if (!hp)
1072		goto clear_hash_noput;
1073	desc = &hp->md5_desc;
1074
1075	if (crypto_hash_init(desc))
1076		goto clear_hash;
1077	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1078		goto clear_hash;
1079	if (tcp_md5_hash_header(hp, th))
1080		goto clear_hash;
1081	if (tcp_md5_hash_key(hp, key))
1082		goto clear_hash;
1083	if (crypto_hash_final(desc, md5_hash))
1084		goto clear_hash;
1085
1086	tcp_put_md5sig_pool();
1087	return 0;
1088
1089clear_hash:
1090	tcp_put_md5sig_pool();
1091clear_hash_noput:
1092	memset(md5_hash, 0, 16);
1093	return 1;
1094}
1095
1096int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1097			struct sock *sk, struct request_sock *req,
1098			struct sk_buff *skb)
1099{
1100	struct tcp_md5sig_pool *hp;
1101	struct hash_desc *desc;
1102	struct tcphdr *th = tcp_hdr(skb);
1103	__be32 saddr, daddr;
1104
1105	if (sk) {
1106		saddr = inet_sk(sk)->inet_saddr;
1107		daddr = inet_sk(sk)->inet_daddr;
1108	} else if (req) {
1109		saddr = inet_rsk(req)->loc_addr;
1110		daddr = inet_rsk(req)->rmt_addr;
1111	} else {
1112		const struct iphdr *iph = ip_hdr(skb);
1113		saddr = iph->saddr;
1114		daddr = iph->daddr;
1115	}
1116
1117	hp = tcp_get_md5sig_pool();
1118	if (!hp)
1119		goto clear_hash_noput;
1120	desc = &hp->md5_desc;
1121
1122	if (crypto_hash_init(desc))
1123		goto clear_hash;
1124
1125	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1126		goto clear_hash;
1127	if (tcp_md5_hash_header(hp, th))
1128		goto clear_hash;
1129	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1130		goto clear_hash;
1131	if (tcp_md5_hash_key(hp, key))
1132		goto clear_hash;
1133	if (crypto_hash_final(desc, md5_hash))
1134		goto clear_hash;
1135
1136	tcp_put_md5sig_pool();
1137	return 0;
1138
1139clear_hash:
1140	tcp_put_md5sig_pool();
1141clear_hash_noput:
1142	memset(md5_hash, 0, 16);
1143	return 1;
1144}
1145EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1146
1147static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1148{
1149	/*
1150	 * This gets called for each TCP segment that arrives
1151	 * so we want to be efficient.
1152	 * We have 3 drop cases:
1153	 * o No MD5 hash and one expected.
1154	 * o MD5 hash and we're not expecting one.
1155	 * o MD5 hash and it's wrong.
1156	 */
1157	__u8 *hash_location = NULL;
1158	struct tcp_md5sig_key *hash_expected;
1159	const struct iphdr *iph = ip_hdr(skb);
1160	struct tcphdr *th = tcp_hdr(skb);
1161	int genhash;
1162	unsigned char newhash[16];
1163
1164	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1165	hash_location = tcp_parse_md5sig_option(th);
1166
1167	/* We've parsed the options - do we have a hash? */
1168	if (!hash_expected && !hash_location)
1169		return 0;
1170
1171	if (hash_expected && !hash_location) {
1172		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1173		return 1;
1174	}
1175
1176	if (!hash_expected && hash_location) {
1177		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1178		return 1;
1179	}
1180
1181	/* Okay, so this is hash_expected and hash_location -
1182	 * so we need to calculate the checksum.
1183	 */
1184	genhash = tcp_v4_md5_hash_skb(newhash,
1185				      hash_expected,
1186				      NULL, NULL, skb);
1187
1188	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1189		if (net_ratelimit()) {
1190			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1191			       &iph->saddr, ntohs(th->source),
1192			       &iph->daddr, ntohs(th->dest),
1193			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1194		}
1195		return 1;
1196	}
1197	return 0;
1198}
1199
1200#endif
1201
1202struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1203	.family		=	PF_INET,
1204	.obj_size	=	sizeof(struct tcp_request_sock),
1205	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1206	.send_ack	=	tcp_v4_reqsk_send_ack,
1207	.destructor	=	tcp_v4_reqsk_destructor,
1208	.send_reset	=	tcp_v4_send_reset,
1209	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1210};
1211
1212#ifdef CONFIG_TCP_MD5SIG
1213static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1214	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1215	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1216};
1217#endif
1218
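/* Process an incoming SYN on a listening socket: allocate a request_sock,
 * parse the TCP options, fall back to syncookies under a SYN flood if
 * enabled, pick an initial sequence number and send the SYN-ACK.
 */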
1219int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220{
1221	struct tcp_extend_values tmp_ext;
1222	struct tcp_options_received tmp_opt;
1223	u8 *hash_location;
1224	struct request_sock *req;
1225	struct inet_request_sock *ireq;
1226	struct tcp_sock *tp = tcp_sk(sk);
1227	struct dst_entry *dst = NULL;
1228	__be32 saddr = ip_hdr(skb)->saddr;
1229	__be32 daddr = ip_hdr(skb)->daddr;
1230	__u32 isn = TCP_SKB_CB(skb)->when;
1231#ifdef CONFIG_SYN_COOKIES
1232	int want_cookie = 0;
1233#else
1234#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1235#endif
1236
1237	/* Never answer SYNs sent to broadcast or multicast */
1238	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239		goto drop;
1240
1241	/* TW buckets are converted to open requests without
1242	 * limitation; they conserve resources and the peer is
1243	 * evidently a real one.
1244	 */
1245	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1246		if (net_ratelimit())
1247			syn_flood_warning(skb);
1248#ifdef CONFIG_SYN_COOKIES
1249		if (sysctl_tcp_syncookies) {
1250			want_cookie = 1;
1251		} else
1252#endif
1253		goto drop;
1254	}
1255
1256	/* Accept backlog is full. If we have already queued enough
1257	 * warm entries in the syn queue, drop the request. It is better than
1258	 * clogging the syn queue with openreqs with exponentially increasing
1259	 * timeouts.
1260	 */
1261	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1262		goto drop;
1263
1264	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1265	if (!req)
1266		goto drop;
1267
1268#ifdef CONFIG_TCP_MD5SIG
1269	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1270#endif
1271
1272	tcp_clear_options(&tmp_opt);
1273	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1274	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1275	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1276
1277	if (tmp_opt.cookie_plus > 0 &&
1278	    tmp_opt.saw_tstamp &&
1279	    !tp->rx_opt.cookie_out_never &&
1280	    (sysctl_tcp_cookie_size > 0 ||
1281	     (tp->cookie_values != NULL &&
1282	      tp->cookie_values->cookie_desired > 0))) {
1283		u8 *c;
1284		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1285		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1286
1287		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1288			goto drop_and_release;
1289
1290		/* Secret recipe starts with IP addresses */
1291		*mess++ ^= (__force u32)daddr;
1292		*mess++ ^= (__force u32)saddr;
1293
1294		/* plus variable length Initiator Cookie */
1295		c = (u8 *)mess;
1296		while (l-- > 0)
1297			*c++ ^= *hash_location++;
1298
1299#ifdef CONFIG_SYN_COOKIES
1300		want_cookie = 0;	/* not our kind of cookie */
1301#endif
1302		tmp_ext.cookie_out_never = 0; /* false */
1303		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1304	} else if (!tp->rx_opt.cookie_in_always) {
1305		/* redundant indications, but ensure initialization. */
1306		tmp_ext.cookie_out_never = 1; /* true */
1307		tmp_ext.cookie_plus = 0;
1308	} else {
1309		goto drop_and_release;
1310	}
1311	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1312
1313	if (want_cookie && !tmp_opt.saw_tstamp)
1314		tcp_clear_options(&tmp_opt);
1315
1316	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1317	tcp_openreq_init(req, &tmp_opt, skb);
1318
1319	ireq = inet_rsk(req);
1320	ireq->loc_addr = daddr;
1321	ireq->rmt_addr = saddr;
1322	ireq->no_srccheck = inet_sk(sk)->transparent;
1323	ireq->opt = tcp_v4_save_options(sk, skb);
1324
1325	if (security_inet_conn_request(sk, skb, req))
1326		goto drop_and_free;
1327
1328	if (!want_cookie || tmp_opt.tstamp_ok)
1329		TCP_ECN_create_request(req, tcp_hdr(skb));
1330
1331	if (want_cookie) {
1332		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1333		req->cookie_ts = tmp_opt.tstamp_ok;
1334	} else if (!isn) {
1335		struct inet_peer *peer = NULL;
1336
1337		/* VJ's idea. We save the last timestamp seen
1338		 * from the destination in the peer table when entering
1339		 * state TIME-WAIT, and check against it before
1340		 * accepting a new connection request.
1341		 *
1342		 * If "isn" is not zero, this request hit an alive
1343		 * timewait bucket, so all the necessary checks
1344		 * are made in the function processing timewait state.
1345		 */
1346		if (tmp_opt.saw_tstamp &&
1347		    tcp_death_row.sysctl_tw_recycle &&
1348		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1349		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350		    peer->daddr.addr.a4 == saddr) {
1351			inet_peer_refcheck(peer);
1352			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353			    (s32)(peer->tcp_ts - req->ts_recent) >
1354							TCP_PAWS_WINDOW) {
1355				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1356				goto drop_and_release;
1357			}
1358		}
1359		/* Kill the following clause, if you dislike this way. */
1360		else if (!sysctl_tcp_syncookies &&
1361			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1362			  (sysctl_max_syn_backlog >> 2)) &&
1363			 (!peer || !peer->tcp_ts_stamp) &&
1364			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1365			/* Without syncookies the last quarter of
1366			 * the backlog is filled only with destinations
1367			 * proven to be alive.
1368			 * It means that we continue to communicate
1369			 * with destinations already remembered
1370			 * at the moment of the synflood.
1371			 */
1372			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1373				       &saddr, ntohs(tcp_hdr(skb)->source));
1374			goto drop_and_release;
1375		}
1376
1377		isn = tcp_v4_init_sequence(skb);
1378	}
1379	tcp_rsk(req)->snt_isn = isn;
1380
1381	if (tcp_v4_send_synack(sk, dst, req,
1382			       (struct request_values *)&tmp_ext) ||
1383	    want_cookie)
1384		goto drop_and_free;
1385
1386	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387	return 0;
1388
1389drop_and_release:
1390	dst_release(dst);
1391drop_and_free:
1392	reqsk_free(req);
1393drop:
1394	return 0;
1395}
1396EXPORT_SYMBOL(tcp_v4_conn_request);
1397
1398
1399/*
1400 * The three way handshake has completed - we got a valid synack -
1401 * now create the new socket.
1402 */
1403struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1404				  struct request_sock *req,
1405				  struct dst_entry *dst)
1406{
1407	struct inet_request_sock *ireq;
1408	struct inet_sock *newinet;
1409	struct tcp_sock *newtp;
1410	struct sock *newsk;
1411#ifdef CONFIG_TCP_MD5SIG
1412	struct tcp_md5sig_key *key;
1413#endif
1414
1415	if (sk_acceptq_is_full(sk))
1416		goto exit_overflow;
1417
1418	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1419		goto exit;
1420
1421	newsk = tcp_create_openreq_child(sk, req, skb);
1422	if (!newsk)
1423		goto exit_nonewsk;
1424
1425	newsk->sk_gso_type = SKB_GSO_TCPV4;
1426	sk_setup_caps(newsk, dst);
1427
1428	newtp		      = tcp_sk(newsk);
1429	newinet		      = inet_sk(newsk);
1430	ireq		      = inet_rsk(req);
1431	newinet->inet_daddr   = ireq->rmt_addr;
1432	newinet->inet_rcv_saddr = ireq->loc_addr;
1433	newinet->inet_saddr	      = ireq->loc_addr;
1434	newinet->opt	      = ireq->opt;
1435	ireq->opt	      = NULL;
1436	newinet->mc_index     = inet_iif(skb);
1437	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1438	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1439	if (newinet->opt)
1440		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1441	newinet->inet_id = newtp->write_seq ^ jiffies;
1442
1443	tcp_mtup_init(newsk);
1444	tcp_sync_mss(newsk, dst_mtu(dst));
1445	newtp->advmss = dst_metric_advmss(dst);
1446	if (tcp_sk(sk)->rx_opt.user_mss &&
1447	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1448		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1449
1450	tcp_initialize_rcv_mss(newsk);
1451
1452#ifdef CONFIG_TCP_MD5SIG
1453	/* Copy over the MD5 key from the original socket */
1454	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1455	if (key != NULL) {
1456		/*
1457		 * We're using one, so create a matching key
1458		 * on the newsk structure. If we fail to get
1459		 * memory, then we end up not copying the key
1460		 * across. Shucks.
1461		 */
1462		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463		if (newkey != NULL)
1464			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1465					  newkey, key->keylen);
1466		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1467	}
1468#endif
1469
1470	if (__inet_inherit_port(sk, newsk) < 0) {
1471		sock_put(newsk);
1472		goto exit;
1473	}
1474	__inet_hash_nolisten(newsk, NULL);
1475
1476	return newsk;
1477
1478exit_overflow:
1479	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1480exit_nonewsk:
1481	dst_release(dst);
1482exit:
1483	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1484	return NULL;
1485}
1486EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1487
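/* For a segment arriving on a listening socket, look for a matching
 * half-open request or an already established child socket; non-SYN
 * segments may also be validated as syncookie ACKs.
 */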
1488static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1489{
1490	struct tcphdr *th = tcp_hdr(skb);
1491	const struct iphdr *iph = ip_hdr(skb);
1492	struct sock *nsk;
1493	struct request_sock **prev;
1494	/* Find possible connection requests. */
1495	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1496						       iph->saddr, iph->daddr);
1497	if (req)
1498		return tcp_check_req(sk, skb, req, prev);
1499
1500	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1501			th->source, iph->daddr, th->dest, inet_iif(skb));
1502
1503	if (nsk) {
1504		if (nsk->sk_state != TCP_TIME_WAIT) {
1505			bh_lock_sock(nsk);
1506			return nsk;
1507		}
1508		inet_twsk_put(inet_twsk(nsk));
1509		return NULL;
1510	}
1511
1512#ifdef CONFIG_SYN_COOKIES
1513	if (!th->syn)
1514		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1515#endif
1516	return sk;
1517}
1518
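/* Verify the TCP checksum on receive where it is cheap to do so: short
 * segments (<= 76 bytes) are checked immediately, larger ones are left
 * for later with the pseudo-header sum seeded into skb->csum.
 */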
1519static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1520{
1521	const struct iphdr *iph = ip_hdr(skb);
1522
1523	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1524		if (!tcp_v4_check(skb->len, iph->saddr,
1525				  iph->daddr, skb->csum)) {
1526			skb->ip_summed = CHECKSUM_UNNECESSARY;
1527			return 0;
1528		}
1529	}
1530
1531	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1532				       skb->len, IPPROTO_TCP, 0);
1533
1534	if (skb->len <= 76) {
1535		return __skb_checksum_complete(skb);
1536	}
1537	return 0;
1538}
1539
1540
1541/* The socket must have its spinlock held when we get
1542 * here.
1543 *
1544 * We have a potential double-lock case here, so even when
1545 * doing backlog processing we use the BH locking scheme.
1546 * This is because we cannot sleep with the original spinlock
1547 * held.
1548 */
1549int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1550{
1551	struct sock *rsk;
1552#ifdef CONFIG_TCP_MD5SIG
1553	/*
1554	 * We really want to reject the packet as early as possible
1555	 * if:
1556	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1557	 *  o There is an MD5 option and we're not expecting one
1558	 */
1559	if (tcp_v4_inbound_md5_hash(sk, skb))
1560		goto discard;
1561#endif
1562
1563	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1564		sock_rps_save_rxhash(sk, skb->rxhash);
1565		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1566			rsk = sk;
1567			goto reset;
1568		}
1569		return 0;
1570	}
1571
1572	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1573		goto csum_err;
1574
1575	if (sk->sk_state == TCP_LISTEN) {
1576		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577		if (!nsk)
1578			goto discard;
1579
1580		if (nsk != sk) {
1581			if (tcp_child_process(sk, nsk, skb)) {
1582				rsk = nsk;
1583				goto reset;
1584			}
1585			return 0;
1586		}
1587	} else
1588		sock_rps_save_rxhash(sk, skb->rxhash);
1589
1590	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1591		rsk = sk;
1592		goto reset;
1593	}
1594	return 0;
1595
1596reset:
1597	tcp_v4_send_reset(rsk, skb);
1598discard:
1599	kfree_skb(skb);
1600	/* Be careful here. If this function gets more complicated and
1601	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1602	 * might be destroyed here. This current version compiles correctly,
1603	 * but you have been warned.
1604	 */
1605	return 0;
1606
1607csum_err:
1608	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1609	goto discard;
1610}
1611EXPORT_SYMBOL(tcp_v4_do_rcv);
1612
1613/*
1614 *	From tcp_input.c
1615 */
1616
1617int tcp_v4_rcv(struct sk_buff *skb)
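/* Main receive entry point for IPv4 TCP segments, called from the IP
 * layer.  Validates the header, looks up the owning socket and hands the
 * segment to tcp_v4_do_rcv(), the prequeue or the socket backlog.
 */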
1618{
1619	const struct iphdr *iph;
1620	struct tcphdr *th;
1621	struct sock *sk;
1622	int ret;
1623	struct net *net = dev_net(skb->dev);
1624
1625	if (skb->pkt_type != PACKET_HOST)
1626		goto discard_it;
1627
1628	/* Count it even if it's bad */
1629	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1630
1631	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632		goto discard_it;
1633
1634	th = tcp_hdr(skb);
1635
1636	if (th->doff < sizeof(struct tcphdr) / 4)
1637		goto bad_packet;
1638	if (!pskb_may_pull(skb, th->doff * 4))
1639		goto discard_it;
1640
1641	/* An explanation is required here, I think.
1642	 * Packet length and doff are validated by header prediction,
1643	 * provided the case of th->doff==0 is eliminated.
1644	 * So, we defer the checks. */
1645	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1646		goto bad_packet;
1647
1648	th = tcp_hdr(skb);
1649	iph = ip_hdr(skb);
1650	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1651	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1652				    skb->len - th->doff * 4);
1653	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1654	TCP_SKB_CB(skb)->when	 = 0;
1655	TCP_SKB_CB(skb)->flags	 = iph->tos;
1656	TCP_SKB_CB(skb)->sacked	 = 0;
1657
1658	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1659	if (!sk)
1660		goto no_tcp_socket;
1661
1662process:
1663	if (sk->sk_state == TCP_TIME_WAIT)
1664		goto do_time_wait;
1665
1666	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1667		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1668		goto discard_and_relse;
1669	}
1670
1671	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1672		goto discard_and_relse;
1673	nf_reset(skb);
1674
1675	if (sk_filter(sk, skb))
1676		goto discard_and_relse;
1677
1678	skb->dev = NULL;
1679
1680	bh_lock_sock_nested(sk);
1681	ret = 0;
1682	if (!sock_owned_by_user(sk)) {
1683#ifdef CONFIG_NET_DMA
1684		struct tcp_sock *tp = tcp_sk(sk);
1685		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1686			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1687		if (tp->ucopy.dma_chan)
1688			ret = tcp_v4_do_rcv(sk, skb);
1689		else
1690#endif
1691		{
1692			if (!tcp_prequeue(sk, skb))
1693				ret = tcp_v4_do_rcv(sk, skb);
1694		}
1695	} else if (unlikely(sk_add_backlog(sk, skb))) {
1696		bh_unlock_sock(sk);
1697		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1698		goto discard_and_relse;
1699	}
1700	bh_unlock_sock(sk);
1701
1702	sock_put(sk);
1703
1704	return ret;
1705
1706no_tcp_socket:
1707	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1708		goto discard_it;
1709
1710	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711bad_packet:
1712		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1713	} else {
1714		tcp_v4_send_reset(NULL, skb);
1715	}
1716
1717discard_it:
1718	/* Discard frame. */
1719	kfree_skb(skb);
1720	return 0;
1721
1722discard_and_relse:
1723	sock_put(sk);
1724	goto discard_it;
1725
1726do_time_wait:
1727	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1728		inet_twsk_put(inet_twsk(sk));
1729		goto discard_it;
1730	}
1731
1732	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1733		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1734		inet_twsk_put(inet_twsk(sk));
1735		goto discard_it;
1736	}
1737	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1738	case TCP_TW_SYN: {
1739		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1740							&tcp_hashinfo,
1741							iph->daddr, th->dest,
1742							inet_iif(skb));
1743		if (sk2) {
1744			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1745			inet_twsk_put(inet_twsk(sk));
1746			sk = sk2;
1747			goto process;
1748		}
1749		/* Fall through to ACK */
1750	}
1751	case TCP_TW_ACK:
1752		tcp_v4_timewait_ack(sk, skb);
1753		break;
1754	case TCP_TW_RST:
1755		goto no_tcp_socket;
1756	case TCP_TW_SUCCESS:;
1757	}
1758	goto discard_it;
1759}
1760
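/* Return the inet_peer entry for this connection's destination, using the
 * one cached in the route when it still matches; *release_it tells the
 * caller whether it must drop the reference itself.
 */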
1761struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1762{
1763	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1764	struct inet_sock *inet = inet_sk(sk);
1765	struct inet_peer *peer;
1766
1767	if (!rt || rt->rt_dst != inet->inet_daddr) {
1768		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1769		*release_it = true;
1770	} else {
1771		if (!rt->peer)
1772			rt_bind_peer(rt, 1);
1773		peer = rt->peer;
1774		*release_it = false;
1775	}
1776
1777	return peer;
1778}
1779EXPORT_SYMBOL(tcp_v4_get_peer);
1780
1781void *tcp_v4_tw_get_peer(struct sock *sk)
1782{
1783	struct inet_timewait_sock *tw = inet_twsk(sk);
1784
1785	return inet_getpeer_v4(tw->tw_daddr, 1);
1786}
1787EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1788
1789static struct timewait_sock_ops tcp_timewait_sock_ops = {
1790	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1791	.twsk_unique	= tcp_twsk_unique,
1792	.twsk_destructor= tcp_twsk_destructor,
1793	.twsk_getpeer	= tcp_v4_tw_get_peer,
1794};
1795
1796const struct inet_connection_sock_af_ops ipv4_specific = {
1797	.queue_xmit	   = ip_queue_xmit,
1798	.send_check	   = tcp_v4_send_check,
1799	.rebuild_header	   = inet_sk_rebuild_header,
1800	.conn_request	   = tcp_v4_conn_request,
1801	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1802	.get_peer	   = tcp_v4_get_peer,
1803	.net_header_len	   = sizeof(struct iphdr),
1804	.setsockopt	   = ip_setsockopt,
1805	.getsockopt	   = ip_getsockopt,
1806	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1807	.sockaddr_len	   = sizeof(struct sockaddr_in),
1808	.bind_conflict	   = inet_csk_bind_conflict,
1809#ifdef CONFIG_COMPAT
1810	.compat_setsockopt = compat_ip_setsockopt,
1811	.compat_getsockopt = compat_ip_getsockopt,
1812#endif
1813};
1814EXPORT_SYMBOL(ipv4_specific);
1815
1816#ifdef CONFIG_TCP_MD5SIG
1817static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1818	.md5_lookup		= tcp_v4_md5_lookup,
1819	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1820	.md5_add		= tcp_v4_md5_add_func,
1821	.md5_parse		= tcp_v4_parse_md5_keys,
1822};
1823#endif
1824
1825/* NOTE: A lot of things are set to zero explicitly by the call to
1826 *       sk_alloc(), so they need not be done here.
1827 */
1828static int tcp_v4_init_sock(struct sock *sk)
1829{
1830	struct inet_connection_sock *icsk = inet_csk(sk);
1831	struct tcp_sock *tp = tcp_sk(sk);
1832
1833	skb_queue_head_init(&tp->out_of_order_queue);
1834	tcp_init_xmit_timers(sk);
1835	tcp_prequeue_init(tp);
1836
1837	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1838	tp->mdev = TCP_TIMEOUT_INIT;
1839
1840	/* So many TCP implementations out there (incorrectly) count the
1841	 * initial SYN frame in their delayed-ACK and congestion control
1842	 * algorithms that we must have the following bandaid to talk
1843	 * efficiently to them.  -DaveM
1844	 */
1845	tp->snd_cwnd = 2;
1846
1847	/* See draft-stevens-tcpca-spec-01 for discussion of the
1848	 * initialization of these values.
1849	 */
1850	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1851	tp->snd_cwnd_clamp = ~0;
1852	tp->mss_cache = TCP_MSS_DEFAULT;
1853
1854	tp->reordering = sysctl_tcp_reordering;
1855	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1856
1857	sk->sk_state = TCP_CLOSE;
1858
1859	sk->sk_write_space = sk_stream_write_space;
1860	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1861
1862	icsk->icsk_af_ops = &ipv4_specific;
1863	icsk->icsk_sync_mss = tcp_sync_mss;
1864#ifdef CONFIG_TCP_MD5SIG
1865	tp->af_specific = &tcp_sock_ipv4_specific;
1866#endif
1867
1868	/* TCP Cookie Transactions */
1869	if (sysctl_tcp_cookie_size > 0) {
1870		/* Default, cookies without s_data_payload. */
1871		tp->cookie_values =
1872			kzalloc(sizeof(*tp->cookie_values),
1873				sk->sk_allocation);
1874		if (tp->cookie_values != NULL)
1875			kref_init(&tp->cookie_values->kref);
1876	}
1877	/* Presumed zeroed, in order of appearance:
1878	 *	cookie_in_always, cookie_out_never,
1879	 *	s_data_constant, s_data_in, s_data_out
1880	 */
1881	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1882	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1883
1884	local_bh_disable();
1885	percpu_counter_inc(&tcp_sockets_allocated);
1886	local_bh_enable();
1887
1888	return 0;
1889}
1890
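/* Per-socket teardown: stop timers, purge the write, out-of-order and
 * prequeue queues, release any MD5 keys and the bound port.
 */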
1891void tcp_v4_destroy_sock(struct sock *sk)
1892{
1893	struct tcp_sock *tp = tcp_sk(sk);
1894
1895	tcp_clear_xmit_timers(sk);
1896
1897	tcp_cleanup_congestion_control(sk);
1898
1899	/* Clean up the write buffer. */
1900	tcp_write_queue_purge(sk);
1901
1902	/* Cleans up our, hopefully empty, out_of_order_queue. */
1903	__skb_queue_purge(&tp->out_of_order_queue);
1904
1905#ifdef CONFIG_TCP_MD5SIG
1906	/* Clean up the MD5 key list, if any */
1907	if (tp->md5sig_info) {
1908		tcp_v4_clear_md5_list(sk);
1909		kfree(tp->md5sig_info);
1910		tp->md5sig_info = NULL;
1911	}
1912#endif
1913
1914#ifdef CONFIG_NET_DMA
1915	/* Cleans up our sk_async_wait_queue */
1916	__skb_queue_purge(&sk->sk_async_wait_queue);
1917#endif
1918
1919	/* Clean prequeue, it must be empty really */
1920	__skb_queue_purge(&tp->ucopy.prequeue);
1921
1922	/* Clean up a referenced TCP bind bucket. */
1923	if (inet_csk(sk)->icsk_bind_hash)
1924		inet_put_port(sk);
1925
1926	/*
1927	 * If sendmsg cached page exists, toss it.
1928	 */
1929	if (sk->sk_sndmsg_page) {
1930		__free_page(sk->sk_sndmsg_page);
1931		sk->sk_sndmsg_page = NULL;
1932	}
1933
1934	/* TCP Cookie Transactions */
1935	if (tp->cookie_values != NULL) {
1936		kref_put(&tp->cookie_values->kref,
1937			 tcp_cookie_values_release);
1938		tp->cookie_values = NULL;
1939	}
1940
1941	percpu_counter_dec(&tcp_sockets_allocated);
1942}
1943EXPORT_SYMBOL(tcp_v4_destroy_sock);
1944
1945#ifdef CONFIG_PROC_FS
1946/* Proc filesystem TCP sock list dumping. */
1947
1948static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1949{
1950	return hlist_nulls_empty(head) ? NULL :
1951		list_entry(head->first, struct inet_timewait_sock, tw_node);
1952}
1953
1954static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1955{
1956	return !is_a_nulls(tw->tw_node.next) ?
1957		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1958}
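/*
 * tw_head()/tw_next() walk the time-wait chain of an ehash bucket.  The
 * chain is "nulls"-terminated rather than NULL-terminated, so the end of
 * the list is detected with is_a_nulls() and reported as NULL to callers.
 */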
1959
1960/*
1961 * Get the next listener socket following cur.  If cur is NULL, get the
1962 * first socket starting from the bucket given in st->bucket; when
1963 * st->bucket is zero, the very first socket in the hash table is returned.
1964 */
1965static void *listening_get_next(struct seq_file *seq, void *cur)
1966{
1967	struct inet_connection_sock *icsk;
1968	struct hlist_nulls_node *node;
1969	struct sock *sk = cur;
1970	struct inet_listen_hashbucket *ilb;
1971	struct tcp_iter_state *st = seq->private;
1972	struct net *net = seq_file_net(seq);
1973
1974	if (!sk) {
1975		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1976		spin_lock_bh(&ilb->lock);
1977		sk = sk_nulls_head(&ilb->head);
1978		st->offset = 0;
1979		goto get_sk;
1980	}
1981	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1982	++st->num;
1983	++st->offset;
1984
1985	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1986		struct request_sock *req = cur;
1987
1988		icsk = inet_csk(st->syn_wait_sk);
1989		req = req->dl_next;
1990		while (1) {
1991			while (req) {
1992				if (req->rsk_ops->family == st->family) {
1993					cur = req;
1994					goto out;
1995				}
1996				req = req->dl_next;
1997			}
1998			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1999				break;
2000get_req:
2001			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2002		}
2003		sk	  = sk_nulls_next(st->syn_wait_sk);
2004		st->state = TCP_SEQ_STATE_LISTENING;
2005		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006	} else {
2007		icsk = inet_csk(sk);
2008		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2010			goto start_req;
2011		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012		sk = sk_nulls_next(sk);
2013	}
2014get_sk:
2015	sk_nulls_for_each_from(sk, node) {
2016		if (!net_eq(sock_net(sk), net))
2017			continue;
2018		if (sk->sk_family == st->family) {
2019			cur = sk;
2020			goto out;
2021		}
2022		icsk = inet_csk(sk);
2023		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2025start_req:
2026			st->uid		= sock_i_uid(sk);
2027			st->syn_wait_sk = sk;
2028			st->state	= TCP_SEQ_STATE_OPENREQ;
2029			st->sbucket	= 0;
2030			goto get_req;
2031		}
2032		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033	}
2034	spin_unlock_bh(&ilb->lock);
2035	st->offset = 0;
2036	if (++st->bucket < INET_LHTABLE_SIZE) {
2037		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038		spin_lock_bh(&ilb->lock);
2039		sk = sk_nulls_head(&ilb->head);
2040		goto get_sk;
2041	}
2042	cur = NULL;
2043out:
2044	return cur;
2045}
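/*
 * Note on locking: listening_get_next() returns with the lock of the
 * current listening hash bucket held (and, while walking a SYN queue,
 * with syn_wait_lock read-held as well).  The locks are dropped either
 * here when moving to the next bucket or later in tcp_seq_stop().
 */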
2046
2047static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2048{
2049	struct tcp_iter_state *st = seq->private;
2050	void *rc;
2051
2052	st->bucket = 0;
2053	st->offset = 0;
2054	rc = listening_get_next(seq, NULL);
2055
2056	while (rc && *pos) {
2057		rc = listening_get_next(seq, rc);
2058		--*pos;
2059	}
2060	return rc;
2061}
2062
2063static inline int empty_bucket(struct tcp_iter_state *st)
2064{
2065	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2066		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2067}
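/*
 * empty_bucket() peeks at both the established chain and the time-wait
 * chain of the current ehash bucket without taking the bucket lock; it
 * lets the iterators below skip empty buckets cheaply.
 */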
2068
2069/*
2070 * Get the first established socket, starting from the bucket given in
2071 * st->bucket.  When st->bucket is zero, the very first socket is returned.
2072 */
2073static void *established_get_first(struct seq_file *seq)
2074{
2075	struct tcp_iter_state *st = seq->private;
2076	struct net *net = seq_file_net(seq);
2077	void *rc = NULL;
2078
2079	st->offset = 0;
2080	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2081		struct sock *sk;
2082		struct hlist_nulls_node *node;
2083		struct inet_timewait_sock *tw;
2084		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2085
2086		/* Lockless fast path for the common case of empty buckets */
2087		if (empty_bucket(st))
2088			continue;
2089
2090		spin_lock_bh(lock);
2091		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2092			if (sk->sk_family != st->family ||
2093			    !net_eq(sock_net(sk), net)) {
2094				continue;
2095			}
2096			rc = sk;
2097			goto out;
2098		}
2099		st->state = TCP_SEQ_STATE_TIME_WAIT;
2100		inet_twsk_for_each(tw, node,
2101				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2102			if (tw->tw_family != st->family ||
2103			    !net_eq(twsk_net(tw), net)) {
2104				continue;
2105			}
2106			rc = tw;
2107			goto out;
2108		}
2109		spin_unlock_bh(lock);
2110		st->state = TCP_SEQ_STATE_ESTABLISHED;
2111	}
2112out:
2113	return rc;
2114}
2115
2116static void *established_get_next(struct seq_file *seq, void *cur)
2117{
2118	struct sock *sk = cur;
2119	struct inet_timewait_sock *tw;
2120	struct hlist_nulls_node *node;
2121	struct tcp_iter_state *st = seq->private;
2122	struct net *net = seq_file_net(seq);
2123
2124	++st->num;
2125	++st->offset;
2126
2127	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128		tw = cur;
2129		tw = tw_next(tw);
2130get_tw:
2131		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2132			tw = tw_next(tw);
2133		}
2134		if (tw) {
2135			cur = tw;
2136			goto out;
2137		}
2138		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2139		st->state = TCP_SEQ_STATE_ESTABLISHED;
2140
2141		/* Look for the next non-empty bucket */
2142		st->offset = 0;
2143		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2144				empty_bucket(st))
2145			;
2146		if (st->bucket > tcp_hashinfo.ehash_mask)
2147			return NULL;
2148
2149		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2150		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2151	} else
2152		sk = sk_nulls_next(sk);
2153
2154	sk_nulls_for_each_from(sk, node) {
2155		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2156			goto found;
2157	}
2158
2159	st->state = TCP_SEQ_STATE_TIME_WAIT;
2160	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2161	goto get_tw;
2162found:
2163	cur = sk;
2164out:
2165	return cur;
2166}
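/*
 * As with the listening iterator, established_get_first()/_next() return
 * with the spinlock of the current ehash bucket held; it is released when
 * advancing to another bucket or in tcp_seq_stop().
 */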
2167
2168static void *established_get_idx(struct seq_file *seq, loff_t pos)
2169{
2170	struct tcp_iter_state *st = seq->private;
2171	void *rc;
2172
2173	st->bucket = 0;
2174	rc = established_get_first(seq);
2175
2176	while (rc && pos) {
2177		rc = established_get_next(seq, rc);
2178		--pos;
2179	}
2180	return rc;
2181}
2182
2183static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2184{
2185	void *rc;
2186	struct tcp_iter_state *st = seq->private;
2187
2188	st->state = TCP_SEQ_STATE_LISTENING;
2189	rc	  = listening_get_idx(seq, &pos);
2190
2191	if (!rc) {
2192		st->state = TCP_SEQ_STATE_ESTABLISHED;
2193		rc	  = established_get_idx(seq, pos);
2194	}
2195
2196	return rc;
2197}
2198
2199static void *tcp_seek_last_pos(struct seq_file *seq)
2200{
2201	struct tcp_iter_state *st = seq->private;
2202	int offset = st->offset;
2203	int orig_num = st->num;
2204	void *rc = NULL;
2205
2206	switch (st->state) {
2207	case TCP_SEQ_STATE_OPENREQ:
2208	case TCP_SEQ_STATE_LISTENING:
2209		if (st->bucket >= INET_LHTABLE_SIZE)
2210			break;
2211		st->state = TCP_SEQ_STATE_LISTENING;
2212		rc = listening_get_next(seq, NULL);
2213		while (offset-- && rc)
2214			rc = listening_get_next(seq, rc);
2215		if (rc)
2216			break;
2217		st->bucket = 0;
2218		/* Fallthrough */
2219	case TCP_SEQ_STATE_ESTABLISHED:
2220	case TCP_SEQ_STATE_TIME_WAIT:
2221		st->state = TCP_SEQ_STATE_ESTABLISHED;
2222		if (st->bucket > tcp_hashinfo.ehash_mask)
2223			break;
2224		rc = established_get_first(seq);
2225		while (offset-- && rc)
2226			rc = established_get_next(seq, rc);
2227	}
2228
2229	st->num = orig_num;
2230
2231	return rc;
2232}
2233
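/*
 * tcp_seq_start(), tcp_seq_next() and tcp_seq_stop() implement the
 * seq_file iterator protocol for the /proc listing (they are hooked into
 * afinfo->seq_ops in tcp_proc_register() below).  st->last_pos together
 * with tcp_seek_last_pos() lets a reader that resumes at the position it
 * previously stopped at continue from the saved bucket/offset instead of
 * rescanning the tables from the beginning.
 */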
2234static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2235{
2236	struct tcp_iter_state *st = seq->private;
2237	void *rc;
2238
2239	if (*pos && *pos == st->last_pos) {
2240		rc = tcp_seek_last_pos(seq);
2241		if (rc)
2242			goto out;
2243	}
2244
2245	st->state = TCP_SEQ_STATE_LISTENING;
2246	st->num = 0;
2247	st->bucket = 0;
2248	st->offset = 0;
2249	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2250
2251out:
2252	st->last_pos = *pos;
2253	return rc;
2254}
2255
2256static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257{
2258	struct tcp_iter_state *st = seq->private;
2259	void *rc = NULL;
2260
2261	if (v == SEQ_START_TOKEN) {
2262		rc = tcp_get_idx(seq, 0);
2263		goto out;
2264	}
2265
2266	switch (st->state) {
2267	case TCP_SEQ_STATE_OPENREQ:
2268	case TCP_SEQ_STATE_LISTENING:
2269		rc = listening_get_next(seq, v);
2270		if (!rc) {
2271			st->state = TCP_SEQ_STATE_ESTABLISHED;
2272			st->bucket = 0;
2273			st->offset = 0;
2274			rc	  = established_get_first(seq);
2275		}
2276		break;
2277	case TCP_SEQ_STATE_ESTABLISHED:
2278	case TCP_SEQ_STATE_TIME_WAIT:
2279		rc = established_get_next(seq, v);
2280		break;
2281	}
2282out:
2283	++*pos;
2284	st->last_pos = *pos;
2285	return rc;
2286}
2287
2288static void tcp_seq_stop(struct seq_file *seq, void *v)
2289{
2290	struct tcp_iter_state *st = seq->private;
2291
2292	switch (st->state) {
2293	case TCP_SEQ_STATE_OPENREQ:
2294		if (v) {
2295			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2296			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2297		}
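		/* fall through */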
2298	case TCP_SEQ_STATE_LISTENING:
2299		if (v != SEQ_START_TOKEN)
2300			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2301		break;
2302	case TCP_SEQ_STATE_TIME_WAIT:
2303	case TCP_SEQ_STATE_ESTABLISHED:
2304		if (v)
2305			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2306		break;
2307	}
2308}
2309
2310static int tcp_seq_open(struct inode *inode, struct file *file)
2311{
2312	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2313	struct tcp_iter_state *s;
2314	int err;
2315
2316	err = seq_open_net(inode, file, &afinfo->seq_ops,
2317			  sizeof(struct tcp_iter_state));
2318	if (err < 0)
2319		return err;
2320
2321	s = ((struct seq_file *)file->private_data)->private;
2322	s->family		= afinfo->family;
2323	s->last_pos 		= 0;
2324	return 0;
2325}
2326
2327int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2328{
2329	int rc = 0;
2330	struct proc_dir_entry *p;
2331
2332	afinfo->seq_fops.open		= tcp_seq_open;
2333	afinfo->seq_fops.read		= seq_read;
2334	afinfo->seq_fops.llseek		= seq_lseek;
2335	afinfo->seq_fops.release	= seq_release_net;
2336
2337	afinfo->seq_ops.start		= tcp_seq_start;
2338	afinfo->seq_ops.next		= tcp_seq_next;
2339	afinfo->seq_ops.stop		= tcp_seq_stop;
2340
2341	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2342			     &afinfo->seq_fops, afinfo);
2343	if (!p)
2344		rc = -ENOMEM;
2345	return rc;
2346}
2347EXPORT_SYMBOL(tcp_proc_register);
2348
2349void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2350{
2351	proc_net_remove(net, afinfo->name);
2352}
2353EXPORT_SYMBOL(tcp_proc_unregister);
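/*
 * tcp_proc_register()/tcp_proc_unregister() create and remove the
 * per-namespace /proc/net/<name> file described by an afinfo structure;
 * the IPv4 "tcp" file is set up this way by tcp4_proc_init_net() below.
 */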
2354
2355static void get_openreq4(struct sock *sk, struct request_sock *req,
2356			 struct seq_file *f, int i, int uid, int *len)
2357{
2358	const struct inet_request_sock *ireq = inet_rsk(req);
2359	int ttd = req->expires - jiffies;
2360
2361	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2362		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2363		i,
2364		ireq->loc_addr,
2365		ntohs(inet_sk(sk)->inet_sport),
2366		ireq->rmt_addr,
2367		ntohs(ireq->rmt_port),
2368		TCP_SYN_RECV,
2369		0, 0, /* could print option size, but that is af dependent. */
2370		1,    /* timers active (only the expire timer) */
2371		jiffies_to_clock_t(ttd),
2372		req->retrans,
2373		uid,
2374		0,  /* non standard timer */
2375		0, /* open_requests have no inode */
2376		atomic_read(&sk->sk_refcnt),
2377		req,
2378		len);
2379}
2380
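/*
 * get_tcp4_sock() formats one listening/established socket for the /proc
 * output.  The "timer_active" code it prints is derived below: 1 for a
 * pending retransmit timer, 4 for a zero-window probe timer, 2 when
 * sk_timer (normally the keepalive timer) is pending, and 0 when no timer
 * is armed.
 */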
2381static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2382{
2383	int timer_active;
2384	unsigned long timer_expires;
2385	struct tcp_sock *tp = tcp_sk(sk);
2386	const struct inet_connection_sock *icsk = inet_csk(sk);
2387	struct inet_sock *inet = inet_sk(sk);
2388	__be32 dest = inet->inet_daddr;
2389	__be32 src = inet->inet_rcv_saddr;
2390	__u16 destp = ntohs(inet->inet_dport);
2391	__u16 srcp = ntohs(inet->inet_sport);
2392	int rx_queue;
2393
2394	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2395		timer_active	= 1;
2396		timer_expires	= icsk->icsk_timeout;
2397	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2398		timer_active	= 4;
2399		timer_expires	= icsk->icsk_timeout;
2400	} else if (timer_pending(&sk->sk_timer)) {
2401		timer_active	= 2;
2402		timer_expires	= sk->sk_timer.expires;
2403	} else {
2404		timer_active	= 0;
2405		timer_expires = jiffies;
2406	}
2407
2408	if (sk->sk_state == TCP_LISTEN)
2409		rx_queue = sk->sk_ack_backlog;
2410	else
2411		/*
2412		 * Because we don't lock the socket, we might observe a transient negative value.
2413		 */
2414		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2415
2416	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2417			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2418		i, src, srcp, dest, destp, sk->sk_state,
2419		tp->write_seq - tp->snd_una,
2420		rx_queue,
2421		timer_active,
2422		jiffies_to_clock_t(timer_expires - jiffies),
2423		icsk->icsk_retransmits,
2424		sock_i_uid(sk),
2425		icsk->icsk_probes_out,
2426		sock_i_ino(sk),
2427		atomic_read(&sk->sk_refcnt), sk,
2428		jiffies_to_clock_t(icsk->icsk_rto),
2429		jiffies_to_clock_t(icsk->icsk_ack.ato),
2430		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2431		tp->snd_cwnd,
2432		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2433		len);
2434}
2435
2436static void get_timewait4_sock(struct inet_timewait_sock *tw,
2437			       struct seq_file *f, int i, int *len)
2438{
2439	__be32 dest, src;
2440	__u16 destp, srcp;
2441	int ttd = tw->tw_ttd - jiffies;
2442
2443	if (ttd < 0)
2444		ttd = 0;
2445
2446	dest  = tw->tw_daddr;
2447	src   = tw->tw_rcv_saddr;
2448	destp = ntohs(tw->tw_dport);
2449	srcp  = ntohs(tw->tw_sport);
2450
2451	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2452		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2453		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2454		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2455		atomic_read(&tw->tw_refcnt), tw, len);
2456}
2457
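/*
 * TMPSZ is the fixed width of one /proc/net/tcp line; tcp4_seq_show()
 * pads every record out to TMPSZ - 1 characters so that all lines have
 * the same length.
 */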
2458#define TMPSZ 150
2459
2460static int tcp4_seq_show(struct seq_file *seq, void *v)
2461{
2462	struct tcp_iter_state *st;
2463	int len;
2464
2465	if (v == SEQ_START_TOKEN) {
2466		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2467			   "  sl  local_address rem_address   st tx_queue "
2468			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2469			   "inode");
2470		goto out;
2471	}
2472	st = seq->private;
2473
2474	switch (st->state) {
2475	case TCP_SEQ_STATE_LISTENING:
2476	case TCP_SEQ_STATE_ESTABLISHED:
2477		get_tcp4_sock(v, seq, st->num, &len);
2478		break;
2479	case TCP_SEQ_STATE_OPENREQ:
2480		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2481		break;
2482	case TCP_SEQ_STATE_TIME_WAIT:
2483		get_timewait4_sock(v, seq, st->num, &len);
2484		break;
2485	}
2486	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2487out:
2488	return 0;
2489}
2490
2491static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2492	.name		= "tcp",
2493	.family		= AF_INET,
2494	.seq_fops	= {
2495		.owner		= THIS_MODULE,
2496	},
2497	.seq_ops	= {
2498		.show		= tcp4_seq_show,
2499	},
2500};
2501
2502static int __net_init tcp4_proc_init_net(struct net *net)
2503{
2504	return tcp_proc_register(net, &tcp4_seq_afinfo);
2505}
2506
2507static void __net_exit tcp4_proc_exit_net(struct net *net)
2508{
2509	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2510}
2511
2512static struct pernet_operations tcp4_net_ops = {
2513	.init = tcp4_proc_init_net,
2514	.exit = tcp4_proc_exit_net,
2515};
2516
2517int __init tcp4_proc_init(void)
2518{
2519	return register_pernet_subsys(&tcp4_net_ops);
2520}
2521
2522void tcp4_proc_exit(void)
2523{
2524	unregister_pernet_subsys(&tcp4_net_ops);
2525}
2526#endif /* CONFIG_PROC_FS */
2527
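/*
 * GRO receive hook for IPv4 TCP.  With CHECKSUM_COMPLETE the checksum is
 * verified against the pseudo-header via tcp_v4_check(); on success the
 * skb is marked CHECKSUM_UNNECESSARY, while on failure (or with
 * CHECKSUM_NONE) the packet is flagged so GRO flushes it instead of
 * aggregating it.  Everything else is left to the generic
 * tcp_gro_receive().
 */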
2528struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2529{
2530	const struct iphdr *iph = skb_gro_network_header(skb);
2531
2532	switch (skb->ip_summed) {
2533	case CHECKSUM_COMPLETE:
2534		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2535				  skb->csum)) {
2536			skb->ip_summed = CHECKSUM_UNNECESSARY;
2537			break;
2538		}
2539
2540		/* fall through */
2541	case CHECKSUM_NONE:
2542		NAPI_GRO_CB(skb)->flush = 1;
2543		return NULL;
2544	}
2545
2546	return tcp_gro_receive(head, skb);
2547}
2548
2549int tcp4_gro_complete(struct sk_buff *skb)
2550{
2551	const struct iphdr *iph = ip_hdr(skb);
2552	struct tcphdr *th = tcp_hdr(skb);
2553
2554	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2555				  iph->saddr, iph->daddr, 0);
2556	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2557
2558	return tcp_gro_complete(skb);
2559}
2560
2561struct proto tcp_prot = {
2562	.name			= "TCP",
2563	.owner			= THIS_MODULE,
2564	.close			= tcp_close,
2565	.connect		= tcp_v4_connect,
2566	.disconnect		= tcp_disconnect,
2567	.accept			= inet_csk_accept,
2568	.ioctl			= tcp_ioctl,
2569	.init			= tcp_v4_init_sock,
2570	.destroy		= tcp_v4_destroy_sock,
2571	.shutdown		= tcp_shutdown,
2572	.setsockopt		= tcp_setsockopt,
2573	.getsockopt		= tcp_getsockopt,
2574	.recvmsg		= tcp_recvmsg,
2575	.sendmsg		= tcp_sendmsg,
2576	.sendpage		= tcp_sendpage,
2577	.backlog_rcv		= tcp_v4_do_rcv,
2578	.hash			= inet_hash,
2579	.unhash			= inet_unhash,
2580	.get_port		= inet_csk_get_port,
2581	.enter_memory_pressure	= tcp_enter_memory_pressure,
2582	.sockets_allocated	= &tcp_sockets_allocated,
2583	.orphan_count		= &tcp_orphan_count,
2584	.memory_allocated	= &tcp_memory_allocated,
2585	.memory_pressure	= &tcp_memory_pressure,
2586	.sysctl_mem		= sysctl_tcp_mem,
2587	.sysctl_wmem		= sysctl_tcp_wmem,
2588	.sysctl_rmem		= sysctl_tcp_rmem,
2589	.max_header		= MAX_TCP_HEADER,
2590	.obj_size		= sizeof(struct tcp_sock),
2591	.slab_flags		= SLAB_DESTROY_BY_RCU,
2592	.twsk_prot		= &tcp_timewait_sock_ops,
2593	.rsk_prot		= &tcp_request_sock_ops,
2594	.h.hashinfo		= &tcp_hashinfo,
2595	.no_autobind		= true,
2596#ifdef CONFIG_COMPAT
2597	.compat_setsockopt	= compat_tcp_setsockopt,
2598	.compat_getsockopt	= compat_tcp_getsockopt,
2599#endif
2600};
2601EXPORT_SYMBOL(tcp_prot);
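/*
 * tcp_prot supplies the protocol callbacks behind IPv4 SOCK_STREAM
 * sockets.  Illustrative userspace sketch (daddr is a hypothetical
 * sockaddr_in; the dispatch happens in the AF_INET glue, not here):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 *	connect(fd, (struct sockaddr *)&daddr, sizeof(daddr));
 *
 * The connect() above eventually reaches .connect, i.e. tcp_v4_connect().
 */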
2602
2603
2604static int __net_init tcp_sk_init(struct net *net)
2605{
2606	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2607				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2608}
2609
2610static void __net_exit tcp_sk_exit(struct net *net)
2611{
2612	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2613}
2614
2615static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616{
2617	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2618}
2619
2620static struct pernet_operations __net_initdata tcp_sk_ops = {
2621	.init		= tcp_sk_init,
2622	.exit		= tcp_sk_exit,
2623	.exit_batch	= tcp_sk_exit_batch,
2624};
2625
2626void __init tcp_v4_init(void)
2627{
2628	inet_hashinfo_init(&tcp_hashinfo);
2629	if (register_pernet_subsys(&tcp_sk_ops))
2630		panic("Failed to create the TCP control socket.\n");
2631}
2632