tcp.c revision c3f1dbaf6e281642848b78fe101764170c15f168
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
11 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
15 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
16 *		Matthew Dillon, <dillon@apollo.west.oic.com>
17 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *		Jorge Cwik, <jorge@laser.satlink.net>
19 *
20 * Fixes:
21 *		Alan Cox	:	Numerous verify_area() calls
22 *		Alan Cox	:	Set the ACK bit on a reset
23 *		Alan Cox	:	Stopped it crashing if it closed while
24 *					sk->inuse=1 and was trying to connect
25 *					(tcp_err()).
26 *		Alan Cox	:	All icmp error handling was broken
27 *					pointers passed where wrong and the
28 *					socket was looked up backwards. Nobody
29 *					tested any icmp error code obviously.
30 *		Alan Cox	:	tcp_err() now handled properly. It
31 *					wakes people on errors. poll
32 *					behaves and the icmp error race
33 *					has gone by moving it into sock.c
34 *		Alan Cox	:	tcp_send_reset() fixed to work for
35 *					everything not just packets for
36 *					unknown sockets.
37 *		Alan Cox	:	tcp option processing.
38 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
39 *					syn rule wrong]
40 *		Herp Rosmanith  :	More reset fixes
41 *		Alan Cox	:	No longer acks invalid rst frames.
42 *					Acking any kind of RST is right out.
43 *		Alan Cox	:	Sets an ignore me flag on an rst
44 *					receive otherwise odd bits of prattle
45 *					escape still
46 *		Alan Cox	:	Fixed another acking RST frame bug.
47 *					Should stop LAN workplace lockups.
48 *		Alan Cox	: 	Some tidyups using the new skb list
49 *					facilities
50 *		Alan Cox	:	sk->keepopen now seems to work
51 *		Alan Cox	:	Pulls options out correctly on accepts
52 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
53 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
54 *					bit to skb ops.
55 *		Alan Cox	:	Tidied tcp_data to avoid a potential
56 *					nasty.
57 *		Alan Cox	:	Added some better commenting, as the
58 *					tcp is hard to follow
59 *		Alan Cox	:	Removed incorrect check for 20 * psh
60 *	Michael O'Reilly	:	ack < copied bug fix.
61 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
62 *		Alan Cox	:	FIN with no memory -> CRASH
63 *		Alan Cox	:	Added socket option proto entries.
64 *					Also added awareness of them to accept.
65 *		Alan Cox	:	Added TCP options (SOL_TCP)
66 *		Alan Cox	:	Switched wakeup calls to callbacks,
67 *					so the kernel can layer network
68 *					sockets.
69 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
70 *		Alan Cox	:	Handle FIN (more) properly (we hope).
71 *		Alan Cox	:	RST frames sent on unsynchronised
72 *					state ack error.
73 *		Alan Cox	:	Put in missing check for SYN bit.
74 *		Alan Cox	:	Added tcp_select_window() aka NET2E
75 *					window non shrink trick.
76 *		Alan Cox	:	Added a couple of small NET2E timer
77 *					fixes
78 *		Charles Hedrick :	TCP fixes
79 *		Toomas Tamm	:	TCP window fixes
80 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
81 *		Charles Hedrick	:	Rewrote most of it to actually work
82 *		Linus		:	Rewrote tcp_read() and URG handling
83 *					completely
84 *		Gerhard Koerting:	Fixed some missing timer handling
85 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
86 *		Gerhard Koerting:	PC/TCP workarounds
87 *		Adam Caldwell	:	Assorted timer/timing errors
88 *		Matthew Dillon	:	Fixed another RST bug
89 *		Alan Cox	:	Move to kernel side addressing changes.
90 *		Alan Cox	:	Beginning work on TCP fastpathing
91 *					(not yet usable)
92 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
93 *		Alan Cox	:	TCP fast path debugging
94 *		Alan Cox	:	Window clamping
95 *		Michael Riepe	:	Bug in tcp_check()
96 *		Matt Dillon	:	More TCP improvements and RST bug fixes
97 *		Matt Dillon	:	Yet more small nasties removed from the
98 *					TCP code (Be very nice to this man if
99 *					tcp finally works 100%) 8)
100 *		Alan Cox	:	BSD accept semantics.
101 *		Alan Cox	:	Reset on closedown bug.
102 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
103 *		Michael Pall	:	Handle poll() after URG properly in
104 *					all cases.
105 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
106 *					(multi URG PUSH broke rlogin).
107 *		Michael Pall	:	Fix the multi URG PUSH problem in
108 *					tcp_readable(), poll() after URG
109 *					works now.
110 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
111 *					BSD api.
112 *		Alan Cox	:	Changed the semantics of sk->socket to
113 *					fix a race and a signal problem with
114 *					accept() and async I/O.
115 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
116 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
117 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
118 *					clients/servers which listen in on
119 *					fixed ports.
120 *		Alan Cox	:	Cleaned the above up and shrank it to
121 *					a sensible code size.
122 *		Alan Cox	:	Self connect lockup fix.
123 *		Alan Cox	:	No connect to multicast.
124 *		Ross Biro	:	Close unaccepted children on master
125 *					socket close.
126 *		Alan Cox	:	Reset tracing code.
127 *		Alan Cox	:	Spurious resets on shutdown.
128 *		Alan Cox	:	Giant 15 minute/60 second timer error
129 *		Alan Cox	:	Small whoops in polling before an
130 *					accept.
131 *		Alan Cox	:	Kept the state trace facility since
132 *					it's handy for debugging.
133 *		Alan Cox	:	More reset handler fixes.
134 *		Alan Cox	:	Started rewriting the code based on
135 *					the RFC's for other useful protocol
136 *					references see: Comer, KA9Q NOS, and
137 *					for a reference on the difference
138 *					between specifications and how BSD
139 *					works see the 4.4lite source.
140 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
141 *					close.
142 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
143 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
144 *		Alan Cox	:	Reimplemented timers as per the RFC
145 *					and using multiple timers for sanity.
146 *		Alan Cox	:	Small bug fixes, and a lot of new
147 *					comments.
148 *		Alan Cox	:	Fixed dual reader crash by locking
149 *					the buffers (much like datagram.c)
150 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
151 *					now gets fed up of retrying without
152 *					(even a no space) answer.
153 *		Alan Cox	:	Extracted closing code better
154 *		Alan Cox	:	Fixed the closing state machine to
155 *					resemble the RFC.
156 *		Alan Cox	:	More 'per spec' fixes.
157 *		Jorge Cwik	:	Even faster checksumming.
158 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
159 *					only frames. At least one pc tcp stack
160 *					generates them.
161 *		Alan Cox	:	Cache last socket.
162 *		Alan Cox	:	Per route irtt.
163 *		Matt Day	:	poll()->select() match BSD precisely on error
164 *		Alan Cox	:	New buffers
165 *		Marc Tamsky	:	Various sk->prot->retransmits and
166 *					sk->retransmits misupdating fixed.
167 *					Fixed tcp_write_timeout: stuck close,
168 *					and TCP syn retries gets used now.
169 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
170 *					ack if state is TCP_CLOSED.
171 *		Alan Cox	:	Look up device on a retransmit - routes may
172 *					change. Doesn't yet cope with MSS shrink right
173 *					but it's a start!
174 *		Marc Tamsky	:	Closing in closing fixes.
175 *		Mike Shaver	:	RFC1122 verifications.
176 *		Alan Cox	:	rcv_saddr errors.
177 *		Alan Cox	:	Block double connect().
178 *		Alan Cox	:	Small hooks for enSKIP.
179 *		Alexey Kuznetsov:	Path MTU discovery.
180 *		Alan Cox	:	Support soft errors.
181 *		Alan Cox	:	Fix MTU discovery pathological case
182 *					when the remote claims no mtu!
183 *		Marc Tamsky	:	TCP_CLOSE fix.
184 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
185 *					window but wrong (fixes NT lpd problems)
186 *		Pedro Roque	:	Better TCP window handling, delayed ack.
187 *		Joerg Reuter	:	No modification of locked buffers in
188 *					tcp_do_retransmit()
189 *		Eric Schenk	:	Changed receiver side silly window
190 *					avoidance algorithm to BSD style
191 *					algorithm. This doubles throughput
192 *					against machines running Solaris,
193 *					and seems to result in general
194 *					improvement.
195 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
196 *	Willy Konynenberg	:	Transparent proxying support.
197 *	Mike McLagan		:	Routing by source
198 *		Keith Owens	:	Do proper merging with partial SKB's in
199 *					tcp_do_sendmsg to avoid burstiness.
200 *		Eric Schenk	:	Fix fast close down bug with
201 *					shutdown() followed by close().
202 *		Andi Kleen 	:	Make poll agree with SIGIO
203 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
204 *					lingertime == 0 (RFC 793 ABORT Call)
205 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
206 *					csum_and_copy_from_user() if possible.
207 *
208 *		This program is free software; you can redistribute it and/or
209 *		modify it under the terms of the GNU General Public License
210 *		as published by the Free Software Foundation; either version
211 *		2 of the License, or(at your option) any later version.
212 *
213 * Description of States:
214 *
215 *	TCP_SYN_SENT		sent a connection request, waiting for ack
216 *
217 *	TCP_SYN_RECV		received a connection request, sent ack,
218 *				waiting for final ack in three-way handshake.
219 *
220 *	TCP_ESTABLISHED		connection established
221 *
222 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
223 *				transmission of remaining buffered data
224 *
225 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
226 *				to shutdown
227 *
228 *	TCP_CLOSING		both sides have shutdown but we still have
229 *				data we have to finish sending
230 *
231 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
232 *				closed, can only be entered from FIN_WAIT2
233 *				or CLOSING.  Required because the other end
234 *				may not have gotten our last ACK causing it
235 *				to retransmit the data packet (which we ignore)
236 *
237 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
238 *				us to finish writing our data and to shutdown
239 *				(we have to close() to move on to LAST_ACK)
240 *
241 *	TCP_LAST_ACK		our side has shutdown after the remote has
242 *				shutdown.  There may still be data in our
243 *				buffer that we have to finish sending
244 *
245 *	TCP_CLOSE		socket is finished
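 *
 *	(For orientation: a typical active close walks ESTABLISHED ->
 *	FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE, while the passive
 *	side walks ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.)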
246 */
247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/fs.h>
257#include <linux/skbuff.h>
258#include <linux/scatterlist.h>
259#include <linux/splice.h>
260#include <linux/net.h>
261#include <linux/socket.h>
262#include <linux/random.h>
263#include <linux/bootmem.h>
264#include <linux/highmem.h>
265#include <linux/swap.h>
266#include <linux/cache.h>
267#include <linux/err.h>
268#include <linux/crypto.h>
269#include <linux/time.h>
270#include <linux/slab.h>
271
272#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h>
275#include <net/xfrm.h>
276#include <net/ip.h>
277#include <net/netdma.h>
278#include <net/sock.h>
279
280#include <asm/uaccess.h>
281#include <asm/ioctls.h>
282
283int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
284
285struct percpu_counter tcp_orphan_count;
286EXPORT_SYMBOL_GPL(tcp_orphan_count);
287
288int sysctl_tcp_wmem[3] __read_mostly;
289int sysctl_tcp_rmem[3] __read_mostly;
290
291EXPORT_SYMBOL(sysctl_tcp_rmem);
292EXPORT_SYMBOL(sysctl_tcp_wmem);
293
294atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
295EXPORT_SYMBOL(tcp_memory_allocated);
296
297/*
298 * Current number of TCP sockets.
299 */
300struct percpu_counter tcp_sockets_allocated;
301EXPORT_SYMBOL(tcp_sockets_allocated);
302
303/*
304 * TCP splice context
305 */
306struct tcp_splice_state {
307	struct pipe_inode_info *pipe;
308	size_t len;
309	unsigned int flags;
310};
311
312/*
313 * Pressure flag: try to collapse.
314 * Technical note: it is used by multiple contexts non atomically.
315 * All the __sk_mem_schedule() is of this nature: accounting
316 * is strict, actions are advisory and have some latency.
317 */
318int tcp_memory_pressure __read_mostly;
319EXPORT_SYMBOL(tcp_memory_pressure);
320
321void tcp_enter_memory_pressure(struct sock *sk)
322{
323	if (!tcp_memory_pressure) {
324		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
325		tcp_memory_pressure = 1;
326	}
327}
328EXPORT_SYMBOL(tcp_enter_memory_pressure);
329
330/* Convert seconds to retransmits based on initial and max timeout */
331static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
332{
333	u8 res = 0;
334
335	if (seconds > 0) {
336		int period = timeout;
337
338		res = 1;
339		while (seconds > period && res < 255) {
340			res++;
341			timeout <<= 1;
342			if (timeout > rto_max)
343				timeout = rto_max;
344			period += timeout;
345		}
346	}
347	return res;
348}
349
350/* Convert retransmits to seconds based on initial and max timeout */
351static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
352{
353	int period = 0;
354
355	if (retrans > 0) {
356		period = timeout;
357		while (--retrans) {
358			timeout <<= 1;
359			if (timeout > rto_max)
360				timeout = rto_max;
361			period += timeout;
362		}
363	}
364	return period;
365}
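
/* Worked example for the two helpers above (illustration only, not part of
 * the original file; the helper name below is invented): assuming both
 * timeout arguments are expressed in seconds, an initial timeout of 1s and
 * an rto_max of 120s give cumulative backoff periods of
 * 1 + 2 + 4 + 8 + 16 = 31 seconds after five retransmits, so the two
 * conversions round-trip as sketched here.
 */
#if 0	/* illustrative sketch, never compiled */
static void tcp_retrans_backoff_example(void)
{
	int secs = retrans_to_secs(5, 1, 120);		/* 31 */
	u8 tries = secs_to_retrans(secs, 1, 120);	/* 5  */

	(void)tries;
}
#endif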
366
367/* Address-family independent initialization for a tcp_sock.
368 *
369 * NOTE: A lot of things are set to zero explicitly by the call to
370 *       sk_alloc(), so they need not be done here.
371 */
372void tcp_init_sock(struct sock *sk)
373{
374	struct inet_connection_sock *icsk = inet_csk(sk);
375	struct tcp_sock *tp = tcp_sk(sk);
376
377	skb_queue_head_init(&tp->out_of_order_queue);
378	tcp_init_xmit_timers(sk);
379	tcp_prequeue_init(tp);
380	INIT_LIST_HEAD(&tp->tsq_node);
381
382	icsk->icsk_rto = TCP_TIMEOUT_INIT;
383	tp->mdev = TCP_TIMEOUT_INIT;
384
385	/* So many TCP implementations out there (incorrectly) count the
386	 * initial SYN frame in their delayed-ACK and congestion control
387	 * algorithms that we must have the following bandaid to talk
388	 * efficiently to them.  -DaveM
389	 */
390	tp->snd_cwnd = TCP_INIT_CWND;
391
392	/* See draft-stevens-tcpca-spec-01 for discussion of the
393	 * initialization of these values.
394	 */
395	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396	tp->snd_cwnd_clamp = ~0;
397	tp->mss_cache = TCP_MSS_DEFAULT;
398
399	tp->reordering = sysctl_tcp_reordering;
400	tcp_enable_early_retrans(tp);
401	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403	tp->tsoffset = 0;
404
405	sk->sk_state = TCP_CLOSE;
406
407	sk->sk_write_space = sk_stream_write_space;
408	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
409
410	icsk->icsk_sync_mss = tcp_sync_mss;
411
412	/* Presumed zeroed, in order of appearance:
413	 *	cookie_in_always, cookie_out_never,
414	 *	s_data_constant, s_data_in, s_data_out
415	 */
416	sk->sk_sndbuf = sysctl_tcp_wmem[1];
417	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
418
419	local_bh_disable();
420	sock_update_memcg(sk);
421	sk_sockets_allocated_inc(sk);
422	local_bh_enable();
423}
424EXPORT_SYMBOL(tcp_init_sock);
425
426/*
427 *	Wait for a TCP event.
428 *
429 *	Note that we don't need to lock the socket, as the upper poll layers
430 *	take care of normal races (between the test and the event) and we don't
431 *	go look at any of the socket buffers directly.
432 */
433unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
434{
435	unsigned int mask;
436	struct sock *sk = sock->sk;
437	const struct tcp_sock *tp = tcp_sk(sk);
438
439	sock_rps_record_flow(sk);
440
441	sock_poll_wait(file, sk_sleep(sk), wait);
442	if (sk->sk_state == TCP_LISTEN)
443		return inet_csk_listen_poll(sk);
444
445	/* Socket is not locked. We are protected from async events
446	 * by poll logic and correct handling of state changes
447	 * made by other threads is impossible in any case.
448	 */
449
450	mask = 0;
451
452	/*
453	 * POLLHUP is certainly not done right. But poll() doesn't
454	 * have a notion of HUP in just one direction, and for a
455	 * socket the read side is more interesting.
456	 *
457	 * Some poll() documentation says that POLLHUP is incompatible
458	 * with the POLLOUT/POLLWR flags, so somebody should check this
459	 * all. But careful, it tends to be safer to return too many
460	 * bits than too few, and you can easily break real applications
461	 * if you don't tell them that something has hung up!
462	 *
463	 * Check-me.
464	 *
465	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
466	 * our fs/select.c). It means that after we received EOF,
467	 * poll always returns immediately, making it impossible to poll() for write()
468	 * in state CLOSE_WAIT. One evident solution is to set POLLHUP
469	 * if and only if shutdown has been made in both directions.
470	 * Actually, it is interesting to look how Solaris and DUX
471	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
472	 * then we could set it on SND_SHUTDOWN. BTW examples given
473	 * in Stevens' books assume exactly this behaviour, it explains
474	 * why POLLHUP is incompatible with POLLOUT.	--ANK
475	 *
476	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
477	 * blocking on fresh not-connected or disconnected socket. --ANK
478	 */
479	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
480		mask |= POLLHUP;
481	if (sk->sk_shutdown & RCV_SHUTDOWN)
482		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
483
484	/* Connected or passive Fast Open socket? */
485	if (sk->sk_state != TCP_SYN_SENT &&
486	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
487		int target = sock_rcvlowat(sk, 0, INT_MAX);
488
489		if (tp->urg_seq == tp->copied_seq &&
490		    !sock_flag(sk, SOCK_URGINLINE) &&
491		    tp->urg_data)
492			target++;
493
494	/* Potential race condition. If the read of tp below is reordered
495	 * before the sk->sk_state check above, we can be illegally awakened
496	 * in SYN_* states. */
497		if (tp->rcv_nxt - tp->copied_seq >= target)
498			mask |= POLLIN | POLLRDNORM;
499
500		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
501			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
502				mask |= POLLOUT | POLLWRNORM;
503			} else {  /* send SIGIO later */
504				set_bit(SOCK_ASYNC_NOSPACE,
505					&sk->sk_socket->flags);
506				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
507
508				/* Race breaker. If space is freed after
509				 * wspace test but before the flags are set,
510				 * IO signal will be lost.
511				 */
512				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
513					mask |= POLLOUT | POLLWRNORM;
514			}
515		} else
516			mask |= POLLOUT | POLLWRNORM;
517
518		if (tp->urg_data & TCP_URG_VALID)
519			mask |= POLLPRI;
520	}
521	/* This barrier is coupled with smp_wmb() in tcp_reset() */
522	smp_rmb();
523	if (sk->sk_err)
524		mask |= POLLERR;
525
526	return mask;
527}
528EXPORT_SYMBOL(tcp_poll);
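
/* A minimal user-space sketch (illustration only, not kernel code; the helper
 * name is made up) of how the mask computed by tcp_poll() surfaces through
 * poll(2): POLLIN/POLLRDNORM once at least SO_RCVLOWAT bytes are readable,
 * POLLOUT/POLLWRNORM when there is send space, POLLPRI for urgent data and
 * POLLHUP once both directions are shut down.
 */
#if 0	/* user-space example, never compiled here */
#include <poll.h>

static int wait_for_tcp_events(int fd)
{
	struct pollfd pfd = {
		.fd	= fd,
		.events	= POLLIN | POLLOUT | POLLPRI,
	};

	if (poll(&pfd, 1, 5000) <= 0)		/* 5 second timeout */
		return -1;
	if (pfd.revents & POLLHUP)		/* both directions shut down */
		return 0;
	return pfd.revents;
}
#endif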
529
530int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
531{
532	struct tcp_sock *tp = tcp_sk(sk);
533	int answ;
534	bool slow;
535
536	switch (cmd) {
537	case SIOCINQ:
538		if (sk->sk_state == TCP_LISTEN)
539			return -EINVAL;
540
541		slow = lock_sock_fast(sk);
542		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
543			answ = 0;
544		else if (sock_flag(sk, SOCK_URGINLINE) ||
545			 !tp->urg_data ||
546			 before(tp->urg_seq, tp->copied_seq) ||
547			 !before(tp->urg_seq, tp->rcv_nxt)) {
548
549			answ = tp->rcv_nxt - tp->copied_seq;
550
551			/* Subtract 1, if FIN was received */
552			if (answ && sock_flag(sk, SOCK_DONE))
553				answ--;
554		} else
555			answ = tp->urg_seq - tp->copied_seq;
556		unlock_sock_fast(sk, slow);
557		break;
558	case SIOCATMARK:
559		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
560		break;
561	case SIOCOUTQ:
562		if (sk->sk_state == TCP_LISTEN)
563			return -EINVAL;
564
565		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
566			answ = 0;
567		else
568			answ = tp->write_seq - tp->snd_una;
569		break;
570	case SIOCOUTQNSD:
571		if (sk->sk_state == TCP_LISTEN)
572			return -EINVAL;
573
574		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
575			answ = 0;
576		else
577			answ = tp->write_seq - tp->snd_nxt;
578		break;
579	default:
580		return -ENOIOCTLCMD;
581	}
582
583	return put_user(answ, (int __user *)arg);
584}
585EXPORT_SYMBOL(tcp_ioctl);
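
/* User-space illustration (not kernel code; the helper name is invented) of
 * the ioctls handled above: SIOCINQ reports unread bytes (rcv_nxt -
 * copied_seq), SIOCOUTQ reports unacknowledged bytes (write_seq - snd_una)
 * and SIOCOUTQNSD reports bytes queued but not yet sent (write_seq - snd_nxt).
 */
#if 0	/* user-space example, never compiled here */
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void dump_tcp_queues(int fd)
{
	int unread = 0, unacked = 0, unsent = 0;

	ioctl(fd, SIOCINQ, &unread);
	ioctl(fd, SIOCOUTQ, &unacked);
	ioctl(fd, SIOCOUTQNSD, &unsent);
}
#endif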
586
587static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
588{
589	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
590	tp->pushed_seq = tp->write_seq;
591}
592
593static inline bool forced_push(const struct tcp_sock *tp)
594{
595	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
596}
597
598static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
599{
600	struct tcp_sock *tp = tcp_sk(sk);
601	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
602
603	skb->csum    = 0;
604	tcb->seq     = tcb->end_seq = tp->write_seq;
605	tcb->tcp_flags = TCPHDR_ACK;
606	tcb->sacked  = 0;
607	skb_header_release(skb);
608	tcp_add_write_queue_tail(sk, skb);
609	sk->sk_wmem_queued += skb->truesize;
610	sk_mem_charge(sk, skb->truesize);
611	if (tp->nonagle & TCP_NAGLE_PUSH)
612		tp->nonagle &= ~TCP_NAGLE_PUSH;
613}
614
615static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
616{
617	if (flags & MSG_OOB)
618		tp->snd_up = tp->write_seq;
619}
620
621static inline void tcp_push(struct sock *sk, int flags, int mss_now,
622			    int nonagle)
623{
624	if (tcp_send_head(sk)) {
625		struct tcp_sock *tp = tcp_sk(sk);
626
627		if (!(flags & MSG_MORE) || forced_push(tp))
628			tcp_mark_push(tp, tcp_write_queue_tail(sk));
629
630		tcp_mark_urg(tp, flags);
631		__tcp_push_pending_frames(sk, mss_now,
632					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
633	}
634}
635
636static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
637				unsigned int offset, size_t len)
638{
639	struct tcp_splice_state *tss = rd_desc->arg.data;
640	int ret;
641
642	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
643			      tss->flags);
644	if (ret > 0)
645		rd_desc->count -= ret;
646	return ret;
647}
648
649static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
650{
651	/* Store TCP splice context information in read_descriptor_t. */
652	read_descriptor_t rd_desc = {
653		.arg.data = tss,
654		.count	  = tss->len,
655	};
656
657	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
658}
659
660/**
661 *  tcp_splice_read - splice data from TCP socket to a pipe
662 * @sock:	socket to splice from
663 * @ppos:	position (not valid)
664 * @pipe:	pipe to splice to
665 * @len:	number of bytes to splice
666 * @flags:	splice modifier flags
667 *
668 * Description:
669 *    Will read pages from given socket and fill them into a pipe.
670 *
671 **/
672ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
673			struct pipe_inode_info *pipe, size_t len,
674			unsigned int flags)
675{
676	struct sock *sk = sock->sk;
677	struct tcp_splice_state tss = {
678		.pipe = pipe,
679		.len = len,
680		.flags = flags,
681	};
682	long timeo;
683	ssize_t spliced;
684	int ret;
685
686	sock_rps_record_flow(sk);
687	/*
688	 * We can't seek on a socket input
689	 */
690	if (unlikely(*ppos))
691		return -ESPIPE;
692
693	ret = spliced = 0;
694
695	lock_sock(sk);
696
697	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
698	while (tss.len) {
699		ret = __tcp_splice_read(sk, &tss);
700		if (ret < 0)
701			break;
702		else if (!ret) {
703			if (spliced)
704				break;
705			if (sock_flag(sk, SOCK_DONE))
706				break;
707			if (sk->sk_err) {
708				ret = sock_error(sk);
709				break;
710			}
711			if (sk->sk_shutdown & RCV_SHUTDOWN)
712				break;
713			if (sk->sk_state == TCP_CLOSE) {
714				/*
715				 * This occurs when user tries to read
716				 * from never connected socket.
717				 */
718				if (!sock_flag(sk, SOCK_DONE))
719					ret = -ENOTCONN;
720				break;
721			}
722			if (!timeo) {
723				ret = -EAGAIN;
724				break;
725			}
726			sk_wait_data(sk, &timeo);
727			if (signal_pending(current)) {
728				ret = sock_intr_errno(timeo);
729				break;
730			}
731			continue;
732		}
733		tss.len -= ret;
734		spliced += ret;
735
736		if (!timeo)
737			break;
738		release_sock(sk);
739		lock_sock(sk);
740
741		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
742		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
743		    signal_pending(current))
744			break;
745	}
746
747	release_sock(sk);
748
749	if (spliced)
750		return spliced;
751
752	return ret;
753}
754EXPORT_SYMBOL(tcp_splice_read);
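
/* User-space illustration (not kernel code; the helper name is invented) of
 * the path above: splice(2) with a NULL socket offset moves up to @len bytes
 * from a connected TCP socket into a pipe without copying them through user
 * memory.
 */
#if 0	/* user-space example, never compiled here */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t tcp_to_pipe(int sockfd, int pipe_wr_fd, size_t len)
{
	/* offsets must be NULL: tcp_splice_read() rejects *ppos (-ESPIPE) */
	return splice(sockfd, NULL, pipe_wr_fd, NULL, len, SPLICE_F_MOVE);
}
#endif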
755
756struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
757{
758	struct sk_buff *skb;
759
760	/* The TCP header must be at least 32-bit aligned.  */
761	size = ALIGN(size, 4);
762
763	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
764	if (skb) {
765		if (sk_wmem_schedule(sk, skb->truesize)) {
766			skb_reserve(skb, sk->sk_prot->max_header);
767			/*
768			 * Make sure that we have exactly size bytes
769			 * available to the caller, no more, no less.
770			 */
771			skb->reserved_tailroom = skb->end - skb->tail - size;
772			return skb;
773		}
774		__kfree_skb(skb);
775	} else {
776		sk->sk_prot->enter_memory_pressure(sk);
777		sk_stream_moderate_sndbuf(sk);
778	}
779	return NULL;
780}
781
782static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
783				       int large_allowed)
784{
785	struct tcp_sock *tp = tcp_sk(sk);
786	u32 xmit_size_goal, old_size_goal;
787
788	xmit_size_goal = mss_now;
789
790	if (large_allowed && sk_can_gso(sk)) {
791		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
792				  inet_csk(sk)->icsk_af_ops->net_header_len -
793				  inet_csk(sk)->icsk_ext_hdr_len -
794				  tp->tcp_header_len);
795
796		/* TSQ : try to have two TSO segments in flight */
797		xmit_size_goal = min_t(u32, xmit_size_goal,
798				       sysctl_tcp_limit_output_bytes >> 1);
799
800		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
801
802		/* We try hard to avoid divides here */
803		old_size_goal = tp->xmit_size_goal_segs * mss_now;
804
805		if (likely(old_size_goal <= xmit_size_goal &&
806			   old_size_goal + mss_now > xmit_size_goal)) {
807			xmit_size_goal = old_size_goal;
808		} else {
809			tp->xmit_size_goal_segs =
810				min_t(u16, xmit_size_goal / mss_now,
811				      sk->sk_gso_max_segs);
812			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
813		}
814	}
815
816	return max(xmit_size_goal, mss_now);
817}
818
819static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
820{
821	int mss_now;
822
823	mss_now = tcp_current_mss(sk);
824	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
825
826	return mss_now;
827}
828
829static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
830				size_t size, int flags)
831{
832	struct tcp_sock *tp = tcp_sk(sk);
833	int mss_now, size_goal;
834	int err;
835	ssize_t copied;
836	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
837
838	/* Wait for a connection to finish. One exception is TCP Fast Open
839	 * (passive side) where data is allowed to be sent before a connection
840	 * is fully established.
841	 */
842	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
843	    !tcp_passive_fastopen(sk)) {
844		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
845			goto out_err;
846	}
847
848	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849
850	mss_now = tcp_send_mss(sk, &size_goal, flags);
851	copied = 0;
852
853	err = -EPIPE;
854	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
855		goto out_err;
856
857	while (size > 0) {
858		struct sk_buff *skb = tcp_write_queue_tail(sk);
859		int copy, i;
860		bool can_coalesce;
861
862		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
863new_segment:
864			if (!sk_stream_memory_free(sk))
865				goto wait_for_sndbuf;
866
867			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
868			if (!skb)
869				goto wait_for_memory;
870
871			skb_entail(sk, skb);
872			copy = size_goal;
873		}
874
875		if (copy > size)
876			copy = size;
877
878		i = skb_shinfo(skb)->nr_frags;
879		can_coalesce = skb_can_coalesce(skb, i, page, offset);
880		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
881			tcp_mark_push(tp, skb);
882			goto new_segment;
883		}
884		if (!sk_wmem_schedule(sk, copy))
885			goto wait_for_memory;
886
887		if (can_coalesce) {
888			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
889		} else {
890			get_page(page);
891			skb_fill_page_desc(skb, i, page, offset, copy);
892		}
893		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
894
895		skb->len += copy;
896		skb->data_len += copy;
897		skb->truesize += copy;
898		sk->sk_wmem_queued += copy;
899		sk_mem_charge(sk, copy);
900		skb->ip_summed = CHECKSUM_PARTIAL;
901		tp->write_seq += copy;
902		TCP_SKB_CB(skb)->end_seq += copy;
903		skb_shinfo(skb)->gso_segs = 0;
904
905		if (!copied)
906			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
907
908		copied += copy;
909		offset += copy;
910		if (!(size -= copy))
911			goto out;
912
913		if (skb->len < size_goal || (flags & MSG_OOB))
914			continue;
915
916		if (forced_push(tp)) {
917			tcp_mark_push(tp, skb);
918			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
919		} else if (skb == tcp_send_head(sk))
920			tcp_push_one(sk, mss_now);
921		continue;
922
923wait_for_sndbuf:
924		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
925wait_for_memory:
926		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
927
928		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
929			goto do_error;
930
931		mss_now = tcp_send_mss(sk, &size_goal, flags);
932	}
933
934out:
935	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
936		tcp_push(sk, flags, mss_now, tp->nonagle);
937	return copied;
938
939do_error:
940	if (copied)
941		goto out;
942out_err:
943	return sk_stream_error(sk, flags, err);
944}
945
946int tcp_sendpage(struct sock *sk, struct page *page, int offset,
947		 size_t size, int flags)
948{
949	ssize_t res;
950
951	if (!(sk->sk_route_caps & NETIF_F_SG) ||
952	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
953		return sock_no_sendpage(sk->sk_socket, page, offset, size,
954					flags);
955
956	lock_sock(sk);
957	res = do_tcp_sendpages(sk, page, offset, size, flags);
958	release_sock(sk);
959	return res;
960}
961EXPORT_SYMBOL(tcp_sendpage);
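
/* User-space illustration (not kernel code; the helper name is invented):
 * sendfile(2) to a TCP socket is one common way to reach do_tcp_sendpages()
 * via the ->sendpage path, handing page references to the stack instead of
 * copying file data through user memory.
 */
#if 0	/* user-space example, never compiled here */
#include <sys/sendfile.h>

static ssize_t send_file_over_tcp(int sockfd, int filefd, size_t count)
{
	off_t off = 0;

	return sendfile(sockfd, filefd, &off, count);
}
#endif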
962
963static inline int select_size(const struct sock *sk, bool sg)
964{
965	const struct tcp_sock *tp = tcp_sk(sk);
966	int tmp = tp->mss_cache;
967
968	if (sg) {
969		if (sk_can_gso(sk)) {
970			/* Small frames won't use a full page:
971			 * Payload will immediately follow the TCP header.
972			 */
973			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
974		} else {
975			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
976
977			if (tmp >= pgbreak &&
978			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
979				tmp = pgbreak;
980		}
981	}
982
983	return tmp;
984}
985
986void tcp_free_fastopen_req(struct tcp_sock *tp)
987{
988	if (tp->fastopen_req != NULL) {
989		kfree(tp->fastopen_req);
990		tp->fastopen_req = NULL;
991	}
992}
993
994static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
995{
996	struct tcp_sock *tp = tcp_sk(sk);
997	int err, flags;
998
999	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1000		return -EOPNOTSUPP;
1001	if (tp->fastopen_req != NULL)
1002		return -EALREADY; /* Another Fast Open is in progress */
1003
1004	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1005				   sk->sk_allocation);
1006	if (unlikely(tp->fastopen_req == NULL))
1007		return -ENOBUFS;
1008	tp->fastopen_req->data = msg;
1009
1010	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1011	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1012				    msg->msg_namelen, flags);
1013	*size = tp->fastopen_req->copied;
1014	tcp_free_fastopen_req(tp);
1015	return err;
1016}
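
/* User-space illustration (not kernel code; the helper name is invented) of
 * the client side handled above: sendto() with MSG_FASTOPEN on an unconnected
 * socket performs the connect and, when a Fast Open cookie is available,
 * carries the data in the SYN.  It requires the client bit of the
 * tcp_fastopen sysctl, matching the TFO_CLIENT_ENABLE check above.
 */
#if 0	/* user-space example, never compiled here */
#include <sys/socket.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000	/* may be missing from older libc headers */
#endif

static ssize_t tfo_connect_send(int fd, const void *buf, size_t len,
				const struct sockaddr *addr, socklen_t alen)
{
	return sendto(fd, buf, len, MSG_FASTOPEN, addr, alen);
}
#endif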
1017
1018int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1019		size_t size)
1020{
1021	struct iovec *iov;
1022	struct tcp_sock *tp = tcp_sk(sk);
1023	struct sk_buff *skb;
1024	int iovlen, flags, err, copied = 0;
1025	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1026	bool sg;
1027	long timeo;
1028
1029	lock_sock(sk);
1030
1031	flags = msg->msg_flags;
1032	if (flags & MSG_FASTOPEN) {
1033		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1034		if (err == -EINPROGRESS && copied_syn > 0)
1035			goto out;
1036		else if (err)
1037			goto out_err;
1038		offset = copied_syn;
1039	}
1040
1041	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1042
1043	/* Wait for a connection to finish. One exception is TCP Fast Open
1044	 * (passive side) where data is allowed to be sent before a connection
1045	 * is fully established.
1046	 */
1047	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1048	    !tcp_passive_fastopen(sk)) {
1049		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1050			goto do_error;
1051	}
1052
1053	if (unlikely(tp->repair)) {
1054		if (tp->repair_queue == TCP_RECV_QUEUE) {
1055			copied = tcp_send_rcvq(sk, msg, size);
1056			goto out;
1057		}
1058
1059		err = -EINVAL;
1060		if (tp->repair_queue == TCP_NO_QUEUE)
1061			goto out_err;
1062
1063		/* 'common' sending to sendq */
1064	}
1065
1066	/* This should be in poll */
1067	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1068
1069	mss_now = tcp_send_mss(sk, &size_goal, flags);
1070
1071	/* Ok commence sending. */
1072	iovlen = msg->msg_iovlen;
1073	iov = msg->msg_iov;
1074	copied = 0;
1075
1076	err = -EPIPE;
1077	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1078		goto out_err;
1079
1080	sg = !!(sk->sk_route_caps & NETIF_F_SG);
1081
1082	while (--iovlen >= 0) {
1083		size_t seglen = iov->iov_len;
1084		unsigned char __user *from = iov->iov_base;
1085
1086		iov++;
1087		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
1088			if (offset >= seglen) {
1089				offset -= seglen;
1090				continue;
1091			}
1092			seglen -= offset;
1093			from += offset;
1094			offset = 0;
1095		}
1096
1097		while (seglen > 0) {
1098			int copy = 0;
1099			int max = size_goal;
1100
1101			skb = tcp_write_queue_tail(sk);
1102			if (tcp_send_head(sk)) {
1103				if (skb->ip_summed == CHECKSUM_NONE)
1104					max = mss_now;
1105				copy = max - skb->len;
1106			}
1107
1108			if (copy <= 0) {
1109new_segment:
1110				/* Allocate new segment. If the interface is SG,
1111				 * allocate an skb that fits in a single page.
1112				 */
1113				if (!sk_stream_memory_free(sk))
1114					goto wait_for_sndbuf;
1115
1116				skb = sk_stream_alloc_skb(sk,
1117							  select_size(sk, sg),
1118							  sk->sk_allocation);
1119				if (!skb)
1120					goto wait_for_memory;
1121
1122				/*
1123				 * Check whether we can use HW checksum.
1124				 */
1125				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1126					skb->ip_summed = CHECKSUM_PARTIAL;
1127
1128				skb_entail(sk, skb);
1129				copy = size_goal;
1130				max = size_goal;
1131			}
1132
1133			/* Try to append data to the end of skb. */
1134			if (copy > seglen)
1135				copy = seglen;
1136
1137			/* Where to copy to? */
1138			if (skb_availroom(skb) > 0) {
1139				/* We have some space in skb head. Superb! */
1140				copy = min_t(int, copy, skb_availroom(skb));
1141				err = skb_add_data_nocache(sk, skb, from, copy);
1142				if (err)
1143					goto do_fault;
1144			} else {
1145				bool merge = true;
1146				int i = skb_shinfo(skb)->nr_frags;
1147				struct page_frag *pfrag = sk_page_frag(sk);
1148
1149				if (!sk_page_frag_refill(sk, pfrag))
1150					goto wait_for_memory;
1151
1152				if (!skb_can_coalesce(skb, i, pfrag->page,
1153						      pfrag->offset)) {
1154					if (i == MAX_SKB_FRAGS || !sg) {
1155						tcp_mark_push(tp, skb);
1156						goto new_segment;
1157					}
1158					merge = false;
1159				}
1160
1161				copy = min_t(int, copy, pfrag->size - pfrag->offset);
1162
1163				if (!sk_wmem_schedule(sk, copy))
1164					goto wait_for_memory;
1165
1166				err = skb_copy_to_page_nocache(sk, from, skb,
1167							       pfrag->page,
1168							       pfrag->offset,
1169							       copy);
1170				if (err)
1171					goto do_error;
1172
1173				/* Update the skb. */
1174				if (merge) {
1175					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1176				} else {
1177					skb_fill_page_desc(skb, i, pfrag->page,
1178							   pfrag->offset, copy);
1179					get_page(pfrag->page);
1180				}
1181				pfrag->offset += copy;
1182			}
1183
1184			if (!copied)
1185				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1186
1187			tp->write_seq += copy;
1188			TCP_SKB_CB(skb)->end_seq += copy;
1189			skb_shinfo(skb)->gso_segs = 0;
1190
1191			from += copy;
1192			copied += copy;
1193			if ((seglen -= copy) == 0 && iovlen == 0)
1194				goto out;
1195
1196			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1197				continue;
1198
1199			if (forced_push(tp)) {
1200				tcp_mark_push(tp, skb);
1201				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1202			} else if (skb == tcp_send_head(sk))
1203				tcp_push_one(sk, mss_now);
1204			continue;
1205
1206wait_for_sndbuf:
1207			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1208wait_for_memory:
1209			if (copied)
1210				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1211
1212			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1213				goto do_error;
1214
1215			mss_now = tcp_send_mss(sk, &size_goal, flags);
1216		}
1217	}
1218
1219out:
1220	if (copied)
1221		tcp_push(sk, flags, mss_now, tp->nonagle);
1222	release_sock(sk);
1223	return copied + copied_syn;
1224
1225do_fault:
1226	if (!skb->len) {
1227		tcp_unlink_write_queue(skb, sk);
1228		/* It is the one place in all of TCP, except connection
1229		 * reset, where we can be unlinking the send_head.
1230		 */
1231		tcp_check_send_head(sk, skb);
1232		sk_wmem_free_skb(sk, skb);
1233	}
1234
1235do_error:
1236	if (copied + copied_syn)
1237		goto out;
1238out_err:
1239	err = sk_stream_error(sk, flags, err);
1240	release_sock(sk);
1241	return err;
1242}
1243EXPORT_SYMBOL(tcp_sendmsg);
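
/* User-space illustration (not kernel code; the helper name is invented) of
 * the MSG_MORE handling in tcp_sendmsg()/tcp_push(): sends flagged with
 * MSG_MORE are corked, and only the final send without the flag pushes the
 * queued data out.
 */
#if 0	/* user-space example, never compiled here */
#include <sys/socket.h>

static void send_header_then_body(int fd, const void *hdr, size_t hlen,
				  const void *body, size_t blen)
{
	send(fd, hdr, hlen, MSG_MORE);	/* queued, push withheld */
	send(fd, body, blen, 0);	/* pushes header + body */
}
#endif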
1244
1245/*
1246 *	Handle reading urgent data. BSD has very simple semantics for
1247 *	this, no blocking and very strange errors 8)
1248 */
1249
1250static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1251{
1252	struct tcp_sock *tp = tcp_sk(sk);
1253
1254	/* No URG data to read. */
1255	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1256	    tp->urg_data == TCP_URG_READ)
1257		return -EINVAL;	/* Yes this is right ! */
1258
1259	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1260		return -ENOTCONN;
1261
1262	if (tp->urg_data & TCP_URG_VALID) {
1263		int err = 0;
1264		char c = tp->urg_data;
1265
1266		if (!(flags & MSG_PEEK))
1267			tp->urg_data = TCP_URG_READ;
1268
1269		/* Read urgent data. */
1270		msg->msg_flags |= MSG_OOB;
1271
1272		if (len > 0) {
1273			if (!(flags & MSG_TRUNC))
1274				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1275			len = 1;
1276		} else
1277			msg->msg_flags |= MSG_TRUNC;
1278
1279		return err ? -EFAULT : len;
1280	}
1281
1282	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1283		return 0;
1284
1285	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1286	 * the available implementations agree in this case:
1287	 * this call should never block, independent of the
1288	 * blocking state of the socket.
1289	 * Mike <pall@rz.uni-karlsruhe.de>
1290	 */
1291	return -EAGAIN;
1292}
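
/* User-space illustration (not kernel code; the helper name is invented) of
 * the semantics implemented above: recv() with MSG_OOB never blocks.  It
 * returns the urgent byte when one is queued, 0 once the connection is done,
 * EINVAL when SO_OOBINLINE is set or no unread urgent data exists, and
 * EAGAIN when the urgent pointer has arrived but the byte itself has not.
 */
#if 0	/* user-space example, never compiled here */
#include <sys/socket.h>

static int read_urgent_byte(int fd, char *out)
{
	return recv(fd, out, 1, MSG_OOB);	/* 1, 0 or -1 with errno set */
}
#endif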
1293
1294static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1295{
1296	struct sk_buff *skb;
1297	int copied = 0, err = 0;
1298
1299	/* XXX -- need to support SO_PEEK_OFF */
1300
1301	skb_queue_walk(&sk->sk_write_queue, skb) {
1302		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1303		if (err)
1304			break;
1305
1306		copied += skb->len;
1307	}
1308
1309	return err ?: copied;
1310}
1311
1312/* Clean up the receive buffer for full frames taken by the user,
1313 * then send an ACK if necessary.  COPIED is the number of bytes
1314 * tcp_recvmsg has given to the user so far; it speeds up the
1315 * calculation of whether or not we must ACK for the sake of
1316 * a window update.
1317 */
1318void tcp_cleanup_rbuf(struct sock *sk, int copied)
1319{
1320	struct tcp_sock *tp = tcp_sk(sk);
1321	bool time_to_ack = false;
1322
1323	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1324
1325	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1326	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1327	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1328
1329	if (inet_csk_ack_scheduled(sk)) {
1330		const struct inet_connection_sock *icsk = inet_csk(sk);
1331		   /* Delayed ACKs frequently hit locked sockets during bulk
1332		    * receive. */
1333		if (icsk->icsk_ack.blocked ||
1334		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1335		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1336		    /*
1337		     * If this read emptied the read buffer, we send an ACK when
1338		     * the connection is not bidirectional, the user drained the
1339		     * receive buffer and there was a small segment
1340		     * in the queue.
1341		     */
1342		    (copied > 0 &&
1343		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1344		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1345		       !icsk->icsk_ack.pingpong)) &&
1346		      !atomic_read(&sk->sk_rmem_alloc)))
1347			time_to_ack = true;
1348	}
1349
1350	/* We send an ACK if we can now advertise a non-zero window
1351	 * which has been raised "significantly".
1352	 *
1353	 * Even if the window is raised up to infinity, do not send a window open ACK
1354	 * in states where we will not receive any more data. It is useless.
1355	 */
1356	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1357		__u32 rcv_window_now = tcp_receive_window(tp);
1358
1359		/* Optimize, __tcp_select_window() is not cheap. */
1360		if (2*rcv_window_now <= tp->window_clamp) {
1361			__u32 new_window = __tcp_select_window(sk);
1362
1363			/* Send ACK now, if this read freed lots of space
1364			 * in our buffer. Certainly, new_window is the new window.
1365			 * We can advertise it now if it is not less than the current one.
1366			 * "Lots" means "at least twice" here.
1367			 */
1368			if (new_window && new_window >= 2 * rcv_window_now)
1369				time_to_ack = true;
1370		}
1371	}
1372	if (time_to_ack)
1373		tcp_send_ack(sk);
1374}
1375
1376static void tcp_prequeue_process(struct sock *sk)
1377{
1378	struct sk_buff *skb;
1379	struct tcp_sock *tp = tcp_sk(sk);
1380
1381	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1382
1383	/* RX process wants to run with disabled BHs, though it is not
1384	 * necessary */
1385	local_bh_disable();
1386	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1387		sk_backlog_rcv(sk, skb);
1388	local_bh_enable();
1389
1390	/* Clear memory counter. */
1391	tp->ucopy.memory = 0;
1392}
1393
1394#ifdef CONFIG_NET_DMA
1395static void tcp_service_net_dma(struct sock *sk, bool wait)
1396{
1397	dma_cookie_t done, used;
1398	dma_cookie_t last_issued;
1399	struct tcp_sock *tp = tcp_sk(sk);
1400
1401	if (!tp->ucopy.dma_chan)
1402		return;
1403
1404	last_issued = tp->ucopy.dma_cookie;
1405	dma_async_issue_pending(tp->ucopy.dma_chan);
1406
1407	do {
1408		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
1409					      last_issued, &done,
1410					      &used) == DMA_SUCCESS) {
1411			/* Safe to free early-copied skbs now */
1412			__skb_queue_purge(&sk->sk_async_wait_queue);
1413			break;
1414		} else {
1415			struct sk_buff *skb;
1416			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1417			       (dma_async_is_complete(skb->dma_cookie, done,
1418						      used) == DMA_SUCCESS)) {
1419				__skb_dequeue(&sk->sk_async_wait_queue);
1420				kfree_skb(skb);
1421			}
1422		}
1423	} while (wait);
1424}
1425#endif
1426
1427static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1428{
1429	struct sk_buff *skb;
1430	u32 offset;
1431
1432	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1433		offset = seq - TCP_SKB_CB(skb)->seq;
1434		if (tcp_hdr(skb)->syn)
1435			offset--;
1436		if (offset < skb->len || tcp_hdr(skb)->fin) {
1437			*off = offset;
1438			return skb;
1439		}
1440		/* This looks weird, but this can happen if TCP collapsing
1441		 * split a fat GRO packet, while we released the socket lock
1442		 * in skb_splice_bits()
1443		 */
1444		sk_eat_skb(sk, skb, false);
1445	}
1446	return NULL;
1447}
1448
1449/*
1450 * This routine provides an alternative to tcp_recvmsg() for routines
1451 * that would like to handle copying from skbuffs directly in 'sendfile'
1452 * fashion.
1453 * Note:
1454 *	- It is assumed that the socket was locked by the caller.
1455 *	- The routine does not block.
1456 *	- At present, there is no support for reading OOB data
1457 *	  or for 'peeking' the socket using this routine
1458 *	  (although both would be easy to implement).
1459 */
1460int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1461		  sk_read_actor_t recv_actor)
1462{
1463	struct sk_buff *skb;
1464	struct tcp_sock *tp = tcp_sk(sk);
1465	u32 seq = tp->copied_seq;
1466	u32 offset;
1467	int copied = 0;
1468
1469	if (sk->sk_state == TCP_LISTEN)
1470		return -ENOTCONN;
1471	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1472		if (offset < skb->len) {
1473			int used;
1474			size_t len;
1475
1476			len = skb->len - offset;
1477			/* Stop reading if we hit a patch of urgent data */
1478			if (tp->urg_data) {
1479				u32 urg_offset = tp->urg_seq - seq;
1480				if (urg_offset < len)
1481					len = urg_offset;
1482				if (!len)
1483					break;
1484			}
1485			used = recv_actor(desc, skb, offset, len);
1486			if (used <= 0) {
1487				if (!copied)
1488					copied = used;
1489				break;
1490			} else if (used <= len) {
1491				seq += used;
1492				copied += used;
1493				offset += used;
1494			}
1495			/* If recv_actor drops the lock (e.g. TCP splice
1496			 * receive) the skb pointer might be invalid when
1497			 * getting here: tcp_collapse might have deleted it
1498			 * while aggregating skbs from the socket queue.
1499			 */
1500			skb = tcp_recv_skb(sk, seq - 1, &offset);
1501			if (!skb)
1502				break;
1503			/* TCP coalescing might have appended data to the skb.
1504			 * Try to splice more frags
1505			 */
1506			if (offset + 1 != skb->len)
1507				continue;
1508		}
1509		if (tcp_hdr(skb)->fin) {
1510			sk_eat_skb(sk, skb, false);
1511			++seq;
1512			break;
1513		}
1514		sk_eat_skb(sk, skb, false);
1515		if (!desc->count)
1516			break;
1517		tp->copied_seq = seq;
1518	}
1519	tp->copied_seq = seq;
1520
1521	tcp_rcv_space_adjust(sk);
1522
1523	/* Clean up data we have read: This will do ACK frames. */
1524	if (copied > 0) {
1525		tcp_recv_skb(sk, seq, &offset);
1526		tcp_cleanup_rbuf(sk, copied);
1527	}
1528	return copied;
1529}
1530EXPORT_SYMBOL(tcp_read_sock);
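
/* Sketch of a tcp_read_sock() consumer (illustration only, not part of the
 * original file; the names below are made up): the actor is handed each skb
 * region in sequence, consumes up to desc->count bytes and returns how much
 * it used, exactly like tcp_splice_data_recv() above.
 */
#if 0	/* illustrative sketch, never compiled */
static int tcp_discard_actor(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	size_t used = min_t(size_t, len, desc->count);

	/* A real actor would copy skb data starting at @offset here. */
	desc->count -= used;
	return used;
}

static int tcp_drain_bytes(struct sock *sk, size_t bytes)
{
	read_descriptor_t rd_desc = {
		.arg.data = NULL,
		.count	  = bytes,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_discard_actor);
}
#endif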
1531
1532/*
1533 *	This routine copies from a sock struct into the user buffer.
1534 *
1535 *	Technical note: in 2.3 we work on _locked_ socket, so that
1536 *	tricks with *seq access order and skb->users are not required.
1537 *	Probably, code can be easily improved even more.
1538 */
1539
1540int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1541		size_t len, int nonblock, int flags, int *addr_len)
1542{
1543	struct tcp_sock *tp = tcp_sk(sk);
1544	int copied = 0;
1545	u32 peek_seq;
1546	u32 *seq;
1547	unsigned long used;
1548	int err;
1549	int target;		/* Read at least this many bytes */
1550	long timeo;
1551	struct task_struct *user_recv = NULL;
1552	bool copied_early = false;
1553	struct sk_buff *skb;
1554	u32 urg_hole = 0;
1555
1556	lock_sock(sk);
1557
1558	err = -ENOTCONN;
1559	if (sk->sk_state == TCP_LISTEN)
1560		goto out;
1561
1562	timeo = sock_rcvtimeo(sk, nonblock);
1563
1564	/* Urgent data needs to be handled specially. */
1565	if (flags & MSG_OOB)
1566		goto recv_urg;
1567
1568	if (unlikely(tp->repair)) {
1569		err = -EPERM;
1570		if (!(flags & MSG_PEEK))
1571			goto out;
1572
1573		if (tp->repair_queue == TCP_SEND_QUEUE)
1574			goto recv_sndq;
1575
1576		err = -EINVAL;
1577		if (tp->repair_queue == TCP_NO_QUEUE)
1578			goto out;
1579
1580		/* 'common' recv queue MSG_PEEK-ing */
1581	}
1582
1583	seq = &tp->copied_seq;
1584	if (flags & MSG_PEEK) {
1585		peek_seq = tp->copied_seq;
1586		seq = &peek_seq;
1587	}
1588
1589	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1590
1591#ifdef CONFIG_NET_DMA
1592	tp->ucopy.dma_chan = NULL;
1593	preempt_disable();
1594	skb = skb_peek_tail(&sk->sk_receive_queue);
1595	{
1596		int available = 0;
1597
1598		if (skb)
1599			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1600		if ((available < target) &&
1601		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1602		    !sysctl_tcp_low_latency &&
1603		    net_dma_find_channel()) {
1604			preempt_enable_no_resched();
1605			tp->ucopy.pinned_list =
1606					dma_pin_iovec_pages(msg->msg_iov, len);
1607		} else {
1608			preempt_enable_no_resched();
1609		}
1610	}
1611#endif
1612
1613	do {
1614		u32 offset;
1615
1616		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1617		if (tp->urg_data && tp->urg_seq == *seq) {
1618			if (copied)
1619				break;
1620			if (signal_pending(current)) {
1621				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1622				break;
1623			}
1624		}
1625
1626		/* Next get a buffer. */
1627
1628		skb_queue_walk(&sk->sk_receive_queue, skb) {
1629			/* Now that we have two receive queues this
1630			 * shouldn't happen.
1631			 */
1632			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1633				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1634				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1635				 flags))
1636				break;
1637
1638			offset = *seq - TCP_SKB_CB(skb)->seq;
1639			if (tcp_hdr(skb)->syn)
1640				offset--;
1641			if (offset < skb->len)
1642				goto found_ok_skb;
1643			if (tcp_hdr(skb)->fin)
1644				goto found_fin_ok;
1645			WARN(!(flags & MSG_PEEK),
1646			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1647			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1648		}
1649
1650		/* Well, if we have backlog, try to process it now. */
1651
1652		if (copied >= target && !sk->sk_backlog.tail)
1653			break;
1654
1655		if (copied) {
1656			if (sk->sk_err ||
1657			    sk->sk_state == TCP_CLOSE ||
1658			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1659			    !timeo ||
1660			    signal_pending(current))
1661				break;
1662		} else {
1663			if (sock_flag(sk, SOCK_DONE))
1664				break;
1665
1666			if (sk->sk_err) {
1667				copied = sock_error(sk);
1668				break;
1669			}
1670
1671			if (sk->sk_shutdown & RCV_SHUTDOWN)
1672				break;
1673
1674			if (sk->sk_state == TCP_CLOSE) {
1675				if (!sock_flag(sk, SOCK_DONE)) {
1676					/* This occurs when user tries to read
1677					 * from never connected socket.
1678					 */
1679					copied = -ENOTCONN;
1680					break;
1681				}
1682				break;
1683			}
1684
1685			if (!timeo) {
1686				copied = -EAGAIN;
1687				break;
1688			}
1689
1690			if (signal_pending(current)) {
1691				copied = sock_intr_errno(timeo);
1692				break;
1693			}
1694		}
1695
1696		tcp_cleanup_rbuf(sk, copied);
1697
1698		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1699			/* Install new reader */
1700			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1701				user_recv = current;
1702				tp->ucopy.task = user_recv;
1703				tp->ucopy.iov = msg->msg_iov;
1704			}
1705
1706			tp->ucopy.len = len;
1707
1708			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1709				!(flags & (MSG_PEEK | MSG_TRUNC)));
1710
1711			/* Ugly... If prequeue is not empty, we have to
1712			 * process it before releasing the socket, otherwise
1713			 * the order will be broken at the second iteration.
1714			 * A more elegant solution is required!!!
1715			 *
1716			 * Look: we have the following (pseudo)queues:
1717			 *
1718			 * 1. packets in flight
1719			 * 2. backlog
1720			 * 3. prequeue
1721			 * 4. receive_queue
1722			 *
1723			 * Each queue can be processed only if the next ones
1724			 * are empty. At this point we have empty receive_queue.
1725			 * But prequeue _can_ be not empty after 2nd iteration,
1726			 * when we jumped to start of loop because backlog
1727			 * processing added something to receive_queue.
1728			 * We cannot release_sock(), because backlog contains
1729			 * packets arrived _after_ prequeued ones.
1730			 *
1731			 * In short, the algorithm is clear --- process all
1732			 * the queues in order. We could do it more directly,
1733			 * requeueing packets from backlog to prequeue if it
1734			 * is not empty. That is more elegant, but eats cycles,
1735			 * unfortunately.
1736			 */
1737			if (!skb_queue_empty(&tp->ucopy.prequeue))
1738				goto do_prequeue;
1739
1740			/* __ Set realtime policy in scheduler __ */
1741		}
1742
1743#ifdef CONFIG_NET_DMA
1744		if (tp->ucopy.dma_chan) {
1745			if (tp->rcv_wnd == 0 &&
1746			    !skb_queue_empty(&sk->sk_async_wait_queue)) {
1747				tcp_service_net_dma(sk, true);
1748				tcp_cleanup_rbuf(sk, copied);
1749			} else
1750				dma_async_issue_pending(tp->ucopy.dma_chan);
1751		}
1752#endif
1753		if (copied >= target) {
1754			/* Do not sleep, just process backlog. */
1755			release_sock(sk);
1756			lock_sock(sk);
1757		} else
1758			sk_wait_data(sk, &timeo);
1759
1760#ifdef CONFIG_NET_DMA
1761		tcp_service_net_dma(sk, false);  /* Don't block */
1762		tp->ucopy.wakeup = 0;
1763#endif
1764
1765		if (user_recv) {
1766			int chunk;
1767
1768			/* __ Restore normal policy in scheduler __ */
1769
1770			if ((chunk = len - tp->ucopy.len) != 0) {
1771				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1772				len -= chunk;
1773				copied += chunk;
1774			}
1775
1776			if (tp->rcv_nxt == tp->copied_seq &&
1777			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1778do_prequeue:
1779				tcp_prequeue_process(sk);
1780
1781				if ((chunk = len - tp->ucopy.len) != 0) {
1782					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1783					len -= chunk;
1784					copied += chunk;
1785				}
1786			}
1787		}
1788		if ((flags & MSG_PEEK) &&
1789		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
1790			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1791					    current->comm,
1792					    task_pid_nr(current));
1793			peek_seq = tp->copied_seq;
1794		}
1795		continue;
1796
1797	found_ok_skb:
1798		/* Ok so how much can we use? */
1799		used = skb->len - offset;
1800		if (len < used)
1801			used = len;
1802
1803		/* Do we have urgent data here? */
1804		if (tp->urg_data) {
1805			u32 urg_offset = tp->urg_seq - *seq;
1806			if (urg_offset < used) {
1807				if (!urg_offset) {
1808					if (!sock_flag(sk, SOCK_URGINLINE)) {
1809						++*seq;
1810						urg_hole++;
1811						offset++;
1812						used--;
1813						if (!used)
1814							goto skip_copy;
1815					}
1816				} else
1817					used = urg_offset;
1818			}
1819		}
1820
1821		if (!(flags & MSG_TRUNC)) {
1822#ifdef CONFIG_NET_DMA
1823			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1824				tp->ucopy.dma_chan = net_dma_find_channel();
1825
1826			if (tp->ucopy.dma_chan) {
1827				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1828					tp->ucopy.dma_chan, skb, offset,
1829					msg->msg_iov, used,
1830					tp->ucopy.pinned_list);
1831
1832				if (tp->ucopy.dma_cookie < 0) {
1833
1834					pr_alert("%s: dma_cookie < 0\n",
1835						 __func__);
1836
1837					/* Exception. Bailout! */
1838					if (!copied)
1839						copied = -EFAULT;
1840					break;
1841				}
1842
1843				dma_async_issue_pending(tp->ucopy.dma_chan);
1844
1845				if ((offset + used) == skb->len)
1846					copied_early = true;
1847
1848			} else
1849#endif
1850			{
1851				err = skb_copy_datagram_iovec(skb, offset,
1852						msg->msg_iov, used);
1853				if (err) {
1854					/* Exception. Bailout! */
1855					if (!copied)
1856						copied = -EFAULT;
1857					break;
1858				}
1859			}
1860		}
1861
1862		*seq += used;
1863		copied += used;
1864		len -= used;
1865
1866		tcp_rcv_space_adjust(sk);
1867
1868skip_copy:
1869		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1870			tp->urg_data = 0;
1871			tcp_fast_path_check(sk);
1872		}
1873		if (used + offset < skb->len)
1874			continue;
1875
1876		if (tcp_hdr(skb)->fin)
1877			goto found_fin_ok;
1878		if (!(flags & MSG_PEEK)) {
1879			sk_eat_skb(sk, skb, copied_early);
1880			copied_early = false;
1881		}
1882		continue;
1883
1884	found_fin_ok:
1885		/* Process the FIN. */
1886		++*seq;
1887		if (!(flags & MSG_PEEK)) {
1888			sk_eat_skb(sk, skb, copied_early);
1889			copied_early = false;
1890		}
1891		break;
1892	} while (len > 0);
1893
1894	if (user_recv) {
1895		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1896			int chunk;
1897
1898			tp->ucopy.len = copied > 0 ? len : 0;
1899
1900			tcp_prequeue_process(sk);
1901
1902			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1903				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1904				len -= chunk;
1905				copied += chunk;
1906			}
1907		}
1908
1909		tp->ucopy.task = NULL;
1910		tp->ucopy.len = 0;
1911	}
1912
1913#ifdef CONFIG_NET_DMA
1914	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
1915	tp->ucopy.dma_chan = NULL;
1916
1917	if (tp->ucopy.pinned_list) {
1918		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1919		tp->ucopy.pinned_list = NULL;
1920	}
1921#endif
1922
1923	/* According to UNIX98, msg_name/msg_namelen are ignored
1924	 * on a connected socket. I was just happy when I found this 8) --ANK
1925	 */
1926
1927	/* Clean up data we have read: This will do ACK frames. */
1928	tcp_cleanup_rbuf(sk, copied);
1929
1930	release_sock(sk);
1931	return copied;
1932
1933out:
1934	release_sock(sk);
1935	return err;
1936
1937recv_urg:
1938	err = tcp_recv_urg(sk, msg, len, flags);
1939	goto out;
1940
1941recv_sndq:
1942	err = tcp_peek_sndq(sk, msg, len);
1943	goto out;
1944}
1945EXPORT_SYMBOL(tcp_recvmsg);
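/*
 * Illustrative user-space counterpart (not part of this file) of the urgent
 * data handling in tcp_recvmsg() above: with SO_OOBINLINE clear, the urgent
 * byte is skipped by the normal read path (the !SOCK_URGINLINE branch) and
 * must be fetched with MSG_OOB; with SO_OOBINLINE set it arrives in the
 * normal data stream.  The helper names are hypothetical.
 *
 *	#include <sys/socket.h>
 *
 *	static int read_oob_byte(int fd, char *out)
 *	{
 *		// Out-of-band path: only meaningful while SO_OOBINLINE is off.
 *		return recv(fd, out, 1, MSG_OOB) == 1 ? 0 : -1;
 *	}
 *
 *	static void enable_inline_urgent(int fd)
 *	{
 *		int on = 1;
 *
 *		// After this, the urgent byte is returned by plain recv()/read().
 *		setsockopt(fd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on));
 *	}
 */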
1946
1947void tcp_set_state(struct sock *sk, int state)
1948{
1949	int oldstate = sk->sk_state;
1950
1951	switch (state) {
1952	case TCP_ESTABLISHED:
1953		if (oldstate != TCP_ESTABLISHED)
1954			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1955		break;
1956
1957	case TCP_CLOSE:
1958		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1959			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1960
1961		sk->sk_prot->unhash(sk);
1962		if (inet_csk(sk)->icsk_bind_hash &&
1963		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1964			inet_put_port(sk);
1965		/* fall through */
1966	default:
1967		if (oldstate == TCP_ESTABLISHED)
1968			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1969	}
1970
1971	/* Change state AFTER socket is unhashed to avoid closed
1972	 * socket sitting in hash tables.
1973	 */
1974	sk->sk_state = state;
1975
1976#ifdef STATE_TRACE
1977	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1978#endif
1979}
1980EXPORT_SYMBOL_GPL(tcp_set_state);
1981
1982/*
1983 *	State processing on a close. This implements the state shift for
1984 *	sending our FIN frame. Note that we only send a FIN for some
1985 *	states. A shutdown() may have already sent the FIN, or we may be
1986 *	closed.
1987 */
1988
1989static const unsigned char new_state[16] = {
1990  /* current state:        new state:      action:	*/
1991  /* (Invalid)		*/ TCP_CLOSE,
1992  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1993  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1994  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1995  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1996  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1997  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1998  /* TCP_CLOSE		*/ TCP_CLOSE,
1999  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
2000  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
2001  /* TCP_LISTEN		*/ TCP_CLOSE,
2002  /* TCP_CLOSING	*/ TCP_CLOSING,
2003};
2004
2005static int tcp_close_state(struct sock *sk)
2006{
2007	int next = (int)new_state[sk->sk_state];
2008	int ns = next & TCP_STATE_MASK;
2009
2010	tcp_set_state(sk, ns);
2011
2012	return next & TCP_ACTION_FIN;
2013}
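/*
 * Worked illustration of the table above (values copied from it, not new
 * behaviour): closing an ESTABLISHED socket gives
 *
 *	next = new_state[TCP_ESTABLISHED];	-> TCP_FIN_WAIT1 | TCP_ACTION_FIN
 *	ns   = next & TCP_STATE_MASK;		-> TCP_FIN_WAIT1
 *	next & TCP_ACTION_FIN			-> non-zero, so the caller sends a FIN
 *
 * while closing a socket in TCP_SYN_SENT or TCP_LISTEN simply maps to
 * TCP_CLOSE with no FIN action.
 */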
2014
2015/*
2016 *	Shut down the sending side of a connection. Much like close except
2017 *	that we don't shut down the receiving side or sock_set_flag(sk, SOCK_DEAD).
2018 */
2019
2020void tcp_shutdown(struct sock *sk, int how)
2021{
2022	/*	We need to grab some memory, and put together a FIN,
2023	 *	and then put it into the queue to be sent.
2024	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2025	 */
2026	if (!(how & SEND_SHUTDOWN))
2027		return;
2028
2029	/* If we've already sent a FIN, or it's a closed state, skip this. */
2030	if ((1 << sk->sk_state) &
2031	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2032	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2033		/* Clear out any half completed packets.  FIN if needed. */
2034		if (tcp_close_state(sk))
2035			tcp_send_fin(sk);
2036	}
2037}
2038EXPORT_SYMBOL(tcp_shutdown);
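/*
 * User-space view (illustrative sketch, helper name hypothetical): a
 * shutdown(fd, SHUT_WR) reaches this function with SEND_SHUTDOWN set in
 * "how", queues a FIN when the state table above calls for one, and leaves
 * the receive side open so the peer's remaining data can still be read:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void half_close(int fd)
 *	{
 *		char buf[4096];
 *		ssize_t n;
 *
 *		shutdown(fd, SHUT_WR);		// we stop sending, a FIN goes out
 *		while ((n = read(fd, buf, sizeof(buf))) > 0)
 *			;			// drain whatever the peer still sends
 *		close(fd);
 *	}
 */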
2039
2040bool tcp_check_oom(struct sock *sk, int shift)
2041{
2042	bool too_many_orphans, out_of_socket_memory;
2043
2044	too_many_orphans = tcp_too_many_orphans(sk, shift);
2045	out_of_socket_memory = tcp_out_of_memory(sk);
2046
2047	if (too_many_orphans)
2048		net_info_ratelimited("too many orphaned sockets\n");
2049	if (out_of_socket_memory)
2050		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2051	return too_many_orphans || out_of_socket_memory;
2052}
2053
2054void tcp_close(struct sock *sk, long timeout)
2055{
2056	struct sk_buff *skb;
2057	int data_was_unread = 0;
2058	int state;
2059
2060	lock_sock(sk);
2061	sk->sk_shutdown = SHUTDOWN_MASK;
2062
2063	if (sk->sk_state == TCP_LISTEN) {
2064		tcp_set_state(sk, TCP_CLOSE);
2065
2066		/* Special case. */
2067		inet_csk_listen_stop(sk);
2068
2069		goto adjudge_to_death;
2070	}
2071
2072	/*  We need to flush the recv. buffs.  We do this only on the
2073	 *  descriptor close, not protocol-sourced closes, because the
2074	 *  reader process may not have drained the data yet!
2075	 */
2076	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2077		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2078			  tcp_hdr(skb)->fin;
2079		data_was_unread += len;
2080		__kfree_skb(skb);
2081	}
2082
2083	sk_mem_reclaim(sk);
2084
2085	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2086	if (sk->sk_state == TCP_CLOSE)
2087		goto adjudge_to_death;
2088
2089	/* As outlined in RFC 2525, section 2.17, we send a RST here because
2090	 * data was lost. To witness the awful effects of the old behavior of
2091	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2092	 * GET in an FTP client, suspend the process, wait for the client to
2093	 * advertise a zero window, then kill -9 the FTP client, wheee...
2094	 * Note: timeout is always zero in such a case.
2095	 */
2096	if (unlikely(tcp_sk(sk)->repair)) {
2097		sk->sk_prot->disconnect(sk, 0);
2098	} else if (data_was_unread) {
2099		/* Unread data was tossed, zap the connection. */
2100		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2101		tcp_set_state(sk, TCP_CLOSE);
2102		tcp_send_active_reset(sk, sk->sk_allocation);
2103	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2104		/* Check zero linger _after_ checking for unread data. */
2105		sk->sk_prot->disconnect(sk, 0);
2106		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2107	} else if (tcp_close_state(sk)) {
2108		/* We FIN if the application ate all the data before
2109		 * zapping the connection.
2110		 */
2111
2112		/* RED-PEN. Formally speaking, we have broken TCP state
2113		 * machine. State transitions:
2114		 *
2115		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2116		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
2117		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2118		 *
2119		 * are legal only when FIN has been sent (i.e. in window),
2120		 * rather than queued out of window. Purists blame.
2121		 *
2122		 * F.e. "RFC state" is ESTABLISHED,
2123		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2124		 *
2125		 * The visible deviations are that we sometimes
2126		 * enter the time-wait state when it is not really required
2127		 * (harmless), and that we do not send active resets when the
2128		 * specs require them (TCP_ESTABLISHED and TCP_CLOSE_WAIT, which
2129		 * look like CLOSING or LAST_ACK to Linux).
2130		 * Probably I missed some more holelets.
2131		 * 						--ANK
2132		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2133		 * in a single packet! (May consider it later but will
2134		 * probably need API support or TCP_CORK SYN-ACK until
2135		 * data is written and socket is closed.)
2136		 */
2137		tcp_send_fin(sk);
2138	}
2139
2140	sk_stream_wait_close(sk, timeout);
2141
2142adjudge_to_death:
2143	state = sk->sk_state;
2144	sock_hold(sk);
2145	sock_orphan(sk);
2146
2147	/* This is the last release_sock() in this socket's life. It will process the backlog. */
2148	release_sock(sk);
2149
2150
2151	/* Now the socket is owned by the kernel and we acquire the BH lock
2152	 * to finish the close. No need to check for user refs.
2153	 */
2154	local_bh_disable();
2155	bh_lock_sock(sk);
2156	WARN_ON(sock_owned_by_user(sk));
2157
2158	percpu_counter_inc(sk->sk_prot->orphan_count);
2159
2160	/* Have we already been destroyed by a softirq or backlog? */
2161	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2162		goto out;
2163
2164	/*	This is a (useful) BSD-style violation of the RFC. There is a
2165	 *	problem with TCP as specified, in that the other end could
2166	 *	keep a socket open forever with no application left on this end.
2167	 *	We use a 3 minute timeout (about the same as BSD) and then kill
2168	 *	our end. If they send after that then tough - BUT it is long enough
2169	 *	that we won't repeat the old "4*rto = almost no time - whoops,
2170	 *	reset" mistake.
2171	 *
2172	 *	Nope, it was not a mistake. It is really the desired behaviour,
2173	 *	e.g. on http servers, where such sockets are useless but
2174	 *	consume significant resources. Let's do it with the special
2175	 *	linger2	option.					--ANK
2176	 */
2177
2178	if (sk->sk_state == TCP_FIN_WAIT2) {
2179		struct tcp_sock *tp = tcp_sk(sk);
2180		if (tp->linger2 < 0) {
2181			tcp_set_state(sk, TCP_CLOSE);
2182			tcp_send_active_reset(sk, GFP_ATOMIC);
2183			NET_INC_STATS_BH(sock_net(sk),
2184					LINUX_MIB_TCPABORTONLINGER);
2185		} else {
2186			const int tmo = tcp_fin_time(sk);
2187
2188			if (tmo > TCP_TIMEWAIT_LEN) {
2189				inet_csk_reset_keepalive_timer(sk,
2190						tmo - TCP_TIMEWAIT_LEN);
2191			} else {
2192				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2193				goto out;
2194			}
2195		}
2196	}
2197	if (sk->sk_state != TCP_CLOSE) {
2198		sk_mem_reclaim(sk);
2199		if (tcp_check_oom(sk, 0)) {
2200			tcp_set_state(sk, TCP_CLOSE);
2201			tcp_send_active_reset(sk, GFP_ATOMIC);
2202			NET_INC_STATS_BH(sock_net(sk),
2203					LINUX_MIB_TCPABORTONMEMORY);
2204		}
2205	}
2206
2207	if (sk->sk_state == TCP_CLOSE) {
2208		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2209		/* We could get here with a non-NULL req if the socket is
2210		 * aborted (e.g., closed with unread data) before 3WHS
2211		 * finishes.
2212		 */
2213		if (req != NULL)
2214			reqsk_fastopen_remove(sk, req, false);
2215		inet_csk_destroy_sock(sk);
2216	}
2217	/* Otherwise, socket is reprieved until protocol close. */
2218
2219out:
2220	bh_unlock_sock(sk);
2221	local_bh_enable();
2222	sock_put(sk);
2223}
2224EXPORT_SYMBOL(tcp_close);
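/*
 * Illustrative user-space way (hypothetical snippet) to take the zero-linger
 * branch above, i.e. abort the connection instead of performing the normal
 * FIN handshake:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		// SOCK_LINGER with a zero sk_lingertime: close() disconnects
 *		// immediately and an active reset is sent to the peer.
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *		close(fd);
 *	}
 */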
2225
2226/* These states need RST on ABORT according to RFC793 */
2227
2228static inline bool tcp_need_reset(int state)
2229{
2230	return (1 << state) &
2231	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2232		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2233}
2234
2235int tcp_disconnect(struct sock *sk, int flags)
2236{
2237	struct inet_sock *inet = inet_sk(sk);
2238	struct inet_connection_sock *icsk = inet_csk(sk);
2239	struct tcp_sock *tp = tcp_sk(sk);
2240	int err = 0;
2241	int old_state = sk->sk_state;
2242
2243	if (old_state != TCP_CLOSE)
2244		tcp_set_state(sk, TCP_CLOSE);
2245
2246	/* ABORT function of RFC793 */
2247	if (old_state == TCP_LISTEN) {
2248		inet_csk_listen_stop(sk);
2249	} else if (unlikely(tp->repair)) {
2250		sk->sk_err = ECONNABORTED;
2251	} else if (tcp_need_reset(old_state) ||
2252		   (tp->snd_nxt != tp->write_seq &&
2253		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2254		/* The last check adjusts for the discrepancy between Linux
2255		 * and the RFC states.
2256		 */
2257		tcp_send_active_reset(sk, gfp_any());
2258		sk->sk_err = ECONNRESET;
2259	} else if (old_state == TCP_SYN_SENT)
2260		sk->sk_err = ECONNRESET;
2261
2262	tcp_clear_xmit_timers(sk);
2263	__skb_queue_purge(&sk->sk_receive_queue);
2264	tcp_write_queue_purge(sk);
2265	__skb_queue_purge(&tp->out_of_order_queue);
2266#ifdef CONFIG_NET_DMA
2267	__skb_queue_purge(&sk->sk_async_wait_queue);
2268#endif
2269
2270	inet->inet_dport = 0;
2271
2272	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2273		inet_reset_saddr(sk);
2274
2275	sk->sk_shutdown = 0;
2276	sock_reset_flag(sk, SOCK_DONE);
2277	tp->srtt = 0;
2278	if ((tp->write_seq += tp->max_window + 2) == 0)
2279		tp->write_seq = 1;
2280	icsk->icsk_backoff = 0;
2281	tp->snd_cwnd = 2;
2282	icsk->icsk_probes_out = 0;
2283	tp->packets_out = 0;
2284	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2285	tp->snd_cwnd_cnt = 0;
2286	tp->window_clamp = 0;
2287	tcp_set_ca_state(sk, TCP_CA_Open);
2288	tcp_clear_retrans(tp);
2289	inet_csk_delack_init(sk);
2290	tcp_init_send_head(sk);
2291	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2292	__sk_dst_reset(sk);
2293
2294	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2295
2296	sk->sk_error_report(sk);
2297	return err;
2298}
2299EXPORT_SYMBOL(tcp_disconnect);
2300
2301void tcp_sock_destruct(struct sock *sk)
2302{
2303	inet_sock_destruct(sk);
2304
2305	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2306}
2307
2308static inline bool tcp_can_repair_sock(const struct sock *sk)
2309{
2310	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2311		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2312}
2313
2314static int tcp_repair_options_est(struct tcp_sock *tp,
2315		struct tcp_repair_opt __user *optbuf, unsigned int len)
2316{
2317	struct tcp_repair_opt opt;
2318
2319	while (len >= sizeof(opt)) {
2320		if (copy_from_user(&opt, optbuf, sizeof(opt)))
2321			return -EFAULT;
2322
2323		optbuf++;
2324		len -= sizeof(opt);
2325
2326		switch (opt.opt_code) {
2327		case TCPOPT_MSS:
2328			tp->rx_opt.mss_clamp = opt.opt_val;
2329			break;
2330		case TCPOPT_WINDOW:
2331			{
2332				u16 snd_wscale = opt.opt_val & 0xFFFF;
2333				u16 rcv_wscale = opt.opt_val >> 16;
2334
2335				if (snd_wscale > 14 || rcv_wscale > 14)
2336					return -EFBIG;
2337
2338				tp->rx_opt.snd_wscale = snd_wscale;
2339				tp->rx_opt.rcv_wscale = rcv_wscale;
2340				tp->rx_opt.wscale_ok = 1;
2341			}
2342			break;
2343		case TCPOPT_SACK_PERM:
2344			if (opt.opt_val != 0)
2345				return -EINVAL;
2346
2347			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2348			if (sysctl_tcp_fack)
2349				tcp_enable_fack(tp);
2350			break;
2351		case TCPOPT_TIMESTAMP:
2352			if (opt.opt_val != 0)
2353				return -EINVAL;
2354
2355			tp->rx_opt.tstamp_ok = 1;
2356			break;
2357		}
2358	}
2359
2360	return 0;
2361}
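/*
 * A minimal user-space sketch of how a checkpoint/restore tool might replay
 * options through this path.  Everything here is an assumption made for
 * illustration: the helper name, the values, and the availability of the
 * TCP_REPAIR* constants and struct tcp_repair_opt via the uapi headers
 * (TCPOPT_MSS/TCPOPT_WINDOW above are the kernel's names for option kinds
 * 2 and 3, which is what a user-space tool would pass).
 *
 *	#include <linux/tcp.h>
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int replay_tcp_options(int fd)
 *	{
 *		struct tcp_repair_opt opts[2];
 *		int on = 1;
 *
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)))
 *			return -1;
 *
 *		opts[0].opt_code = 2;			// MSS, hypothetical clamp
 *		opts[0].opt_val  = 1460;
 *		opts[1].opt_code = 3;			// window scale
 *		opts[1].opt_val  = 7 | (7 << 16);	// snd_wscale | rcv_wscale << 16
 *
 *		return setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_OPTIONS,
 *				  opts, sizeof(opts));
 *	}
 *
 * Note that TCP_REPAIR_OPTIONS is only accepted while the socket is in
 * repair mode and in TCP_ESTABLISHED (see do_tcp_setsockopt() below).
 */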
2362
2363/*
2364 *	Socket option code for TCP.
2365 */
2366static int do_tcp_setsockopt(struct sock *sk, int level,
2367		int optname, char __user *optval, unsigned int optlen)
2368{
2369	struct tcp_sock *tp = tcp_sk(sk);
2370	struct inet_connection_sock *icsk = inet_csk(sk);
2371	int val;
2372	int err = 0;
2373
2374	/* These are data/string values, all the others are ints */
2375	switch (optname) {
2376	case TCP_CONGESTION: {
2377		char name[TCP_CA_NAME_MAX];
2378
2379		if (optlen < 1)
2380			return -EINVAL;
2381
2382		val = strncpy_from_user(name, optval,
2383					min_t(long, TCP_CA_NAME_MAX-1, optlen));
2384		if (val < 0)
2385			return -EFAULT;
2386		name[val] = 0;
2387
2388		lock_sock(sk);
2389		err = tcp_set_congestion_control(sk, name);
2390		release_sock(sk);
2391		return err;
2392	}
2393	default:
2394		/* integer options, handled after the switch */
2395		break;
2396	}
2397
2398	if (optlen < sizeof(int))
2399		return -EINVAL;
2400
2401	if (get_user(val, (int __user *)optval))
2402		return -EFAULT;
2403
2404	lock_sock(sk);
2405
2406	switch (optname) {
2407	case TCP_MAXSEG:
2408		/* Values greater than the interface MTU won't take effect.
2409		 * However, at the point when this call is made we typically
2410		 * don't yet know which interface is going to be used. */
2411		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2412			err = -EINVAL;
2413			break;
2414		}
2415		tp->rx_opt.user_mss = val;
2416		break;
2417
2418	case TCP_NODELAY:
2419		if (val) {
2420			/* TCP_NODELAY is weaker than TCP_CORK, so that
2421			 * this option on a corked socket is remembered, but
2422			 * it is not activated until the cork is cleared.
2423			 *
2424			 * However, when TCP_NODELAY is set we make
2425			 * an explicit push, which overrides even TCP_CORK
2426			 * for currently queued segments.
2427			 */
2428			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2429			tcp_push_pending_frames(sk);
2430		} else {
2431			tp->nonagle &= ~TCP_NAGLE_OFF;
2432		}
2433		break;
2434
2435	case TCP_THIN_LINEAR_TIMEOUTS:
2436		if (val < 0 || val > 1)
2437			err = -EINVAL;
2438		else
2439			tp->thin_lto = val;
2440		break;
2441
2442	case TCP_THIN_DUPACK:
2443		if (val < 0 || val > 1)
2444			err = -EINVAL;
2445		else
2446			tp->thin_dupack = val;
2447		if (!err && tp->thin_dupack)
2448			tcp_disable_early_retrans(tp);
2449		break;
2450
2451	case TCP_REPAIR:
2452		if (!tcp_can_repair_sock(sk))
2453			err = -EPERM;
2454		else if (val == 1) {
2455			tp->repair = 1;
2456			sk->sk_reuse = SK_FORCE_REUSE;
2457			tp->repair_queue = TCP_NO_QUEUE;
2458		} else if (val == 0) {
2459			tp->repair = 0;
2460			sk->sk_reuse = SK_NO_REUSE;
2461			tcp_send_window_probe(sk);
2462		} else
2463			err = -EINVAL;
2464
2465		break;
2466
2467	case TCP_REPAIR_QUEUE:
2468		if (!tp->repair)
2469			err = -EPERM;
2470		else if (val < TCP_QUEUES_NR)
2471			tp->repair_queue = val;
2472		else
2473			err = -EINVAL;
2474		break;
2475
2476	case TCP_QUEUE_SEQ:
2477		if (sk->sk_state != TCP_CLOSE)
2478			err = -EPERM;
2479		else if (tp->repair_queue == TCP_SEND_QUEUE)
2480			tp->write_seq = val;
2481		else if (tp->repair_queue == TCP_RECV_QUEUE)
2482			tp->rcv_nxt = val;
2483		else
2484			err = -EINVAL;
2485		break;
2486
2487	case TCP_REPAIR_OPTIONS:
2488		if (!tp->repair)
2489			err = -EINVAL;
2490		else if (sk->sk_state == TCP_ESTABLISHED)
2491			err = tcp_repair_options_est(tp,
2492					(struct tcp_repair_opt __user *)optval,
2493					optlen);
2494		else
2495			err = -EPERM;
2496		break;
2497
2498	case TCP_CORK:
2499		/* When set, indicates that non-full frames should always be
2500		 * queued. Later the user clears this option and we transmit
2501		 * any pending partial frames in the queue.  This is meant to
2502		 * be used alongside sendfile() to get properly filled frames
2503		 * when the user (for example) must write out headers with a
2504		 * write() call first and then use sendfile() to send out the
2505		 * data parts (see the user-space sketch after this function).
2506		 *
2507		 * TCP_CORK can be set together with TCP_NODELAY and it is
2508		 * stronger than TCP_NODELAY.
2509		 */
2510		if (val) {
2511			tp->nonagle |= TCP_NAGLE_CORK;
2512		} else {
2513			tp->nonagle &= ~TCP_NAGLE_CORK;
2514			if (tp->nonagle&TCP_NAGLE_OFF)
2515				tp->nonagle |= TCP_NAGLE_PUSH;
2516			tcp_push_pending_frames(sk);
2517		}
2518		break;
2519
2520	case TCP_KEEPIDLE:
2521		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2522			err = -EINVAL;
2523		else {
2524			tp->keepalive_time = val * HZ;
2525			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2526			    !((1 << sk->sk_state) &
2527			      (TCPF_CLOSE | TCPF_LISTEN))) {
2528				u32 elapsed = keepalive_time_elapsed(tp);
2529				if (tp->keepalive_time > elapsed)
2530					elapsed = tp->keepalive_time - elapsed;
2531				else
2532					elapsed = 0;
2533				inet_csk_reset_keepalive_timer(sk, elapsed);
2534			}
2535		}
2536		break;
2537	case TCP_KEEPINTVL:
2538		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2539			err = -EINVAL;
2540		else
2541			tp->keepalive_intvl = val * HZ;
2542		break;
2543	case TCP_KEEPCNT:
2544		if (val < 1 || val > MAX_TCP_KEEPCNT)
2545			err = -EINVAL;
2546		else
2547			tp->keepalive_probes = val;
2548		break;
2549	case TCP_SYNCNT:
2550		if (val < 1 || val > MAX_TCP_SYNCNT)
2551			err = -EINVAL;
2552		else
2553			icsk->icsk_syn_retries = val;
2554		break;
2555
2556	case TCP_LINGER2:
2557		if (val < 0)
2558			tp->linger2 = -1;
2559		else if (val > sysctl_tcp_fin_timeout / HZ)
2560			tp->linger2 = 0;
2561		else
2562			tp->linger2 = val * HZ;
2563		break;
2564
2565	case TCP_DEFER_ACCEPT:
2566		/* Translate value in seconds to number of retransmits */
2567		icsk->icsk_accept_queue.rskq_defer_accept =
2568			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2569					TCP_RTO_MAX / HZ);
2570		break;
2571
2572	case TCP_WINDOW_CLAMP:
2573		if (!val) {
2574			if (sk->sk_state != TCP_CLOSE) {
2575				err = -EINVAL;
2576				break;
2577			}
2578			tp->window_clamp = 0;
2579		} else
2580			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2581						SOCK_MIN_RCVBUF / 2 : val;
2582		break;
2583
2584	case TCP_QUICKACK:
2585		if (!val) {
2586			icsk->icsk_ack.pingpong = 1;
2587		} else {
2588			icsk->icsk_ack.pingpong = 0;
2589			if ((1 << sk->sk_state) &
2590			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2591			    inet_csk_ack_scheduled(sk)) {
2592				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2593				tcp_cleanup_rbuf(sk, 1);
2594				if (!(val & 1))
2595					icsk->icsk_ack.pingpong = 1;
2596			}
2597		}
2598		break;
2599
2600#ifdef CONFIG_TCP_MD5SIG
2601	case TCP_MD5SIG:
2602		/* Read the IP->Key mappings from userspace */
2603		err = tp->af_specific->md5_parse(sk, optval, optlen);
2604		break;
2605#endif
2606	case TCP_USER_TIMEOUT:
2607		/* Cap the maximum time in ms that TCP will retry/retransmit
2608		 * before giving up and aborting (ETIMEDOUT) a connection.
2609		 */
2610		if (val < 0)
2611			err = -EINVAL;
2612		else
2613			icsk->icsk_user_timeout = msecs_to_jiffies(val);
2614		break;
2615
2616	case TCP_FASTOPEN:
2617		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2618		    TCPF_LISTEN)))
2619			err = fastopen_init_queue(sk, val);
2620		else
2621			err = -EINVAL;
2622		break;
2623	case TCP_TIMESTAMP:
2624		if (!tp->repair)
2625			err = -EPERM;
2626		else
2627			tp->tsoffset = val - tcp_time_stamp;
2628		break;
2629	default:
2630		err = -ENOPROTOOPT;
2631		break;
2632	}
2633
2634	release_sock(sk);
2635	return err;
2636}
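/*
 * The user-space sketch referenced in the TCP_CORK comment above.  The
 * helper name and error handling are hypothetical; the pattern is: cork,
 * write the header, hand the payload to sendfile(), then uncork so any
 * pending partial frame is pushed out.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/sendfile.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int send_with_header(int sock, const void *hdr, size_t hlen,
 *				    int filefd, size_t filelen)
 *	{
 *		int on = 1, off = 0;
 *
 *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *		if (write(sock, hdr, hlen) < 0)
 *			return -1;
 *		if (sendfile(sock, filefd, NULL, filelen) < 0)
 *			return -1;
 *		// Clearing the cork transmits the pending partial frame.
 *		return setsockopt(sock, IPPROTO_TCP, TCP_CORK,
 *				  &off, sizeof(off));
 *	}
 */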
2637
2638int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2639		   unsigned int optlen)
2640{
2641	const struct inet_connection_sock *icsk = inet_csk(sk);
2642
2643	if (level != SOL_TCP)
2644		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2645						     optval, optlen);
2646	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2647}
2648EXPORT_SYMBOL(tcp_setsockopt);
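/*
 * Illustrative user-space sketch (hypothetical helper and values) of the
 * keepalive knobs handled in do_tcp_setsockopt() above: TCP_KEEPIDLE and
 * TCP_KEEPINTVL are in seconds, TCP_KEEPCNT is a probe count, and the
 * probing itself only runs once SO_KEEPALIVE is enabled on the socket.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static void tune_keepalive(int fd)
 *	{
 *		int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *		setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *	}
 */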
2649
2650#ifdef CONFIG_COMPAT
2651int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2652			  char __user *optval, unsigned int optlen)
2653{
2654	if (level != SOL_TCP)
2655		return inet_csk_compat_setsockopt(sk, level, optname,
2656						  optval, optlen);
2657	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2658}
2659EXPORT_SYMBOL(compat_tcp_setsockopt);
2660#endif
2661
2662/* Return information about state of tcp endpoint in API format. */
2663void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2664{
2665	const struct tcp_sock *tp = tcp_sk(sk);
2666	const struct inet_connection_sock *icsk = inet_csk(sk);
2667	u32 now = tcp_time_stamp;
2668
2669	memset(info, 0, sizeof(*info));
2670
2671	info->tcpi_state = sk->sk_state;
2672	info->tcpi_ca_state = icsk->icsk_ca_state;
2673	info->tcpi_retransmits = icsk->icsk_retransmits;
2674	info->tcpi_probes = icsk->icsk_probes_out;
2675	info->tcpi_backoff = icsk->icsk_backoff;
2676
2677	if (tp->rx_opt.tstamp_ok)
2678		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2679	if (tcp_is_sack(tp))
2680		info->tcpi_options |= TCPI_OPT_SACK;
2681	if (tp->rx_opt.wscale_ok) {
2682		info->tcpi_options |= TCPI_OPT_WSCALE;
2683		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2684		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2685	}
2686
2687	if (tp->ecn_flags & TCP_ECN_OK)
2688		info->tcpi_options |= TCPI_OPT_ECN;
2689	if (tp->ecn_flags & TCP_ECN_SEEN)
2690		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2691	if (tp->syn_data_acked)
2692		info->tcpi_options |= TCPI_OPT_SYN_DATA;
2693
2694	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2695	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2696	info->tcpi_snd_mss = tp->mss_cache;
2697	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2698
2699	if (sk->sk_state == TCP_LISTEN) {
2700		info->tcpi_unacked = sk->sk_ack_backlog;
2701		info->tcpi_sacked = sk->sk_max_ack_backlog;
2702	} else {
2703		info->tcpi_unacked = tp->packets_out;
2704		info->tcpi_sacked = tp->sacked_out;
2705	}
2706	info->tcpi_lost = tp->lost_out;
2707	info->tcpi_retrans = tp->retrans_out;
2708	info->tcpi_fackets = tp->fackets_out;
2709
2710	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2711	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2712	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2713
2714	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2715	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2716	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2717	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2718	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2719	info->tcpi_snd_cwnd = tp->snd_cwnd;
2720	info->tcpi_advmss = tp->advmss;
2721	info->tcpi_reordering = tp->reordering;
2722
2723	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2724	info->tcpi_rcv_space = tp->rcvq_space.space;
2725
2726	info->tcpi_total_retrans = tp->total_retrans;
2727}
2728EXPORT_SYMBOL_GPL(tcp_get_info);
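/*
 * A minimal user-space sketch (hypothetical helper) of consuming this data
 * through the TCP_INFO getsockopt handled below: the kernel copies at most
 * sizeof(struct tcp_info) bytes and writes back the length it actually used.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void dump_tcp_info(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *			printf("rtt %u us rttvar %u us cwnd %u retrans %u\n",
 *			       info.tcpi_rtt, info.tcpi_rttvar,
 *			       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
 *	}
 */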
2729
2730static int do_tcp_getsockopt(struct sock *sk, int level,
2731		int optname, char __user *optval, int __user *optlen)
2732{
2733	struct inet_connection_sock *icsk = inet_csk(sk);
2734	struct tcp_sock *tp = tcp_sk(sk);
2735	int val, len;
2736
2737	if (get_user(len, optlen))
2738		return -EFAULT;
2739
2740	len = min_t(unsigned int, len, sizeof(int));
2741
2742	if (len < 0)
2743		return -EINVAL;
2744
2745	switch (optname) {
2746	case TCP_MAXSEG:
2747		val = tp->mss_cache;
2748		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2749			val = tp->rx_opt.user_mss;
2750		if (tp->repair)
2751			val = tp->rx_opt.mss_clamp;
2752		break;
2753	case TCP_NODELAY:
2754		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2755		break;
2756	case TCP_CORK:
2757		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2758		break;
2759	case TCP_KEEPIDLE:
2760		val = keepalive_time_when(tp) / HZ;
2761		break;
2762	case TCP_KEEPINTVL:
2763		val = keepalive_intvl_when(tp) / HZ;
2764		break;
2765	case TCP_KEEPCNT:
2766		val = keepalive_probes(tp);
2767		break;
2768	case TCP_SYNCNT:
2769		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2770		break;
2771	case TCP_LINGER2:
2772		val = tp->linger2;
2773		if (val >= 0)
2774			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2775		break;
2776	case TCP_DEFER_ACCEPT:
2777		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2778				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2779		break;
2780	case TCP_WINDOW_CLAMP:
2781		val = tp->window_clamp;
2782		break;
2783	case TCP_INFO: {
2784		struct tcp_info info;
2785
2786		if (get_user(len, optlen))
2787			return -EFAULT;
2788
2789		tcp_get_info(sk, &info);
2790
2791		len = min_t(unsigned int, len, sizeof(info));
2792		if (put_user(len, optlen))
2793			return -EFAULT;
2794		if (copy_to_user(optval, &info, len))
2795			return -EFAULT;
2796		return 0;
2797	}
2798	case TCP_QUICKACK:
2799		val = !icsk->icsk_ack.pingpong;
2800		break;
2801
2802	case TCP_CONGESTION:
2803		if (get_user(len, optlen))
2804			return -EFAULT;
2805		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2806		if (put_user(len, optlen))
2807			return -EFAULT;
2808		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2809			return -EFAULT;
2810		return 0;
2811
2812	case TCP_THIN_LINEAR_TIMEOUTS:
2813		val = tp->thin_lto;
2814		break;
2815	case TCP_THIN_DUPACK:
2816		val = tp->thin_dupack;
2817		break;
2818
2819	case TCP_REPAIR:
2820		val = tp->repair;
2821		break;
2822
2823	case TCP_REPAIR_QUEUE:
2824		if (tp->repair)
2825			val = tp->repair_queue;
2826		else
2827			return -EINVAL;
2828		break;
2829
2830	case TCP_QUEUE_SEQ:
2831		if (tp->repair_queue == TCP_SEND_QUEUE)
2832			val = tp->write_seq;
2833		else if (tp->repair_queue == TCP_RECV_QUEUE)
2834			val = tp->rcv_nxt;
2835		else
2836			return -EINVAL;
2837		break;
2838
2839	case TCP_USER_TIMEOUT:
2840		val = jiffies_to_msecs(icsk->icsk_user_timeout);
2841		break;
2842	case TCP_TIMESTAMP:
2843		val = tcp_time_stamp + tp->tsoffset;
2844		break;
2845	default:
2846		return -ENOPROTOOPT;
2847	}
2848
2849	if (put_user(len, optlen))
2850		return -EFAULT;
2851	if (copy_to_user(optval, &val, len))
2852		return -EFAULT;
2853	return 0;
2854}
2855
2856int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2857		   int __user *optlen)
2858{
2859	struct inet_connection_sock *icsk = inet_csk(sk);
2860
2861	if (level != SOL_TCP)
2862		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2863						     optval, optlen);
2864	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2865}
2866EXPORT_SYMBOL(tcp_getsockopt);
2867
2868#ifdef CONFIG_COMPAT
2869int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2870			  char __user *optval, int __user *optlen)
2871{
2872	if (level != SOL_TCP)
2873		return inet_csk_compat_getsockopt(sk, level, optname,
2874						  optval, optlen);
2875	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2876}
2877EXPORT_SYMBOL(compat_tcp_getsockopt);
2878#endif
2879
2880struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2881	netdev_features_t features)
2882{
2883	struct sk_buff *segs = ERR_PTR(-EINVAL);
2884	struct tcphdr *th;
2885	unsigned int thlen;
2886	unsigned int seq;
2887	__be32 delta;
2888	unsigned int oldlen;
2889	unsigned int mss;
2890	struct sk_buff *gso_skb = skb;
2891	__sum16 newcheck;
2892	bool ooo_okay, copy_destructor;
2893
2894	if (!pskb_may_pull(skb, sizeof(*th)))
2895		goto out;
2896
2897	th = tcp_hdr(skb);
2898	thlen = th->doff * 4;
2899	if (thlen < sizeof(*th))
2900		goto out;
2901
2902	if (!pskb_may_pull(skb, thlen))
2903		goto out;
2904
2905	oldlen = (u16)~skb->len;
2906	__skb_pull(skb, thlen);
2907
2908	mss = skb_shinfo(skb)->gso_size;
2909	if (unlikely(skb->len <= mss))
2910		goto out;
2911
2912	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2913		/* Packet is from an untrusted source, reset gso_segs. */
2914		int type = skb_shinfo(skb)->gso_type;
2915
2916		if (unlikely(type &
2917			     ~(SKB_GSO_TCPV4 |
2918			       SKB_GSO_DODGY |
2919			       SKB_GSO_TCP_ECN |
2920			       SKB_GSO_TCPV6 |
2921			       SKB_GSO_GRE |
2922			       SKB_GSO_MPLS |
2923			       SKB_GSO_UDP_TUNNEL |
2924			       0) ||
2925			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2926			goto out;
2927
2928		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2929
2930		segs = NULL;
2931		goto out;
2932	}
2933
2934	copy_destructor = gso_skb->destructor == tcp_wfree;
2935	ooo_okay = gso_skb->ooo_okay;
2936	/* All segments but the first should have ooo_okay cleared */
2937	skb->ooo_okay = 0;
2938
2939	segs = skb_segment(skb, features);
2940	if (IS_ERR(segs))
2941		goto out;
2942
2943	/* Only the first segment might have ooo_okay set */
2944	segs->ooo_okay = ooo_okay;
2945
2946	delta = htonl(oldlen + (thlen + mss));
2947
2948	skb = segs;
2949	th = tcp_hdr(skb);
2950	seq = ntohl(th->seq);
2951
2952	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
2953					       (__force u32)delta));
2954
2955	do {
2956		th->fin = th->psh = 0;
2957		th->check = newcheck;
2958
2959		if (skb->ip_summed != CHECKSUM_PARTIAL)
2960			th->check =
2961			     csum_fold(csum_partial(skb_transport_header(skb),
2962						    thlen, skb->csum));
2963
2964		seq += mss;
2965		if (copy_destructor) {
2966			skb->destructor = gso_skb->destructor;
2967			skb->sk = gso_skb->sk;
2968			/* {tcp|sock}_wfree() use exact truesize accounting :
2969			 * sum(skb->truesize) MUST be exactly gso_skb->truesize.
2970			 * So we account mss bytes of 'true size' for each segment.
2971			 * The last segment will carry the remainder.
2972			 */
2973			skb->truesize = mss;
2974			gso_skb->truesize -= mss;
2975		}
2976		skb = skb->next;
2977		th = tcp_hdr(skb);
2978
2979		th->seq = htonl(seq);
2980		th->cwr = 0;
2981	} while (skb->next);
2982
2983	/* The following permits TCP Small Queues to work well with GSO :
2984	 * the callback to the TCP stack will be called when the last frag
2985	 * is freed at TX completion, and not right now when gso_skb
2986	 * is freed by the GSO engine.
2987	 */
2988	if (copy_destructor) {
2989		swap(gso_skb->sk, skb->sk);
2990		swap(gso_skb->destructor, skb->destructor);
2991		swap(gso_skb->truesize, skb->truesize);
2992	}
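	/* Worked illustration of the truesize bookkeeping above, with purely
	 * hypothetical numbers: if gso_skb->truesize was 12000 and it is
	 * carved into 5 segments with mss 1448, the loop charges 1448 to
	 * each of the first four segments and the swap leaves the remaining
	 * 12000 - 4 * 1448 = 6208 on the last one, so the sum of the five
	 * truesizes is still exactly 12000.
	 */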
2993
2994	delta = htonl(oldlen + (skb_tail_pointer(skb) -
2995				skb_transport_header(skb)) +
2996		      skb->data_len);
2997	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2998				(__force u32)delta));
2999	if (skb->ip_summed != CHECKSUM_PARTIAL)
3000		th->check = csum_fold(csum_partial(skb_transport_header(skb),
3001						   thlen, skb->csum));
3002
3003out:
3004	return segs;
3005}
3006EXPORT_SYMBOL(tcp_tso_segment);
3007
3008struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
3009{
3010	struct sk_buff **pp = NULL;
3011	struct sk_buff *p;
3012	struct tcphdr *th;
3013	struct tcphdr *th2;
3014	unsigned int len;
3015	unsigned int thlen;
3016	__be32 flags;
3017	unsigned int mss = 1;
3018	unsigned int hlen;
3019	unsigned int off;
3020	int flush = 1;
3021	int i;
3022
3023	off = skb_gro_offset(skb);
3024	hlen = off + sizeof(*th);
3025	th = skb_gro_header_fast(skb, off);
3026	if (skb_gro_header_hard(skb, hlen)) {
3027		th = skb_gro_header_slow(skb, hlen, off);
3028		if (unlikely(!th))
3029			goto out;
3030	}
3031
3032	thlen = th->doff * 4;
3033	if (thlen < sizeof(*th))
3034		goto out;
3035
3036	hlen = off + thlen;
3037	if (skb_gro_header_hard(skb, hlen)) {
3038		th = skb_gro_header_slow(skb, hlen, off);
3039		if (unlikely(!th))
3040			goto out;
3041	}
3042
3043	skb_gro_pull(skb, thlen);
3044
3045	len = skb_gro_len(skb);
3046	flags = tcp_flag_word(th);
3047
3048	for (; (p = *head); head = &p->next) {
3049		if (!NAPI_GRO_CB(p)->same_flow)
3050			continue;
3051
3052		th2 = tcp_hdr(p);
3053
3054		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
3055			NAPI_GRO_CB(p)->same_flow = 0;
3056			continue;
3057		}
3058
3059		goto found;
3060	}
3061
3062	goto out_check_final;
3063
3064found:
3065	flush = NAPI_GRO_CB(p)->flush;
3066	flush |= (__force int)(flags & TCP_FLAG_CWR);
3067	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
3068		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
3069	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
3070	for (i = sizeof(*th); i < thlen; i += 4)
3071		flush |= *(u32 *)((u8 *)th + i) ^
3072			 *(u32 *)((u8 *)th2 + i);
3073
3074	mss = skb_shinfo(p)->gso_size;
3075
3076	flush |= (len - 1) >= mss;
3077	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
3078
3079	if (flush || skb_gro_receive(head, skb)) {
3080		mss = 1;
3081		goto out_check_final;
3082	}
3083
3084	p = *head;
3085	th2 = tcp_hdr(p);
3086	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
3087
3088out_check_final:
3089	flush = len < mss;
3090	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
3091					TCP_FLAG_RST | TCP_FLAG_SYN |
3092					TCP_FLAG_FIN));
3093
3094	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
3095		pp = head;
3096
3097out:
3098	NAPI_GRO_CB(skb)->flush |= flush;
3099
3100	return pp;
3101}
3102EXPORT_SYMBOL(tcp_gro_receive);
3103
3104int tcp_gro_complete(struct sk_buff *skb)
3105{
3106	struct tcphdr *th = tcp_hdr(skb);
3107
3108	skb->csum_start = skb_transport_header(skb) - skb->head;
3109	skb->csum_offset = offsetof(struct tcphdr, check);
3110	skb->ip_summed = CHECKSUM_PARTIAL;
3111
3112	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
3113
3114	if (th->cwr)
3115		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
3116
3117	return 0;
3118}
3119EXPORT_SYMBOL(tcp_gro_complete);
3120
3121#ifdef CONFIG_TCP_MD5SIG
3122static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
3123static DEFINE_MUTEX(tcp_md5sig_mutex);
3124
3125static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
3126{
3127	int cpu;
3128
3129	for_each_possible_cpu(cpu) {
3130		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
3131
3132		if (p->md5_desc.tfm)
3133			crypto_free_hash(p->md5_desc.tfm);
3134	}
3135	free_percpu(pool);
3136}
3137
3138static void __tcp_alloc_md5sig_pool(void)
3139{
3140	int cpu;
3141	struct tcp_md5sig_pool __percpu *pool;
3142
3143	pool = alloc_percpu(struct tcp_md5sig_pool);
3144	if (!pool)
3145		return;
3146
3147	for_each_possible_cpu(cpu) {
3148		struct crypto_hash *hash;
3149
3150		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3151		if (IS_ERR_OR_NULL(hash))
3152			goto out_free;
3153
3154		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
3155	}
3156	/* before setting tcp_md5sig_pool, we must commit all writes
3157	 * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
3158	 */
3159	smp_wmb();
3160	tcp_md5sig_pool = pool;
3161	return;
3162out_free:
3163	__tcp_free_md5sig_pool(pool);
3164}
3165
3166bool tcp_alloc_md5sig_pool(void)
3167{
3168	if (unlikely(!tcp_md5sig_pool)) {
3169		mutex_lock(&tcp_md5sig_mutex);
3170
3171		if (!tcp_md5sig_pool)
3172			__tcp_alloc_md5sig_pool();
3173
3174		mutex_unlock(&tcp_md5sig_mutex);
3175	}
3176	return tcp_md5sig_pool != NULL;
3177}
3178EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3179
3180
3181/**
3182 *	tcp_get_md5sig_pool - get md5sig_pool for this user
3183 *
3184 *	We use a percpu structure, so if we succeed, we exit with preemption
3185 *	and BH disabled, to make sure that another thread or softirq handler
3186 *	won't try to get the same context.
3187 */
3188struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3189{
3190	struct tcp_md5sig_pool __percpu *p;
3191
3192	local_bh_disable();
3193	p = ACCESS_ONCE(tcp_md5sig_pool);
3194	if (p)
3195		return __this_cpu_ptr(p);
3196
3197	local_bh_enable();
3198	return NULL;
3199}
3200EXPORT_SYMBOL(tcp_get_md5sig_pool);
3201
3202int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3203			const struct tcphdr *th)
3204{
3205	struct scatterlist sg;
3206	struct tcphdr hdr;
3207	int err;
3208
3209	/* We are not allowed to change tcphdr, make a local copy */
3210	memcpy(&hdr, th, sizeof(hdr));
3211	hdr.check = 0;
3212
3213	/* options aren't included in the hash */
3214	sg_init_one(&sg, &hdr, sizeof(hdr));
3215	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
3216	return err;
3217}
3218EXPORT_SYMBOL(tcp_md5_hash_header);
3219
3220int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3221			  const struct sk_buff *skb, unsigned int header_len)
3222{
3223	struct scatterlist sg;
3224	const struct tcphdr *tp = tcp_hdr(skb);
3225	struct hash_desc *desc = &hp->md5_desc;
3226	unsigned int i;
3227	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3228					   skb_headlen(skb) - header_len : 0;
3229	const struct skb_shared_info *shi = skb_shinfo(skb);
3230	struct sk_buff *frag_iter;
3231
3232	sg_init_table(&sg, 1);
3233
3234	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3235	if (crypto_hash_update(desc, &sg, head_data_len))
3236		return 1;
3237
3238	for (i = 0; i < shi->nr_frags; ++i) {
3239		const struct skb_frag_struct *f = &shi->frags[i];
3240		unsigned int offset = f->page_offset;
3241		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3242
3243		sg_set_page(&sg, page, skb_frag_size(f),
3244			    offset_in_page(offset));
3245		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3246			return 1;
3247	}
3248
3249	skb_walk_frags(skb, frag_iter)
3250		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3251			return 1;
3252
3253	return 0;
3254}
3255EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3256
3257int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3258{
3259	struct scatterlist sg;
3260
3261	sg_init_one(&sg, key->key, key->keylen);
3262	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
3263}
3264EXPORT_SYMBOL(tcp_md5_hash_key);
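/*
 * Illustrative in-kernel usage of the helpers above, loosely mirroring what
 * the address-family specific MD5 code does; the pseudo-header step is
 * omitted and th/skb/key/md5_hash are assumed to be in scope:
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		crypto_hash_init(&hp->md5_desc);
 *		tcp_md5_hash_header(hp, th);
 *		tcp_md5_hash_skb_data(hp, skb, th->doff << 2);
 *		tcp_md5_hash_key(hp, key);
 *		crypto_hash_final(&hp->md5_desc, md5_hash);
 *		tcp_put_md5sig_pool();
 *	}
 */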
3265
3266#endif
3267
3268void tcp_done(struct sock *sk)
3269{
3270	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3271
3272	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3273		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3274
3275	tcp_set_state(sk, TCP_CLOSE);
3276	tcp_clear_xmit_timers(sk);
3277	if (req != NULL)
3278		reqsk_fastopen_remove(sk, req, false);
3279
3280	sk->sk_shutdown = SHUTDOWN_MASK;
3281
3282	if (!sock_flag(sk, SOCK_DEAD))
3283		sk->sk_state_change(sk);
3284	else
3285		inet_csk_destroy_sock(sk);
3286}
3287EXPORT_SYMBOL_GPL(tcp_done);
3288
3289extern struct tcp_congestion_ops tcp_reno;
3290
3291static __initdata unsigned long thash_entries;
3292static int __init set_thash_entries(char *str)
3293{
3294	ssize_t ret;
3295
3296	if (!str)
3297		return 0;
3298
3299	ret = kstrtoul(str, 0, &thash_entries);
3300	if (ret)
3301		return 0;
3302
3303	return 1;
3304}
3305__setup("thash_entries=", set_thash_entries);
3306
3307void tcp_init_mem(struct net *net)
3308{
3309	unsigned long limit = nr_free_buffer_pages() / 8;
3310	limit = max(limit, 128UL);
3311	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3312	net->ipv4.sysctl_tcp_mem[1] = limit;
3313	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3314}
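/*
 * Worked example with a hypothetical machine: if nr_free_buffer_pages()
 * returns 262144 pages (1 GiB with 4 KiB pages), then limit = 32768 and the
 * resulting sysctl_tcp_mem becomes { 24576, 32768, 49152 } pages, i.e. the
 * low/pressure/high thresholds are 3/4, 1x and 1.5x of the limit.
 */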
3315
3316void __init tcp_init(void)
3317{
3318	struct sk_buff *skb = NULL;
3319	unsigned long limit;
3320	int max_rshare, max_wshare, cnt;
3321	unsigned int i;
3322
3323	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
3324
3325	percpu_counter_init(&tcp_sockets_allocated, 0);
3326	percpu_counter_init(&tcp_orphan_count, 0);
3327	tcp_hashinfo.bind_bucket_cachep =
3328		kmem_cache_create("tcp_bind_bucket",
3329				  sizeof(struct inet_bind_bucket), 0,
3330				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3331
3332	/* Size and allocate the main established and bind bucket
3333	 * hash tables.
3334	 *
3335	 * The methodology is similar to that of the buffer cache.
3336	 */
3337	tcp_hashinfo.ehash =
3338		alloc_large_system_hash("TCP established",
3339					sizeof(struct inet_ehash_bucket),
3340					thash_entries,
3341					17, /* one slot per 128 KB of memory */
3342					0,
3343					NULL,
3344					&tcp_hashinfo.ehash_mask,
3345					0,
3346					thash_entries ? 0 : 512 * 1024);
3347	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3348		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3349		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
3350	}
3351	if (inet_ehash_locks_alloc(&tcp_hashinfo))
3352		panic("TCP: failed to alloc ehash_locks");
3353	tcp_hashinfo.bhash =
3354		alloc_large_system_hash("TCP bind",
3355					sizeof(struct inet_bind_hashbucket),
3356					tcp_hashinfo.ehash_mask + 1,
3357					17, /* one slot per 128 KB of memory */
3358					0,
3359					&tcp_hashinfo.bhash_size,
3360					NULL,
3361					0,
3362					64 * 1024);
3363	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3364	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3365		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3366		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3367	}
3368
3369
3370	cnt = tcp_hashinfo.ehash_mask + 1;
3371
3372	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3373	sysctl_tcp_max_orphans = cnt / 2;
3374	sysctl_max_syn_backlog = max(128, cnt / 256);
3375
3376	tcp_init_mem(&init_net);
3377	/* Set per-socket limits to no more than 1/128 of the pressure threshold */
3378	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3379	max_wshare = min(4UL*1024*1024, limit);
3380	max_rshare = min(6UL*1024*1024, limit);
3381
3382	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3383	sysctl_tcp_wmem[1] = 16*1024;
3384	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3385
3386	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3387	sysctl_tcp_rmem[1] = 87380;
3388	sysctl_tcp_rmem[2] = max(87380, max_rshare);
3389
3390	pr_info("Hash tables configured (established %u bind %u)\n",
3391		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3392
3393	tcp_metrics_init();
3394
3395	tcp_register_congestion_control(&tcp_reno);
3396
3397	tcp_tasklet_init();
3398}
3399