tcp.c revision 295ff7edb8f72b77d524759266f7524deae379b3
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed where wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up of retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK		our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
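
/*
 * Worked example of the state walk above (a sketch, not an exhaustive
 * diagram): an active close from ESTABLISHED sends a FIN and moves to
 * FIN_WAIT1, then to FIN_WAIT2 once that FIN is ACKed, then to TIME_WAIT
 * when the peer's FIN arrives, and finally to CLOSE after the timeout.
 * A passive close sits in CLOSE_WAIT until the application calls
 * close(), sends its FIN from LAST_ACK, and reaches CLOSE when that FIN
 * is ACKed.  If both FINs cross in flight, FIN_WAIT1 goes through
 * CLOSING instead of FIN_WAIT2.
 */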
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275
276EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
278int sysctl_tcp_mem[3];
279int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281
282EXPORT_SYMBOL(sysctl_tcp_mem);
283EXPORT_SYMBOL(sysctl_tcp_rmem);
284EXPORT_SYMBOL(sysctl_tcp_wmem);
285
286atomic_t tcp_memory_allocated;	/* Current allocated memory. */
287atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
288
289EXPORT_SYMBOL(tcp_memory_allocated);
290EXPORT_SYMBOL(tcp_sockets_allocated);
291
292/*
293 * Pressure flag: try to collapse.
294 * Technical note: it is used by multiple contexts non-atomically.
295 * All of the sk_stream_mem_schedule() machinery is of this nature: accounting
296 * is strict, actions are advisory and have some latency.
297 */
298int tcp_memory_pressure;
299
300EXPORT_SYMBOL(tcp_memory_pressure);
301
302void tcp_enter_memory_pressure(void)
303{
304	if (!tcp_memory_pressure) {
305		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306		tcp_memory_pressure = 1;
307	}
308}
309
310EXPORT_SYMBOL(tcp_enter_memory_pressure);
311
312/*
313 * LISTEN is a special case for poll..
314 */
315static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
316					       poll_table *wait)
317{
318	return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0;
319}
320
321/*
322 *	Wait for a TCP event.
323 *
324 *	Note that we don't need to lock the socket, as the upper poll layers
325 *	take care of normal races (between the test and the event) and we don't
326 *	go look at any of the socket buffers directly.
327 */
328unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
329{
330	unsigned int mask;
331	struct sock *sk = sock->sk;
332	struct tcp_sock *tp = tcp_sk(sk);
333
334	poll_wait(file, sk->sk_sleep, wait);
335	if (sk->sk_state == TCP_LISTEN)
336		return tcp_listen_poll(sk, wait);
337
338	/* Socket is not locked. We are protected from async events
339	   by poll logic and correct handling of state changes
340	   made by other threads is impossible in any case.
341	 */
342
343	mask = 0;
344	if (sk->sk_err)
345		mask = POLLERR;
346
347	/*
348	 * POLLHUP is certainly not done right. But poll() doesn't
349	 * have a notion of HUP in just one direction, and for a
350	 * socket the read side is more interesting.
351	 *
352	 * Some poll() documentation says that POLLHUP is incompatible
353	 * with the POLLOUT/POLLWR flags, so somebody should check this
354	 * all. But careful, it tends to be safer to return too many
355	 * bits than too few, and you can easily break real applications
356	 * if you don't tell them that something has hung up!
357	 *
358	 * Check-me.
359	 *
360	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
361	 * our fs/select.c). It means that after we received EOF,
362	 * poll always returns immediately, making it impossible to poll()
363	 * for write() in state CLOSE_WAIT. One solution is evident --- to set
364	 * POLLHUP if and only if shutdown has been made in both directions.
365	 * Actually, it is interesting to look at how Solaris and DUX
366	 * solve this dilemma. If POLLHUP were maskable, I would prefer
367	 * to set it on SND_SHUTDOWN. BTW the examples given
368	 * in Stevens' books assume exactly this behaviour, which explains
369	 * why POLLHUP is incompatible with POLLOUT.	--ANK
370	 *
371	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
372	 * blocking on fresh not-connected or disconnected socket. --ANK
373	 */
374	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
375		mask |= POLLHUP;
376	if (sk->sk_shutdown & RCV_SHUTDOWN)
377		mask |= POLLIN | POLLRDNORM;
378
379	/* Connected? */
380	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
381		/* Potential race condition. If the read of tp below is
382		 * reordered above the read of sk->sk_state, we can be
383		 * illegally awakened in SYN_* states. */
384		if ((tp->rcv_nxt != tp->copied_seq) &&
385		    (tp->urg_seq != tp->copied_seq ||
386		     tp->rcv_nxt != tp->copied_seq + 1 ||
387		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
388			mask |= POLLIN | POLLRDNORM;
389
390		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
391			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
392				mask |= POLLOUT | POLLWRNORM;
393			} else {  /* send SIGIO later */
394				set_bit(SOCK_ASYNC_NOSPACE,
395					&sk->sk_socket->flags);
396				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
397
398				/* Race breaker. If space is freed after
399				 * wspace test but before the flags are set,
400				 * IO signal will be lost.
401				 */
402				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
403					mask |= POLLOUT | POLLWRNORM;
404			}
405		}
406
407		if (tp->urg_data & TCP_URG_VALID)
408			mask |= POLLPRI;
409	}
410	return mask;
411}
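
/*
 * Minimal userspace sketch of how tcp_poll() is exercised (illustrative
 * only; fd is an already connected TCP socket):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *	if (poll(&pfd, 1, timeout_ms) > 0) {
 *		if (pfd.revents & (POLLERR | POLLHUP))
 *			;	// error, or both directions shut down
 *		if (pfd.revents & POLLIN)
 *			;	// data (or a pending FIN) can be read
 *		if (pfd.revents & POLLOUT)
 *			;	// at least sk_stream_min_wspace() is free
 *		if (pfd.revents & POLLPRI)
 *			;	// urgent data is pending
 *	}
 */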
412
413int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
414{
415	struct tcp_sock *tp = tcp_sk(sk);
416	int answ;
417
418	switch (cmd) {
419	case SIOCINQ:
420		if (sk->sk_state == TCP_LISTEN)
421			return -EINVAL;
422
423		lock_sock(sk);
424		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
425			answ = 0;
426		else if (sock_flag(sk, SOCK_URGINLINE) ||
427			 !tp->urg_data ||
428			 before(tp->urg_seq, tp->copied_seq) ||
429			 !before(tp->urg_seq, tp->rcv_nxt)) {
430			answ = tp->rcv_nxt - tp->copied_seq;
431
432			/* Subtract 1, if FIN is in queue. */
433			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
434				answ -=
435		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
436		} else
437			answ = tp->urg_seq - tp->copied_seq;
438		release_sock(sk);
439		break;
440	case SIOCATMARK:
441		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
442		break;
443	case SIOCOUTQ:
444		if (sk->sk_state == TCP_LISTEN)
445			return -EINVAL;
446
447		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
448			answ = 0;
449		else
450			answ = tp->write_seq - tp->snd_una;
451		break;
452	default:
453		return -ENOIOCTLCMD;
454	};
455
456	return put_user(answ, (int __user *)arg);
457}
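
/*
 * Userspace sketch of the ioctls handled above (illustrative; on Linux
 * SIOCINQ is the same value as FIONREAD):
 *
 *	int unread, unsent, at_mark;
 *
 *	ioctl(fd, SIOCINQ, &unread);	// bytes received but not yet read
 *	ioctl(fd, SIOCOUTQ, &unsent);	// bytes queued for send, not yet acked
 *	ioctl(fd, SIOCATMARK, &at_mark);	// nonzero at the urgent mark
 */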
458
459static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
460{
461	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
462	tp->pushed_seq = tp->write_seq;
463}
464
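/* Returns true once more than half of the peer's largest advertised
 * window has been queued since the last segment marked with PSH, at
 * which point a push is forced rather than coalescing further.
 */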
465static inline int forced_push(struct tcp_sock *tp)
466{
467	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
468}
469
470static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
471			      struct sk_buff *skb)
472{
473	skb->csum = 0;
474	TCP_SKB_CB(skb)->seq = tp->write_seq;
475	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
476	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
477	TCP_SKB_CB(skb)->sacked = 0;
478	skb_header_release(skb);
479	__skb_queue_tail(&sk->sk_write_queue, skb);
480	sk_charge_skb(sk, skb);
481	if (!sk->sk_send_head)
482		sk->sk_send_head = skb;
483	if (tp->nonagle & TCP_NAGLE_PUSH)
484		tp->nonagle &= ~TCP_NAGLE_PUSH;
485}
486
487static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
488				struct sk_buff *skb)
489{
490	if (flags & MSG_OOB) {
491		tp->urg_mode = 1;
492		tp->snd_up = tp->write_seq;
493		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
494	}
495}
496
497static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
498			    int mss_now, int nonagle)
499{
500	if (sk->sk_send_head) {
501		struct sk_buff *skb = sk->sk_write_queue.prev;
502		if (!(flags & MSG_MORE) || forced_push(tp))
503			tcp_mark_push(tp, skb);
504		tcp_mark_urg(tp, flags, skb);
505		__tcp_push_pending_frames(sk, tp, mss_now,
506					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
507	}
508}
509
510static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
511			 size_t psize, int flags)
512{
513	struct tcp_sock *tp = tcp_sk(sk);
514	int mss_now, size_goal;
515	int err;
516	ssize_t copied;
517	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
518
519	/* Wait for a connection to finish. */
520	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
521		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
522			goto out_err;
523
524	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
525
526	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
527	size_goal = tp->xmit_size_goal;
528	copied = 0;
529
530	err = -EPIPE;
531	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
532		goto do_error;
533
534	while (psize > 0) {
535		struct sk_buff *skb = sk->sk_write_queue.prev;
536		struct page *page = pages[poffset / PAGE_SIZE];
537		int copy, i, can_coalesce;
538		int offset = poffset % PAGE_SIZE;
539		int size = min_t(size_t, psize, PAGE_SIZE - offset);
540
541		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
542new_segment:
543			if (!sk_stream_memory_free(sk))
544				goto wait_for_sndbuf;
545
546			skb = sk_stream_alloc_pskb(sk, 0, 0,
547						   sk->sk_allocation);
548			if (!skb)
549				goto wait_for_memory;
550
551			skb_entail(sk, tp, skb);
552			copy = size_goal;
553		}
554
555		if (copy > size)
556			copy = size;
557
558		i = skb_shinfo(skb)->nr_frags;
559		can_coalesce = skb_can_coalesce(skb, i, page, offset);
560		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
561			tcp_mark_push(tp, skb);
562			goto new_segment;
563		}
564		if (sk->sk_forward_alloc < copy &&
565		    !sk_stream_mem_schedule(sk, copy, 0))
566			goto wait_for_memory;
567
568		if (can_coalesce) {
569			skb_shinfo(skb)->frags[i - 1].size += copy;
570		} else {
571			get_page(page);
572			skb_fill_page_desc(skb, i, page, offset, copy);
573		}
574
575		skb->len += copy;
576		skb->data_len += copy;
577		skb->truesize += copy;
578		sk->sk_wmem_queued += copy;
579		sk->sk_forward_alloc -= copy;
580		skb->ip_summed = CHECKSUM_HW;
581		tp->write_seq += copy;
582		TCP_SKB_CB(skb)->end_seq += copy;
583		skb_shinfo(skb)->tso_segs = 0;
584
585		if (!copied)
586			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
587
588		copied += copy;
589		poffset += copy;
590		if (!(psize -= copy))
591			goto out;
592
593		if (skb->len < mss_now || (flags & MSG_OOB))
594			continue;
595
596		if (forced_push(tp)) {
597			tcp_mark_push(tp, skb);
598			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
599		} else if (skb == sk->sk_send_head)
600			tcp_push_one(sk, mss_now);
601		continue;
602
603wait_for_sndbuf:
604		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
605wait_for_memory:
606		if (copied)
607			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
608
609		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
610			goto do_error;
611
612		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
613		size_goal = tp->xmit_size_goal;
614	}
615
616out:
617	if (copied)
618		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
619	return copied;
620
621do_error:
622	if (copied)
623		goto out;
624out_err:
625	return sk_stream_error(sk, flags, err);
626}
627
628ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
629		     size_t size, int flags)
630{
631	ssize_t res;
632	struct sock *sk = sock->sk;
633
634#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
635
636	if (!(sk->sk_route_caps & NETIF_F_SG) ||
637	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
638		return sock_no_sendpage(sock, page, offset, size, flags);
639
640#undef TCP_ZC_CSUM_FLAGS
641
642	lock_sock(sk);
643	TCP_CHECK_TIMER(sk);
644	res = do_tcp_sendpages(sk, &page, offset, size, flags);
645	TCP_CHECK_TIMER(sk);
646	release_sock(sk);
647	return res;
648}
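
/*
 * tcp_sendpage() is normally reached via sendfile(2) on a TCP socket.
 * A minimal userspace sketch (sock_fd/file_fd/count are placeholders):
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(sock_fd, file_fd, &off, count);
 *
 * When the route's device lacks SG or checksum offload, the check above
 * falls back to sock_no_sendpage(), which copies the data instead of
 * referencing the page from the skb.
 */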
649
650#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
651#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
652
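/* Choose the linear (non-paged) part of a fresh skb for tcp_sendmsg():
 * 0 when the device does TSO (payload goes straight into page frags),
 * otherwise roughly one cached MSS, capped at SKB_MAX_HEAD() when the
 * remainder of such an MSS would still fit into the page fragments.
 */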
653static inline int select_size(struct sock *sk, struct tcp_sock *tp)
654{
655	int tmp = tp->mss_cache;
656
657	if (sk->sk_route_caps & NETIF_F_SG) {
658		if (sk->sk_route_caps & NETIF_F_TSO)
659			tmp = 0;
660		else {
661			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
662
663			if (tmp >= pgbreak &&
664			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
665				tmp = pgbreak;
666		}
667	}
668
669	return tmp;
670}
671
672int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
673		size_t size)
674{
675	struct iovec *iov;
676	struct tcp_sock *tp = tcp_sk(sk);
677	struct sk_buff *skb;
678	int iovlen, flags;
679	int mss_now, size_goal;
680	int err, copied;
681	long timeo;
682
683	lock_sock(sk);
684	TCP_CHECK_TIMER(sk);
685
686	flags = msg->msg_flags;
687	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
688
689	/* Wait for a connection to finish. */
690	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
691		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
692			goto out_err;
693
694	/* This should be in poll */
695	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
696
697	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
698	size_goal = tp->xmit_size_goal;
699
700	/* Ok commence sending. */
701	iovlen = msg->msg_iovlen;
702	iov = msg->msg_iov;
703	copied = 0;
704
705	err = -EPIPE;
706	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
707		goto do_error;
708
709	while (--iovlen >= 0) {
710		int seglen = iov->iov_len;
711		unsigned char __user *from = iov->iov_base;
712
713		iov++;
714
715		while (seglen > 0) {
716			int copy;
717
718			skb = sk->sk_write_queue.prev;
719
720			if (!sk->sk_send_head ||
721			    (copy = size_goal - skb->len) <= 0) {
722
723new_segment:
724				/* Allocate new segment. If the interface is SG,
725				 * allocate skb fitting to single page.
726				 */
727				if (!sk_stream_memory_free(sk))
728					goto wait_for_sndbuf;
729
730				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
731							   0, sk->sk_allocation);
732				if (!skb)
733					goto wait_for_memory;
734
735				/*
736				 * Check whether we can use HW checksum.
737				 */
738				if (sk->sk_route_caps &
739				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
740				     NETIF_F_HW_CSUM))
741					skb->ip_summed = CHECKSUM_HW;
742
743				skb_entail(sk, tp, skb);
744				copy = size_goal;
745			}
746
747			/* Try to append data to the end of skb. */
748			if (copy > seglen)
749				copy = seglen;
750
751			/* Where to copy to? */
752			if (skb_tailroom(skb) > 0) {
753				/* We have some space in skb head. Superb! */
754				if (copy > skb_tailroom(skb))
755					copy = skb_tailroom(skb);
756				if ((err = skb_add_data(skb, from, copy)) != 0)
757					goto do_fault;
758			} else {
759				int merge = 0;
760				int i = skb_shinfo(skb)->nr_frags;
761				struct page *page = TCP_PAGE(sk);
762				int off = TCP_OFF(sk);
763
764				if (skb_can_coalesce(skb, i, page, off) &&
765				    off != PAGE_SIZE) {
766					/* We can extend the last page
767					 * fragment. */
768					merge = 1;
769				} else if (i == MAX_SKB_FRAGS ||
770					   (!i &&
771					   !(sk->sk_route_caps & NETIF_F_SG))) {
772					/* Need to add new fragment and cannot
773					 * do this because interface is non-SG,
774					 * or because all the page slots are
775					 * busy. */
776					tcp_mark_push(tp, skb);
777					goto new_segment;
778				} else if (page) {
779					if (off == PAGE_SIZE) {
780						put_page(page);
781						TCP_PAGE(sk) = page = NULL;
782					}
783				}
784
785				if (!page) {
786					/* Allocate new cache page. */
787					if (!(page = sk_stream_alloc_page(sk)))
788						goto wait_for_memory;
789					off = 0;
790				}
791
792				if (copy > PAGE_SIZE - off)
793					copy = PAGE_SIZE - off;
794
795				/* Time to copy data. We are close to
796				 * the end! */
797				err = skb_copy_to_page(sk, from, skb, page,
798						       off, copy);
799				if (err) {
800					/* If this page was new, give it to the
801					 * socket so it does not get leaked.
802					 */
803					if (!TCP_PAGE(sk)) {
804						TCP_PAGE(sk) = page;
805						TCP_OFF(sk) = 0;
806					}
807					goto do_error;
808				}
809
810				/* Update the skb. */
811				if (merge) {
812					skb_shinfo(skb)->frags[i - 1].size +=
813									copy;
814				} else {
815					skb_fill_page_desc(skb, i, page, off, copy);
816					if (TCP_PAGE(sk)) {
817						get_page(page);
818					} else if (off + copy < PAGE_SIZE) {
819						get_page(page);
820						TCP_PAGE(sk) = page;
821					}
822				}
823
824				TCP_OFF(sk) = off + copy;
825			}
826
827			if (!copied)
828				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
829
830			tp->write_seq += copy;
831			TCP_SKB_CB(skb)->end_seq += copy;
832			skb_shinfo(skb)->tso_segs = 0;
833
834			from += copy;
835			copied += copy;
836			if ((seglen -= copy) == 0 && iovlen == 0)
837				goto out;
838
839			if (skb->len < mss_now || (flags & MSG_OOB))
840				continue;
841
842			if (forced_push(tp)) {
843				tcp_mark_push(tp, skb);
844				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
845			} else if (skb == sk->sk_send_head)
846				tcp_push_one(sk, mss_now);
847			continue;
848
849wait_for_sndbuf:
850			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
851wait_for_memory:
852			if (copied)
853				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
854
855			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
856				goto do_error;
857
858			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
859			size_goal = tp->xmit_size_goal;
860		}
861	}
862
863out:
864	if (copied)
865		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
866	TCP_CHECK_TIMER(sk);
867	release_sock(sk);
868	return copied;
869
870do_fault:
871	if (!skb->len) {
872		if (sk->sk_send_head == skb)
873			sk->sk_send_head = NULL;
874		__skb_unlink(skb, &sk->sk_write_queue);
875		sk_stream_free_skb(sk, skb);
876	}
877
878do_error:
879	if (copied)
880		goto out;
881out_err:
882	err = sk_stream_error(sk, flags, err);
883	TCP_CHECK_TIMER(sk);
884	release_sock(sk);
885	return err;
886}
887
888/*
889 *	Handle reading urgent data. BSD has very simple semantics for
890 *	this, no blocking and very strange errors 8)
891 */
892
893static int tcp_recv_urg(struct sock *sk, long timeo,
894			struct msghdr *msg, int len, int flags,
895			int *addr_len)
896{
897	struct tcp_sock *tp = tcp_sk(sk);
898
899	/* No URG data to read. */
900	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
901	    tp->urg_data == TCP_URG_READ)
902		return -EINVAL;	/* Yes this is right ! */
903
904	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
905		return -ENOTCONN;
906
907	if (tp->urg_data & TCP_URG_VALID) {
908		int err = 0;
909		char c = tp->urg_data;
910
911		if (!(flags & MSG_PEEK))
912			tp->urg_data = TCP_URG_READ;
913
914		/* Read urgent data. */
915		msg->msg_flags |= MSG_OOB;
916
917		if (len > 0) {
918			if (!(flags & MSG_TRUNC))
919				err = memcpy_toiovec(msg->msg_iov, &c, 1);
920			len = 1;
921		} else
922			msg->msg_flags |= MSG_TRUNC;
923
924		return err ? -EFAULT : len;
925	}
926
927	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
928		return 0;
929
930	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
931	 * the available implementations agree in this case:
932	 * this call should never block, independent of the
933	 * blocking state of the socket.
934	 * Mike <pall@rz.uni-karlsruhe.de>
935	 */
936	return -EAGAIN;
937}
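
/*
 * Userspace view of the urgent-data path above (a sketch): with the
 * default of SO_OOBINLINE off, the single out-of-band byte is read with
 *
 *	char oob;
 *	ssize_t n = recv(fd, &oob, 1, MSG_OOB);
 *
 * which never blocks here: it yields the byte, EINVAL when there is no
 * urgent data (or it was already consumed), or EAGAIN when the urgent
 * pointer is known but the byte itself has not arrived yet.
 */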
938
939/* Clean up the receive buffer for full frames taken by the user,
940 * then send an ACK if necessary.  COPIED is the number of bytes
941 * tcp_recvmsg has given to the user so far, it speeds up the
942 * calculation of whether or not we must ACK for the sake of
943 * a window update.
944 */
945static void cleanup_rbuf(struct sock *sk, int copied)
946{
947	struct tcp_sock *tp = tcp_sk(sk);
948	int time_to_ack = 0;
949
950#if TCP_DEBUG
951	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
952
953	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
954#endif
955
956	if (inet_csk_ack_scheduled(sk)) {
957		const struct inet_connection_sock *icsk = inet_csk(sk);
958		   /* Delayed ACKs frequently hit locked sockets during bulk
959		    * receive. */
960		if (icsk->icsk_ack.blocked ||
961		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
962		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
963		    /*
964		     * If this read emptied read buffer, we send ACK, if
965		     * connection is not bidirectional, user drained
966		     * receive buffer and there was a small segment
967		     * in queue.
968		     */
969		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
970		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
971			time_to_ack = 1;
972	}
973
974	/* We send an ACK if we can now advertise a non-zero window
975	 * which has been raised "significantly".
976	 *
977	 * Even if window raised up to infinity, do not send window open ACK
978	 * in states, where we will not receive more. It is useless.
979	 */
980	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
981		__u32 rcv_window_now = tcp_receive_window(tp);
982
983		/* Optimize, __tcp_select_window() is not cheap. */
984		if (2*rcv_window_now <= tp->window_clamp) {
985			__u32 new_window = __tcp_select_window(sk);
986
987			/* Send ACK now, if this read freed lots of space
988			 * in our buffer. Certainly, new_window is the new window.
989			 * We can advertise it now if it is not less than the current one.
990			 * "Lots" means "at least twice" here.
991			 */
992			if (new_window && new_window >= 2 * rcv_window_now)
993				time_to_ack = 1;
994		}
995	}
996	if (time_to_ack)
997		tcp_send_ack(sk);
998}
999
1000static void tcp_prequeue_process(struct sock *sk)
1001{
1002	struct sk_buff *skb;
1003	struct tcp_sock *tp = tcp_sk(sk);
1004
1005	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1006
1007	/* RX process wants to run with disabled BHs, though it is not
1008	 * necessary */
1009	local_bh_disable();
1010	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1011		sk->sk_backlog_rcv(sk, skb);
1012	local_bh_enable();
1013
1014	/* Clear memory counter. */
1015	tp->ucopy.memory = 0;
1016}
1017
1018static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1019{
1020	struct sk_buff *skb;
1021	u32 offset;
1022
1023	skb_queue_walk(&sk->sk_receive_queue, skb) {
1024		offset = seq - TCP_SKB_CB(skb)->seq;
1025		if (skb->h.th->syn)
1026			offset--;
1027		if (offset < skb->len || skb->h.th->fin) {
1028			*off = offset;
1029			return skb;
1030		}
1031	}
1032	return NULL;
1033}
1034
1035/*
1036 * This routine provides an alternative to tcp_recvmsg() for routines
1037 * that would like to handle copying from skbuffs directly in 'sendfile'
1038 * fashion.
1039 * Note:
1040 *	- It is assumed that the socket was locked by the caller.
1041 *	- The routine does not block.
1042 *	- At present, there is no support for reading OOB data
1043 *	  or for 'peeking' the socket using this routine
1044 *	  (although both would be easy to implement).
1045 */
1046int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1047		  sk_read_actor_t recv_actor)
1048{
1049	struct sk_buff *skb;
1050	struct tcp_sock *tp = tcp_sk(sk);
1051	u32 seq = tp->copied_seq;
1052	u32 offset;
1053	int copied = 0;
1054
1055	if (sk->sk_state == TCP_LISTEN)
1056		return -ENOTCONN;
1057	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1058		if (offset < skb->len) {
1059			size_t used, len;
1060
1061			len = skb->len - offset;
1062			/* Stop reading if we hit a patch of urgent data */
1063			if (tp->urg_data) {
1064				u32 urg_offset = tp->urg_seq - seq;
1065				if (urg_offset < len)
1066					len = urg_offset;
1067				if (!len)
1068					break;
1069			}
1070			used = recv_actor(desc, skb, offset, len);
1071			if (used <= len) {
1072				seq += used;
1073				copied += used;
1074				offset += used;
1075			}
1076			if (offset != skb->len)
1077				break;
1078		}
1079		if (skb->h.th->fin) {
1080			sk_eat_skb(sk, skb);
1081			++seq;
1082			break;
1083		}
1084		sk_eat_skb(sk, skb);
1085		if (!desc->count)
1086			break;
1087	}
1088	tp->copied_seq = seq;
1089
1090	tcp_rcv_space_adjust(sk);
1091
1092	/* Clean up data we have read: This will do ACK frames. */
1093	if (copied)
1094		cleanup_rbuf(sk, copied);
1095	return copied;
1096}
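
/*
 * A recv_actor callback passed to tcp_read_sock() above looks roughly
 * like this (hypothetical example, not taken from this file):
 *
 *	static int my_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *			    unsigned int offset, size_t len)
 *	{
 *		size_t want = min_t(size_t, len, desc->count);
 *
 *		// consume up to 'want' bytes of skb payload at 'offset'
 *		desc->count -= want;
 *		return want;	// bytes actually consumed
 *	}
 */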
1097
1098/*
1099 *	This routine copies from a sock struct into the user buffer.
1100 *
1101 *	Technical note: in 2.3 we work on _locked_ socket, so that
1102 *	tricks with *seq access order and skb->users are not required.
1103 *	Probably, code can be easily improved even more.
1104 */
1105
1106int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1107		size_t len, int nonblock, int flags, int *addr_len)
1108{
1109	struct tcp_sock *tp = tcp_sk(sk);
1110	int copied = 0;
1111	u32 peek_seq;
1112	u32 *seq;
1113	unsigned long used;
1114	int err;
1115	int target;		/* Read at least this many bytes */
1116	long timeo;
1117	struct task_struct *user_recv = NULL;
1118
1119	lock_sock(sk);
1120
1121	TCP_CHECK_TIMER(sk);
1122
1123	err = -ENOTCONN;
1124	if (sk->sk_state == TCP_LISTEN)
1125		goto out;
1126
1127	timeo = sock_rcvtimeo(sk, nonblock);
1128
1129	/* Urgent data needs to be handled specially. */
1130	if (flags & MSG_OOB)
1131		goto recv_urg;
1132
1133	seq = &tp->copied_seq;
1134	if (flags & MSG_PEEK) {
1135		peek_seq = tp->copied_seq;
1136		seq = &peek_seq;
1137	}
1138
1139	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1140
1141	do {
1142		struct sk_buff *skb;
1143		u32 offset;
1144
1145		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1146		if (tp->urg_data && tp->urg_seq == *seq) {
1147			if (copied)
1148				break;
1149			if (signal_pending(current)) {
1150				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1151				break;
1152			}
1153		}
1154
1155		/* Next get a buffer. */
1156
1157		skb = skb_peek(&sk->sk_receive_queue);
1158		do {
1159			if (!skb)
1160				break;
1161
1162			/* Now that we have two receive queues this
1163			 * shouldn't happen.
1164			 */
1165			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1166				printk(KERN_INFO "recvmsg bug: copied %X "
1167				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1168				break;
1169			}
1170			offset = *seq - TCP_SKB_CB(skb)->seq;
1171			if (skb->h.th->syn)
1172				offset--;
1173			if (offset < skb->len)
1174				goto found_ok_skb;
1175			if (skb->h.th->fin)
1176				goto found_fin_ok;
1177			BUG_TRAP(flags & MSG_PEEK);
1178			skb = skb->next;
1179		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1180
1181		/* Well, if we have backlog, try to process it now. */
1182
1183		if (copied >= target && !sk->sk_backlog.tail)
1184			break;
1185
1186		if (copied) {
1187			if (sk->sk_err ||
1188			    sk->sk_state == TCP_CLOSE ||
1189			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1190			    !timeo ||
1191			    signal_pending(current) ||
1192			    (flags & MSG_PEEK))
1193				break;
1194		} else {
1195			if (sock_flag(sk, SOCK_DONE))
1196				break;
1197
1198			if (sk->sk_err) {
1199				copied = sock_error(sk);
1200				break;
1201			}
1202
1203			if (sk->sk_shutdown & RCV_SHUTDOWN)
1204				break;
1205
1206			if (sk->sk_state == TCP_CLOSE) {
1207				if (!sock_flag(sk, SOCK_DONE)) {
1208					/* This occurs when user tries to read
1209					 * from never connected socket.
1210					 */
1211					copied = -ENOTCONN;
1212					break;
1213				}
1214				break;
1215			}
1216
1217			if (!timeo) {
1218				copied = -EAGAIN;
1219				break;
1220			}
1221
1222			if (signal_pending(current)) {
1223				copied = sock_intr_errno(timeo);
1224				break;
1225			}
1226		}
1227
1228		cleanup_rbuf(sk, copied);
1229
1230		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1231			/* Install new reader */
1232			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1233				user_recv = current;
1234				tp->ucopy.task = user_recv;
1235				tp->ucopy.iov = msg->msg_iov;
1236			}
1237
1238			tp->ucopy.len = len;
1239
1240			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1241				 (flags & (MSG_PEEK | MSG_TRUNC)));
1242
1243			/* Ugly... If prequeue is not empty, we have to
1244			 * process it before releasing socket, otherwise
1245			 * order will be broken at second iteration.
1246			 * More elegant solution is required!!!
1247			 *
1248			 * Look: we have the following (pseudo)queues:
1249			 *
1250			 * 1. packets in flight
1251			 * 2. backlog
1252			 * 3. prequeue
1253			 * 4. receive_queue
1254			 *
1255			 * Each queue can be processed only if the next ones
1256			 * are empty. At this point we have empty receive_queue.
1257			 * But prequeue _can_ be not empty after 2nd iteration,
1258			 * when we jumped to start of loop because backlog
1259			 * processing added something to receive_queue.
1260			 * We cannot release_sock(), because backlog contains
1261			 * packets arrived _after_ prequeued ones.
1262			 *
1263			 * In short, the algorithm is clear --- process all
1264			 * the queues in order. We could do it more directly,
1265			 * requeueing packets from backlog to prequeue if it
1266			 * is not empty. That is more elegant, but eats cycles,
1267			 * unfortunately.
1268			 */
1269			if (!skb_queue_empty(&tp->ucopy.prequeue))
1270				goto do_prequeue;
1271
1272			/* __ Set realtime policy in scheduler __ */
1273		}
1274
1275		if (copied >= target) {
1276			/* Do not sleep, just process backlog. */
1277			release_sock(sk);
1278			lock_sock(sk);
1279		} else
1280			sk_wait_data(sk, &timeo);
1281
1282		if (user_recv) {
1283			int chunk;
1284
1285			/* __ Restore normal policy in scheduler __ */
1286
1287			if ((chunk = len - tp->ucopy.len) != 0) {
1288				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1289				len -= chunk;
1290				copied += chunk;
1291			}
1292
1293			if (tp->rcv_nxt == tp->copied_seq &&
1294			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1295do_prequeue:
1296				tcp_prequeue_process(sk);
1297
1298				if ((chunk = len - tp->ucopy.len) != 0) {
1299					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1300					len -= chunk;
1301					copied += chunk;
1302				}
1303			}
1304		}
1305		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1306			if (net_ratelimit())
1307				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1308				       current->comm, current->pid);
1309			peek_seq = tp->copied_seq;
1310		}
1311		continue;
1312
1313	found_ok_skb:
1314		/* Ok so how much can we use? */
1315		used = skb->len - offset;
1316		if (len < used)
1317			used = len;
1318
1319		/* Do we have urgent data here? */
1320		if (tp->urg_data) {
1321			u32 urg_offset = tp->urg_seq - *seq;
1322			if (urg_offset < used) {
1323				if (!urg_offset) {
1324					if (!sock_flag(sk, SOCK_URGINLINE)) {
1325						++*seq;
1326						offset++;
1327						used--;
1328						if (!used)
1329							goto skip_copy;
1330					}
1331				} else
1332					used = urg_offset;
1333			}
1334		}
1335
1336		if (!(flags & MSG_TRUNC)) {
1337			err = skb_copy_datagram_iovec(skb, offset,
1338						      msg->msg_iov, used);
1339			if (err) {
1340				/* Exception. Bailout! */
1341				if (!copied)
1342					copied = -EFAULT;
1343				break;
1344			}
1345		}
1346
1347		*seq += used;
1348		copied += used;
1349		len -= used;
1350
1351		tcp_rcv_space_adjust(sk);
1352
1353skip_copy:
1354		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1355			tp->urg_data = 0;
1356			tcp_fast_path_check(sk, tp);
1357		}
1358		if (used + offset < skb->len)
1359			continue;
1360
1361		if (skb->h.th->fin)
1362			goto found_fin_ok;
1363		if (!(flags & MSG_PEEK))
1364			sk_eat_skb(sk, skb);
1365		continue;
1366
1367	found_fin_ok:
1368		/* Process the FIN. */
1369		++*seq;
1370		if (!(flags & MSG_PEEK))
1371			sk_eat_skb(sk, skb);
1372		break;
1373	} while (len > 0);
1374
1375	if (user_recv) {
1376		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1377			int chunk;
1378
1379			tp->ucopy.len = copied > 0 ? len : 0;
1380
1381			tcp_prequeue_process(sk);
1382
1383			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1384				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1385				len -= chunk;
1386				copied += chunk;
1387			}
1388		}
1389
1390		tp->ucopy.task = NULL;
1391		tp->ucopy.len = 0;
1392	}
1393
1394	/* According to UNIX98, msg_name/msg_namelen are ignored
1395	 * on a connected socket. I was just happy when I found this 8) --ANK
1396	 */
1397
1398	/* Clean up data we have read: This will do ACK frames. */
1399	cleanup_rbuf(sk, copied);
1400
1401	TCP_CHECK_TIMER(sk);
1402	release_sock(sk);
1403	return copied;
1404
1405out:
1406	TCP_CHECK_TIMER(sk);
1407	release_sock(sk);
1408	return err;
1409
1410recv_urg:
1411	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1412	goto out;
1413}
1414
1415/*
1416 *	State processing on a close. This implements the state shift for
1417 *	sending our FIN frame. Note that we only send a FIN for some
1418 *	states. A shutdown() may have already sent the FIN, or we may be
1419 *	closed.
1420 */
1421
1422static unsigned char new_state[16] = {
1423  /* current state:        new state:      action:	*/
1424  /* (Invalid)		*/ TCP_CLOSE,
1425  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1426  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1427  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1428  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1429  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1430  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1431  /* TCP_CLOSE		*/ TCP_CLOSE,
1432  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1433  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1434  /* TCP_LISTEN		*/ TCP_CLOSE,
1435  /* TCP_CLOSING	*/ TCP_CLOSING,
1436};
1437
1438static int tcp_close_state(struct sock *sk)
1439{
1440	int next = (int)new_state[sk->sk_state];
1441	int ns = next & TCP_STATE_MASK;
1442
1443	tcp_set_state(sk, ns);
1444
1445	return next & TCP_ACTION_FIN;
1446}
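
/*
 * Example of the table above: closing an ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller to transmit a FIN.  Closing in TCP_CLOSE_WAIT maps
 * to TCP_LAST_ACK | TCP_ACTION_FIN in the same way.
 */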
1447
1448/*
1449 *	Shutdown the sending side of a connection. Much like close except
1450 *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1451 */
1452
1453void tcp_shutdown(struct sock *sk, int how)
1454{
1455	/*	We need to grab some memory, and put together a FIN,
1456	 *	and then put it into the queue to be sent.
1457	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1458	 */
1459	if (!(how & SEND_SHUTDOWN))
1460		return;
1461
1462	/* If we've already sent a FIN, or it's a closed state, skip this. */
1463	if ((1 << sk->sk_state) &
1464	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1465	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1466		/* Clear out any half completed packets.  FIN if needed. */
1467		if (tcp_close_state(sk))
1468			tcp_send_fin(sk);
1469	}
1470}
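
/*
 * Userspace half-close sketch (illustrative): shutdown(fd, SHUT_WR) maps
 * to SEND_SHUTDOWN and ends up here, queueing a FIN while the receive
 * side stays open:
 *
 *	shutdown(fd, SHUT_WR);			// done sending
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);		// hypothetical consumer
 *	close(fd);
 */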
1471
1472void tcp_close(struct sock *sk, long timeout)
1473{
1474	struct sk_buff *skb;
1475	int data_was_unread = 0;
1476
1477	lock_sock(sk);
1478	sk->sk_shutdown = SHUTDOWN_MASK;
1479
1480	if (sk->sk_state == TCP_LISTEN) {
1481		tcp_set_state(sk, TCP_CLOSE);
1482
1483		/* Special case. */
1484		inet_csk_listen_stop(sk);
1485
1486		goto adjudge_to_death;
1487	}
1488
1489	/*  We need to flush the recv. buffs.  We do this only on the
1490	 *  descriptor close, not protocol-sourced closes, because the
1491	 *  reader process may not have drained the data yet!
1492	 */
1493	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1494		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1495			  skb->h.th->fin;
1496		data_was_unread += len;
1497		__kfree_skb(skb);
1498	}
1499
1500	sk_stream_mem_reclaim(sk);
1501
1502	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1503	 * 3.10, we send a RST here because data was lost.  To
1504	 * witness the awful effects of the old behavior of always
1505	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1506	 * a bulk GET in an FTP client, suspend the process, wait
1507	 * for the client to advertise a zero window, then kill -9
1508	 * the FTP client, wheee...  Note: timeout is always zero
1509	 * in such a case.
1510	 */
1511	if (data_was_unread) {
1512		/* Unread data was tossed, zap the connection. */
1513		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1514		tcp_set_state(sk, TCP_CLOSE);
1515		tcp_send_active_reset(sk, GFP_KERNEL);
1516	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1517		/* Check zero linger _after_ checking for unread data. */
1518		sk->sk_prot->disconnect(sk, 0);
1519		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1520	} else if (tcp_close_state(sk)) {
1521		/* We FIN if the application ate all the data before
1522		 * zapping the connection.
1523		 */
1524
1525		/* RED-PEN. Formally speaking, we have broken TCP state
1526		 * machine. State transitions:
1527		 *
1528		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1529		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1530		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1531		 *
1532		 * are legal only when FIN has been sent (i.e. in window),
1533		 * rather than queued out of window. Purists blame.
1534		 *
1535		 * F.e. "RFC state" is ESTABLISHED,
1536		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1537		 *
1538		 * The visible deviations are that sometimes
1539		 * we enter time-wait state, when it is not required really
1540		 * (harmless), do not send active resets, when they are
1541		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1542		 * they look as CLOSING or LAST_ACK for Linux)
1543		 * Probably, I missed some more holelets.
1544		 * 						--ANK
1545		 */
1546		tcp_send_fin(sk);
1547	}
1548
1549	sk_stream_wait_close(sk, timeout);
1550
1551adjudge_to_death:
1552	/* It is the last release_sock in its life. It will remove backlog. */
1553	release_sock(sk);
1554
1555
1556	/* Now socket is owned by kernel and we acquire BH lock
1557	   to finish close. No need to check for user refs.
1558	 */
1559	local_bh_disable();
1560	bh_lock_sock(sk);
1561	BUG_TRAP(!sock_owned_by_user(sk));
1562
1563	sock_hold(sk);
1564	sock_orphan(sk);
1565
1566	/*	This is a (useful) BSD violation of the RFC. There is a
1567	 *	problem with TCP as specified in that the other end could
1568	 *	keep a socket open forever with no application left this end.
1569	 *	We use a 3 minute timeout (about the same as BSD) then kill
1570	 *	our end. If they send after that then tough - BUT: long enough
1571	 *	that we won't make the old 4*rto = almost no time - whoops
1572	 *	reset mistake.
1573	 *
1574	 *	Nope, it was not a mistake. It is really desired behaviour
1575	 *	f.e. on http servers, where such sockets are useless but
1576	 *	consume significant resources. Let's do it with a special
1577	 *	linger2	option.					--ANK
1578	 */
1579
1580	if (sk->sk_state == TCP_FIN_WAIT2) {
1581		struct tcp_sock *tp = tcp_sk(sk);
1582		if (tp->linger2 < 0) {
1583			tcp_set_state(sk, TCP_CLOSE);
1584			tcp_send_active_reset(sk, GFP_ATOMIC);
1585			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1586		} else {
1587			const int tmo = tcp_fin_time(sk);
1588
1589			if (tmo > TCP_TIMEWAIT_LEN) {
1590				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1591			} else {
1592				atomic_inc(sk->sk_prot->orphan_count);
1593				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1594				goto out;
1595			}
1596		}
1597	}
1598	if (sk->sk_state != TCP_CLOSE) {
1599		sk_stream_mem_reclaim(sk);
1600		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1601		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1602		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1603			if (net_ratelimit())
1604				printk(KERN_INFO "TCP: too many orphaned "
1605				       "sockets\n");
1606			tcp_set_state(sk, TCP_CLOSE);
1607			tcp_send_active_reset(sk, GFP_ATOMIC);
1608			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1609		}
1610	}
1611	atomic_inc(sk->sk_prot->orphan_count);
1612
1613	if (sk->sk_state == TCP_CLOSE)
1614		inet_csk_destroy_sock(sk);
1615	/* Otherwise, socket is reprieved until protocol close. */
1616
1617out:
1618	bh_unlock_sock(sk);
1619	local_bh_enable();
1620	sock_put(sk);
1621}
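
/*
 * The zero-linger branch above is how userspace aborts a connection with
 * a RST instead of the normal FIN handshake (sketch):
 *
 *	struct linger lin = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
 *	close(fd);	// disconnects immediately, sending a RST
 */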
1622
1623/* These states need RST on ABORT according to RFC793 */
1624
1625static inline int tcp_need_reset(int state)
1626{
1627	return (1 << state) &
1628	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1629		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1630}
1631
1632int tcp_disconnect(struct sock *sk, int flags)
1633{
1634	struct inet_sock *inet = inet_sk(sk);
1635	struct inet_connection_sock *icsk = inet_csk(sk);
1636	struct tcp_sock *tp = tcp_sk(sk);
1637	int err = 0;
1638	int old_state = sk->sk_state;
1639
1640	if (old_state != TCP_CLOSE)
1641		tcp_set_state(sk, TCP_CLOSE);
1642
1643	/* ABORT function of RFC793 */
1644	if (old_state == TCP_LISTEN) {
1645		inet_csk_listen_stop(sk);
1646	} else if (tcp_need_reset(old_state) ||
1647		   (tp->snd_nxt != tp->write_seq &&
1648		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1649		/* The last check adjusts for the discrepancy of Linux wrt. RFC
1650		 * states
1651		 */
1652		tcp_send_active_reset(sk, gfp_any());
1653		sk->sk_err = ECONNRESET;
1654	} else if (old_state == TCP_SYN_SENT)
1655		sk->sk_err = ECONNRESET;
1656
1657	tcp_clear_xmit_timers(sk);
1658	__skb_queue_purge(&sk->sk_receive_queue);
1659	sk_stream_writequeue_purge(sk);
1660	__skb_queue_purge(&tp->out_of_order_queue);
1661
1662	inet->dport = 0;
1663
1664	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1665		inet_reset_saddr(sk);
1666
1667	sk->sk_shutdown = 0;
1668	sock_reset_flag(sk, SOCK_DONE);
1669	tp->srtt = 0;
1670	if ((tp->write_seq += tp->max_window + 2) == 0)
1671		tp->write_seq = 1;
1672	icsk->icsk_backoff = 0;
1673	tp->snd_cwnd = 2;
1674	tp->probes_out = 0;
1675	tp->packets_out = 0;
1676	tp->snd_ssthresh = 0x7fffffff;
1677	tp->snd_cwnd_cnt = 0;
1678	tcp_set_ca_state(tp, TCP_CA_Open);
1679	tcp_clear_retrans(tp);
1680	inet_csk_delack_init(sk);
1681	sk->sk_send_head = NULL;
1682	tp->rx_opt.saw_tstamp = 0;
1683	tcp_sack_reset(&tp->rx_opt);
1684	__sk_dst_reset(sk);
1685
1686	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1687
1688	sk->sk_error_report(sk);
1689	return err;
1690}
1691
1692/*
1693 *	Socket option code for TCP.
1694 */
1695int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1696		   int optlen)
1697{
1698	struct tcp_sock *tp = tcp_sk(sk);
1699	struct inet_connection_sock *icsk = inet_csk(sk);
1700	int val;
1701	int err = 0;
1702
1703	if (level != SOL_TCP)
1704		return tp->af_specific->setsockopt(sk, level, optname,
1705						   optval, optlen);
1706
1707	/* This is a string value all the others are int's */
1708	if (optname == TCP_CONGESTION) {
1709		char name[TCP_CA_NAME_MAX];
1710
1711		if (optlen < 1)
1712			return -EINVAL;
1713
1714		val = strncpy_from_user(name, optval,
1715					min(TCP_CA_NAME_MAX-1, optlen));
1716		if (val < 0)
1717			return -EFAULT;
1718		name[val] = 0;
1719
1720		lock_sock(sk);
1721		err = tcp_set_congestion_control(tp, name);
1722		release_sock(sk);
1723		return err;
1724	}
1725
1726	if (optlen < sizeof(int))
1727		return -EINVAL;
1728
1729	if (get_user(val, (int __user *)optval))
1730		return -EFAULT;
1731
1732	lock_sock(sk);
1733
1734	switch (optname) {
1735	case TCP_MAXSEG:
1736		/* Values greater than interface MTU won't take effect. However
1737		 * at the point when this call is done we typically don't yet
1738		 * know which interface is going to be used */
1739		if (val < 8 || val > MAX_TCP_WINDOW) {
1740			err = -EINVAL;
1741			break;
1742		}
1743		tp->rx_opt.user_mss = val;
1744		break;
1745
1746	case TCP_NODELAY:
1747		if (val) {
1748			/* TCP_NODELAY is weaker than TCP_CORK, so that
1749			 * this option on corked socket is remembered, but
1750			 * it is not activated until cork is cleared.
1751			 *
1752			 * However, when TCP_NODELAY is set we make
1753			 * an explicit push, which overrides even TCP_CORK
1754			 * for currently queued segments.
1755			 */
1756			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1757			tcp_push_pending_frames(sk, tp);
1758		} else {
1759			tp->nonagle &= ~TCP_NAGLE_OFF;
1760		}
1761		break;
1762
1763	case TCP_CORK:
1764		/* When set indicates to always queue non-full frames.
1765		 * Later the user clears this option and we transmit
1766		 * any pending partial frames in the queue.  This is
1767		 * meant to be used alongside sendfile() to get properly
1768		 * filled frames when the user (for example) must write
1769		 * out headers with a write() call first and then use
1770		 * sendfile to send out the data parts.
1771		 *
1772		 * TCP_CORK can be set together with TCP_NODELAY and it is
1773		 * stronger than TCP_NODELAY.
1774		 */
1775		if (val) {
1776			tp->nonagle |= TCP_NAGLE_CORK;
1777		} else {
1778			tp->nonagle &= ~TCP_NAGLE_CORK;
1779			if (tp->nonagle&TCP_NAGLE_OFF)
1780				tp->nonagle |= TCP_NAGLE_PUSH;
1781			tcp_push_pending_frames(sk, tp);
1782		}
1783		break;
1784
1785	case TCP_KEEPIDLE:
1786		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1787			err = -EINVAL;
1788		else {
1789			tp->keepalive_time = val * HZ;
1790			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1791			    !((1 << sk->sk_state) &
1792			      (TCPF_CLOSE | TCPF_LISTEN))) {
1793				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1794				if (tp->keepalive_time > elapsed)
1795					elapsed = tp->keepalive_time - elapsed;
1796				else
1797					elapsed = 0;
1798				inet_csk_reset_keepalive_timer(sk, elapsed);
1799			}
1800		}
1801		break;
1802	case TCP_KEEPINTVL:
1803		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1804			err = -EINVAL;
1805		else
1806			tp->keepalive_intvl = val * HZ;
1807		break;
1808	case TCP_KEEPCNT:
1809		if (val < 1 || val > MAX_TCP_KEEPCNT)
1810			err = -EINVAL;
1811		else
1812			tp->keepalive_probes = val;
1813		break;
1814	case TCP_SYNCNT:
1815		if (val < 1 || val > MAX_TCP_SYNCNT)
1816			err = -EINVAL;
1817		else
1818			icsk->icsk_syn_retries = val;
1819		break;
1820
1821	case TCP_LINGER2:
1822		if (val < 0)
1823			tp->linger2 = -1;
1824		else if (val > sysctl_tcp_fin_timeout / HZ)
1825			tp->linger2 = 0;
1826		else
1827			tp->linger2 = val * HZ;
1828		break;
1829
1830	case TCP_DEFER_ACCEPT:
1831		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1832		if (val > 0) {
1833			/* Translate value in seconds to number of
1834			 * retransmits */
1835			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1836			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1837				       icsk->icsk_accept_queue.rskq_defer_accept))
1838				icsk->icsk_accept_queue.rskq_defer_accept++;
1839			icsk->icsk_accept_queue.rskq_defer_accept++;
1840		}
1841		break;
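
	/* A worked example of the translation above, assuming
	 * TCP_TIMEOUT_INIT / HZ == 3 seconds: for val == 10 the loop stops
	 * once 10 <= (3 << 2), leaving rskq_defer_accept at 2, and the
	 * final increment makes it 3.  The getsockopt() path below then
	 * reports (3 << (3 - 1)) == 12 seconds, i.e. the requested time is
	 * rounded up to the exponential SYN-ACK retransmit schedule.
	 */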
1842
1843	case TCP_WINDOW_CLAMP:
1844		if (!val) {
1845			if (sk->sk_state != TCP_CLOSE) {
1846				err = -EINVAL;
1847				break;
1848			}
1849			tp->window_clamp = 0;
1850		} else
1851			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1852						SOCK_MIN_RCVBUF / 2 : val;
1853		break;
1854
1855	case TCP_QUICKACK:
1856		if (!val) {
1857			icsk->icsk_ack.pingpong = 1;
1858		} else {
1859			icsk->icsk_ack.pingpong = 0;
1860			if ((1 << sk->sk_state) &
1861			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1862			    inet_csk_ack_scheduled(sk)) {
1863				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1864				cleanup_rbuf(sk, 1);
1865				if (!(val & 1))
1866					icsk->icsk_ack.pingpong = 1;
1867			}
1868		}
1869		break;
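
	/* An illustrative userspace sketch: TCP_QUICKACK is not permanent,
	 * it only toggles delayed-ACK ("pingpong") mode until the stack
	 * decides to re-enter it, so applications typically re-arm it
	 * around their reads:
	 *
	 *	int one = 1;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
	 */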
1870
1871	default:
1872		err = -ENOPROTOOPT;
1873		break;
1874	}
1875	release_sock(sk);
1876	return err;
1877}
1878
1879/* Return information about the state of a TCP endpoint in API format. */
1880void tcp_get_info(struct sock *sk, struct tcp_info *info)
1881{
1882	struct tcp_sock *tp = tcp_sk(sk);
1883	const struct inet_connection_sock *icsk = inet_csk(sk);
1884	u32 now = tcp_time_stamp;
1885
1886	memset(info, 0, sizeof(*info));
1887
1888	info->tcpi_state = sk->sk_state;
1889	info->tcpi_ca_state = tp->ca_state;
1890	info->tcpi_retransmits = icsk->icsk_retransmits;
1891	info->tcpi_probes = tp->probes_out;
1892	info->tcpi_backoff = icsk->icsk_backoff;
1893
1894	if (tp->rx_opt.tstamp_ok)
1895		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1896	if (tp->rx_opt.sack_ok)
1897		info->tcpi_options |= TCPI_OPT_SACK;
1898	if (tp->rx_opt.wscale_ok) {
1899		info->tcpi_options |= TCPI_OPT_WSCALE;
1900		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1901		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1902	}
1903
1904	if (tp->ecn_flags&TCP_ECN_OK)
1905		info->tcpi_options |= TCPI_OPT_ECN;
1906
1907	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1908	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1909	info->tcpi_snd_mss = tp->mss_cache;
1910	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1911
1912	info->tcpi_unacked = tp->packets_out;
1913	info->tcpi_sacked = tp->sacked_out;
1914	info->tcpi_lost = tp->lost_out;
1915	info->tcpi_retrans = tp->retrans_out;
1916	info->tcpi_fackets = tp->fackets_out;
1917
1918	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1919	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1920	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1921
1922	info->tcpi_pmtu = tp->pmtu_cookie;
1923	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1924	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1925	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1926	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1927	info->tcpi_snd_cwnd = tp->snd_cwnd;
1928	info->tcpi_advmss = tp->advmss;
1929	info->tcpi_reordering = tp->reordering;
1930
1931	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1932	info->tcpi_rcv_space = tp->rcvq_space.space;
1933
1934	info->tcpi_total_retrans = tp->total_retrans;
1935}
1936
1937EXPORT_SYMBOL_GPL(tcp_get_info);
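
/* An illustrative userspace sketch of consuming this structure through the
 * TCP_INFO getsockopt() below (assumes struct tcp_info and TCP_INFO come
 * from <netinet/tcp.h> or <linux/tcp.h>):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, cwnd %u, total retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 */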
1938
1939int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1940		   int __user *optlen)
1941{
1942	struct inet_connection_sock *icsk = inet_csk(sk);
1943	struct tcp_sock *tp = tcp_sk(sk);
1944	int val, len;
1945
1946	if (level != SOL_TCP)
1947		return tp->af_specific->getsockopt(sk, level, optname,
1948						   optval, optlen);
1949
1950	if (get_user(len, optlen))
1951		return -EFAULT;
1952
1953	if (len < 0)
1954		return -EINVAL;
1955
1956	len = min_t(unsigned int, len, sizeof(int));
1957
1958	switch (optname) {
1959	case TCP_MAXSEG:
1960		val = tp->mss_cache;
1961		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1962			val = tp->rx_opt.user_mss;
1963		break;
1964	case TCP_NODELAY:
1965		val = !!(tp->nonagle&TCP_NAGLE_OFF);
1966		break;
1967	case TCP_CORK:
1968		val = !!(tp->nonagle&TCP_NAGLE_CORK);
1969		break;
1970	case TCP_KEEPIDLE:
1971		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1972		break;
1973	case TCP_KEEPINTVL:
1974		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1975		break;
1976	case TCP_KEEPCNT:
1977		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1978		break;
1979	case TCP_SYNCNT:
1980		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1981		break;
1982	case TCP_LINGER2:
1983		val = tp->linger2;
1984		if (val >= 0)
1985			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1986		break;
1987	case TCP_DEFER_ACCEPT:
1988		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1989			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1990		break;
1991	case TCP_WINDOW_CLAMP:
1992		val = tp->window_clamp;
1993		break;
1994	case TCP_INFO: {
1995		struct tcp_info info;
1996
1997		if (get_user(len, optlen))
1998			return -EFAULT;
1999
2000		tcp_get_info(sk, &info);
2001
2002		len = min_t(unsigned int, len, sizeof(info));
2003		if (put_user(len, optlen))
2004			return -EFAULT;
2005		if (copy_to_user(optval, &info, len))
2006			return -EFAULT;
2007		return 0;
2008	}
2009	case TCP_QUICKACK:
2010		val = !icsk->icsk_ack.pingpong;
2011		break;
2012
2013	case TCP_CONGESTION:
2014		if (get_user(len, optlen))
2015			return -EFAULT;
2016		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2017		if (put_user(len, optlen))
2018			return -EFAULT;
2019		if (copy_to_user(optval, tp->ca_ops->name, len))
2020			return -EFAULT;
2021		return 0;
2022	default:
2023		return -ENOPROTOOPT;
2024	}
2025
2026	if (put_user(len, optlen))
2027		return -EFAULT;
2028	if (copy_to_user(optval, &val, len))
2029		return -EFAULT;
2030	return 0;
2031}
2032
2033
2034extern void __skb_cb_too_small_for_tcp(int, int);
2035extern struct tcp_congestion_ops tcp_reno;
2036
2037static __initdata unsigned long thash_entries;
2038static int __init set_thash_entries(char *str)
2039{
2040	if (!str)
2041		return 0;
2042	thash_entries = simple_strtoul(str, &str, 0);
2043	return 1;
2044}
2045__setup("thash_entries=", set_thash_entries);
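
/* Illustrative: the size of the established hash table can be set from the
 * kernel command line, e.g.
 *
 *	thash_entries=16384
 *
 * which is the thash_entries value consumed by tcp_init() below.
 */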
2046
2047void __init tcp_init(void)
2048{
2049	struct sk_buff *skb = NULL;
2050	int order, i;
2051
2052	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2053		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2054					   sizeof(skb->cb));
2055
2056	tcp_hashinfo.bind_bucket_cachep =
2057		kmem_cache_create("tcp_bind_bucket",
2058				  sizeof(struct inet_bind_bucket), 0,
2059				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2060	if (!tcp_hashinfo.bind_bucket_cachep)
2061		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2062
2063	/* Size and allocate the main established and bind bucket
2064	 * hash tables.
2065	 *
2066	 * The methodology is similar to that of the buffer cache.
2067	 */
2068	tcp_hashinfo.ehash =
2069		alloc_large_system_hash("TCP established",
2070					sizeof(struct inet_ehash_bucket),
2071					thash_entries,
2072					(num_physpages >= 128 * 1024) ?
2073						(25 - PAGE_SHIFT) :
2074						(27 - PAGE_SHIFT),
2075					HASH_HIGHMEM,
2076					&tcp_hashinfo.ehash_size,
2077					NULL,
2078					0);
2079	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2080	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2081		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2082		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2083	}
2084
2085	tcp_hashinfo.bhash =
2086		alloc_large_system_hash("TCP bind",
2087					sizeof(struct inet_bind_hashbucket),
2088					tcp_hashinfo.ehash_size,
2089					(num_physpages >= 128 * 1024) ?
2090						(25 - PAGE_SHIFT) :
2091						(27 - PAGE_SHIFT),
2092					HASH_HIGHMEM,
2093					&tcp_hashinfo.bhash_size,
2094					NULL,
2095					64 * 1024);
2096	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2097	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2098		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2099		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2100	}
2101
2102	/* Try to be a bit smarter and adjust defaults depending
2103	 * on available memory.
2104	 */
2105	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2106			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2107			order++)
2108		;
2109	if (order >= 4) {
2110		sysctl_local_port_range[0] = 32768;
2111		sysctl_local_port_range[1] = 61000;
2112		tcp_death_row.sysctl_max_tw_buckets = 180000;
2113		sysctl_tcp_max_orphans = 4096 << (order - 4);
2114		sysctl_max_syn_backlog = 1024;
2115	} else if (order < 3) {
2116		sysctl_local_port_range[0] = 1024 * (3 - order);
2117		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2118		sysctl_tcp_max_orphans >>= (3 - order);
2119		sysctl_max_syn_backlog = 128;
2120	}
2121	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
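
	/* A worked example of the scaling above: order ends up as the
	 * smallest n for which 2^n pages hold the bind hash table.  On a
	 * large machine (order >= 4) the ephemeral port range is widened to
	 * 32768-61000 and the time-wait/orphan/backlog limits are raised;
	 * on a small one (order < 3) those limits are scaled down instead.
	 */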
2122
2123	sysctl_tcp_mem[0] =  768 << order;
2124	sysctl_tcp_mem[1] = 1024 << order;
2125	sysctl_tcp_mem[2] = 1536 << order;
2126
2127	if (order < 3) {
2128		sysctl_tcp_wmem[2] = 64 * 1024;
2129		sysctl_tcp_rmem[0] = PAGE_SIZE;
2130		sysctl_tcp_rmem[1] = 43689;
2131		sysctl_tcp_rmem[2] = 2 * 43689;
2132	}
2133
2134	printk(KERN_INFO "TCP: Hash tables configured "
2135	       "(established %d bind %d)\n",
2136	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2137
2138	tcp_register_congestion_control(&tcp_reno);
2139}
2140
2141EXPORT_SYMBOL(tcp_close);
2142EXPORT_SYMBOL(tcp_disconnect);
2143EXPORT_SYMBOL(tcp_getsockopt);
2144EXPORT_SYMBOL(tcp_ioctl);
2145EXPORT_SYMBOL(tcp_poll);
2146EXPORT_SYMBOL(tcp_read_sock);
2147EXPORT_SYMBOL(tcp_recvmsg);
2148EXPORT_SYMBOL(tcp_sendmsg);
2149EXPORT_SYMBOL(tcp_sendpage);
2150EXPORT_SYMBOL(tcp_setsockopt);
2151EXPORT_SYMBOL(tcp_shutdown);
2152EXPORT_SYMBOL(tcp_statistics);
2153