tcp.c revision 18955cfcb2a5d75a08e0cb297f13ccfb6904de48
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed where wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up with retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK	our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
273
274atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275
276EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
278int sysctl_tcp_mem[3];
279int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281
282EXPORT_SYMBOL(sysctl_tcp_mem);
283EXPORT_SYMBOL(sysctl_tcp_rmem);
284EXPORT_SYMBOL(sysctl_tcp_wmem);
285
286atomic_t tcp_memory_allocated;	/* Current allocated memory. */
287atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
288
289EXPORT_SYMBOL(tcp_memory_allocated);
290EXPORT_SYMBOL(tcp_sockets_allocated);
291
292/*
293 * Pressure flag: try to collapse.
294 * Technical note: it is used by multiple contexts non-atomically.
295 * All of the sk_stream_mem_schedule() logic is of this nature: accounting
296 * is strict, actions are advisory and have some latency.
297 */
298int tcp_memory_pressure;
299
300EXPORT_SYMBOL(tcp_memory_pressure);
301
302void tcp_enter_memory_pressure(void)
303{
304	if (!tcp_memory_pressure) {
305		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306		tcp_memory_pressure = 1;
307	}
308}
309
310EXPORT_SYMBOL(tcp_enter_memory_pressure);
311
312/*
313 *	Wait for a TCP event.
314 *
315 *	Note that we don't need to lock the socket, as the upper poll layers
316 *	take care of normal races (between the test and the event) and we don't
317 *	go look at any of the socket buffers directly.
318 */
319unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
320{
321	unsigned int mask;
322	struct sock *sk = sock->sk;
323	struct tcp_sock *tp = tcp_sk(sk);
324
325	poll_wait(file, sk->sk_sleep, wait);
326	if (sk->sk_state == TCP_LISTEN)
327		return inet_csk_listen_poll(sk);
328
329	/* Socket is not locked. We are protected from async events
330	   by the poll logic, and correct handling of state changes
331	   made by other threads is impossible in any case.
332	 */
333
334	mask = 0;
335	if (sk->sk_err)
336		mask = POLLERR;
337
338	/*
339	 * POLLHUP is certainly not done right. But poll() doesn't
340	 * have a notion of HUP in just one direction, and for a
341	 * socket the read side is more interesting.
342	 *
343	 * Some poll() documentation says that POLLHUP is incompatible
344	 * with the POLLOUT/POLLWR flags, so somebody should check this
345	 * all. But careful, it tends to be safer to return too many
346	 * bits than too few, and you can easily break real applications
347	 * if you don't tell them that something has hung up!
348	 *
349	 * Check-me.
350	 *
351	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
352	 * our fs/select.c). It means that after we have received EOF,
353	 * poll always returns immediately, making it impossible to poll() for
354	 * write() in state CLOSE_WAIT. One solution is evident --- set POLLHUP
355	 * if and only if shutdown has been made in both directions.
356	 * Actually, it is interesting to look at how Solaris and DUX
357	 * solve this dilemma. If POLLHUP were maskable, I would prefer
358	 * to set it on SND_SHUTDOWN. BTW the examples given
359	 * in Stevens' books assume exactly this behaviour, which explains
360	 * why POLLHUP is incompatible with POLLOUT.	--ANK
361	 *
362	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
363	 * blocking on fresh not-connected or disconnected socket. --ANK
364	 */
365	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
366		mask |= POLLHUP;
367	if (sk->sk_shutdown & RCV_SHUTDOWN)
368		mask |= POLLIN | POLLRDNORM;
369
370	/* Connected? */
371	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
372		/* Potential race condition. If the read of tp below
373		 * escapes above sk->sk_state, we can be illegally awakened
374		 * in SYN_* states. */
375		if ((tp->rcv_nxt != tp->copied_seq) &&
376		    (tp->urg_seq != tp->copied_seq ||
377		     tp->rcv_nxt != tp->copied_seq + 1 ||
378		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
379			mask |= POLLIN | POLLRDNORM;
380
381		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
382			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
383				mask |= POLLOUT | POLLWRNORM;
384			} else {  /* send SIGIO later */
385				set_bit(SOCK_ASYNC_NOSPACE,
386					&sk->sk_socket->flags);
387				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
388
389				/* Race breaker. If space is freed after
390				 * wspace test but before the flags are set,
391				 * IO signal will be lost.
392				 */
393				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
394					mask |= POLLOUT | POLLWRNORM;
395			}
396		}
397
398		if (tp->urg_data & TCP_URG_VALID)
399			mask |= POLLPRI;
400	}
401	return mask;
402}
403
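/*
 *	Usage note: a minimal userspace sketch (illustrative only, not part of
 *	the kernel build) of how the event mask computed by tcp_poll() above is
 *	typically consumed.  "fd" is assumed to be a connected TCP socket;
 *	poll(2), struct pollfd and the POLL* bits are the ordinary userspace
 *	API, and the three flag variables are hypothetical application state.
 *
 *		#include <poll.h>
 *
 *		int peer_gone = 0, readable = 0, urgent = 0;
 *		struct pollfd pfd = {
 *			.fd	= fd,
 *			.events	= POLLIN | POLLOUT | POLLPRI,
 *		};
 *
 *		if (poll(&pfd, 1, 5000) > 0) {
 *			if (pfd.revents & POLLHUP)
 *				peer_gone = 1;	// both directions shut down, or TCP_CLOSE
 *			if (pfd.revents & (POLLIN | POLLRDNORM))
 *				readable = 1;	// data queued, or RCV_SHUTDOWN seen
 *			if (pfd.revents & POLLPRI)
 *				urgent = 1;	// tp->urg_data & TCP_URG_VALID
 *		}
 */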
404int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
405{
406	struct tcp_sock *tp = tcp_sk(sk);
407	int answ;
408
409	switch (cmd) {
410	case SIOCINQ:
411		if (sk->sk_state == TCP_LISTEN)
412			return -EINVAL;
413
414		lock_sock(sk);
415		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
416			answ = 0;
417		else if (sock_flag(sk, SOCK_URGINLINE) ||
418			 !tp->urg_data ||
419			 before(tp->urg_seq, tp->copied_seq) ||
420			 !before(tp->urg_seq, tp->rcv_nxt)) {
421			answ = tp->rcv_nxt - tp->copied_seq;
422
423			/* Subtract 1, if FIN is in queue. */
424			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
425				answ -=
426		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
427		} else
428			answ = tp->urg_seq - tp->copied_seq;
429		release_sock(sk);
430		break;
431	case SIOCATMARK:
432		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
433		break;
434	case SIOCOUTQ:
435		if (sk->sk_state == TCP_LISTEN)
436			return -EINVAL;
437
438		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
439			answ = 0;
440		else
441			answ = tp->write_seq - tp->snd_una;
442		break;
443	default:
444		return -ENOIOCTLCMD;
445	};
446
447	return put_user(answ, (int __user *)arg);
448}
449
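/*
 *	Usage note: the three ioctls handled above correspond to a simple
 *	userspace pattern (illustrative sketch; "fd" is assumed to be a
 *	connected TCP socket).  SIOCINQ, SIOCOUTQ and SIOCATMARK come from
 *	<linux/sockios.h>.
 *
 *		#include <sys/ioctl.h>
 *		#include <linux/sockios.h>
 *
 *		int unread, unsent, at_mark;
 *
 *		ioctl(fd, SIOCINQ, &unread);	// readable bytes, minus a queued FIN
 *		ioctl(fd, SIOCOUTQ, &unsent);	// write_seq - snd_una, not yet acked
 *		ioctl(fd, SIOCATMARK, &at_mark);// non-zero at the urgent mark
 */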
450static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
451{
452	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
453	tp->pushed_seq = tp->write_seq;
454}
455
456static inline int forced_push(struct tcp_sock *tp)
457{
458	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
459}
460
461static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
462			      struct sk_buff *skb)
463{
464	skb->csum = 0;
465	TCP_SKB_CB(skb)->seq = tp->write_seq;
466	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
467	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
468	TCP_SKB_CB(skb)->sacked = 0;
469	skb_header_release(skb);
470	__skb_queue_tail(&sk->sk_write_queue, skb);
471	sk_charge_skb(sk, skb);
472	if (!sk->sk_send_head)
473		sk->sk_send_head = skb;
474	if (tp->nonagle & TCP_NAGLE_PUSH)
475		tp->nonagle &= ~TCP_NAGLE_PUSH;
476}
477
478static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
479				struct sk_buff *skb)
480{
481	if (flags & MSG_OOB) {
482		tp->urg_mode = 1;
483		tp->snd_up = tp->write_seq;
484		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
485	}
486}
487
488static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
489			    int mss_now, int nonagle)
490{
491	if (sk->sk_send_head) {
492		struct sk_buff *skb = sk->sk_write_queue.prev;
493		if (!(flags & MSG_MORE) || forced_push(tp))
494			tcp_mark_push(tp, skb);
495		tcp_mark_urg(tp, flags, skb);
496		__tcp_push_pending_frames(sk, tp, mss_now,
497					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
498	}
499}
500
501static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
502			 size_t psize, int flags)
503{
504	struct tcp_sock *tp = tcp_sk(sk);
505	int mss_now, size_goal;
506	int err;
507	ssize_t copied;
508	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
509
510	/* Wait for a connection to finish. */
511	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
512		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
513			goto out_err;
514
515	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
516
517	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
518	size_goal = tp->xmit_size_goal;
519	copied = 0;
520
521	err = -EPIPE;
522	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
523		goto do_error;
524
525	while (psize > 0) {
526		struct sk_buff *skb = sk->sk_write_queue.prev;
527		struct page *page = pages[poffset / PAGE_SIZE];
528		int copy, i, can_coalesce;
529		int offset = poffset % PAGE_SIZE;
530		int size = min_t(size_t, psize, PAGE_SIZE - offset);
531
532		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
533new_segment:
534			if (!sk_stream_memory_free(sk))
535				goto wait_for_sndbuf;
536
537			skb = sk_stream_alloc_pskb(sk, 0, 0,
538						   sk->sk_allocation);
539			if (!skb)
540				goto wait_for_memory;
541
542			skb_entail(sk, tp, skb);
543			copy = size_goal;
544		}
545
546		if (copy > size)
547			copy = size;
548
549		i = skb_shinfo(skb)->nr_frags;
550		can_coalesce = skb_can_coalesce(skb, i, page, offset);
551		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
552			tcp_mark_push(tp, skb);
553			goto new_segment;
554		}
555		if (!sk_stream_wmem_schedule(sk, copy))
556			goto wait_for_memory;
557
558		if (can_coalesce) {
559			skb_shinfo(skb)->frags[i - 1].size += copy;
560		} else {
561			get_page(page);
562			skb_fill_page_desc(skb, i, page, offset, copy);
563		}
564
565		skb->len += copy;
566		skb->data_len += copy;
567		skb->truesize += copy;
568		sk->sk_wmem_queued += copy;
569		sk->sk_forward_alloc -= copy;
570		skb->ip_summed = CHECKSUM_HW;
571		tp->write_seq += copy;
572		TCP_SKB_CB(skb)->end_seq += copy;
573		skb_shinfo(skb)->tso_segs = 0;
574
575		if (!copied)
576			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
577
578		copied += copy;
579		poffset += copy;
580		if (!(psize -= copy))
581			goto out;
582
583		if (skb->len < mss_now || (flags & MSG_OOB))
584			continue;
585
586		if (forced_push(tp)) {
587			tcp_mark_push(tp, skb);
588			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
589		} else if (skb == sk->sk_send_head)
590			tcp_push_one(sk, mss_now);
591		continue;
592
593wait_for_sndbuf:
594		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
595wait_for_memory:
596		if (copied)
597			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
598
599		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
600			goto do_error;
601
602		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
603		size_goal = tp->xmit_size_goal;
604	}
605
606out:
607	if (copied)
608		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
609	return copied;
610
611do_error:
612	if (copied)
613		goto out;
614out_err:
615	return sk_stream_error(sk, flags, err);
616}
617
618ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
619		     size_t size, int flags)
620{
621	ssize_t res;
622	struct sock *sk = sock->sk;
623
624#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
625
626	if (!(sk->sk_route_caps & NETIF_F_SG) ||
627	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
628		return sock_no_sendpage(sock, page, offset, size, flags);
629
630#undef TCP_ZC_CSUM_FLAGS
631
632	lock_sock(sk);
633	TCP_CHECK_TIMER(sk);
634	res = do_tcp_sendpages(sk, &page, offset, size, flags);
635	TCP_CHECK_TIMER(sk);
636	release_sock(sk);
637	return res;
638}
639
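/*
 *	Usage note: tcp_sendpage() is what sendfile(2) on a TCP socket ends up
 *	calling in this kernel; without scatter-gather plus checksum offload on
 *	the route it falls back to sock_no_sendpage() above.  An illustrative
 *	userspace sketch ("sock_fd" is assumed to be a connected TCP socket and
 *	"file_fd" an open regular file):
 *
 *		#include <sys/sendfile.h>
 *
 *		off_t off = 0;
 *		ssize_t sent = sendfile(sock_fd, file_fd, &off, 65536);
 */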
640#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
641#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
642
643static inline int select_size(struct sock *sk, struct tcp_sock *tp)
644{
645	int tmp = tp->mss_cache;
646
647	if (sk->sk_route_caps & NETIF_F_SG) {
648		if (sk->sk_route_caps & NETIF_F_TSO)
649			tmp = 0;
650		else {
651			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
652
653			if (tmp >= pgbreak &&
654			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
655				tmp = pgbreak;
656		}
657	}
658
659	return tmp;
660}
661
662int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
663		size_t size)
664{
665	struct iovec *iov;
666	struct tcp_sock *tp = tcp_sk(sk);
667	struct sk_buff *skb;
668	int iovlen, flags;
669	int mss_now, size_goal;
670	int err, copied;
671	long timeo;
672
673	lock_sock(sk);
674	TCP_CHECK_TIMER(sk);
675
676	flags = msg->msg_flags;
677	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
678
679	/* Wait for a connection to finish. */
680	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
681		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
682			goto out_err;
683
684	/* This should be in poll */
685	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
686
687	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
688	size_goal = tp->xmit_size_goal;
689
690	/* Ok commence sending. */
691	iovlen = msg->msg_iovlen;
692	iov = msg->msg_iov;
693	copied = 0;
694
695	err = -EPIPE;
696	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
697		goto do_error;
698
699	while (--iovlen >= 0) {
700		int seglen = iov->iov_len;
701		unsigned char __user *from = iov->iov_base;
702
703		iov++;
704
705		while (seglen > 0) {
706			int copy;
707
708			skb = sk->sk_write_queue.prev;
709
710			if (!sk->sk_send_head ||
711			    (copy = size_goal - skb->len) <= 0) {
712
713new_segment:
714				/* Allocate new segment. If the interface is SG,
715				 * allocate skb fitting to single page.
716				 */
717				if (!sk_stream_memory_free(sk))
718					goto wait_for_sndbuf;
719
720				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
721							   0, sk->sk_allocation);
722				if (!skb)
723					goto wait_for_memory;
724
725				/*
726				 * Check whether we can use HW checksum.
727				 */
728				if (sk->sk_route_caps &
729				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
730				     NETIF_F_HW_CSUM))
731					skb->ip_summed = CHECKSUM_HW;
732
733				skb_entail(sk, tp, skb);
734				copy = size_goal;
735			}
736
737			/* Try to append data to the end of skb. */
738			if (copy > seglen)
739				copy = seglen;
740
741			/* Where to copy to? */
742			if (skb_tailroom(skb) > 0) {
743				/* We have some space in skb head. Superb! */
744				if (copy > skb_tailroom(skb))
745					copy = skb_tailroom(skb);
746				if ((err = skb_add_data(skb, from, copy)) != 0)
747					goto do_fault;
748			} else {
749				int merge = 0;
750				int i = skb_shinfo(skb)->nr_frags;
751				struct page *page = TCP_PAGE(sk);
752				int off = TCP_OFF(sk);
753
754				if (skb_can_coalesce(skb, i, page, off) &&
755				    off != PAGE_SIZE) {
756					/* We can extend the last page
757					 * fragment. */
758					merge = 1;
759				} else if (i == MAX_SKB_FRAGS ||
760					   (!i &&
761					   !(sk->sk_route_caps & NETIF_F_SG))) {
762					/* Need to add new fragment and cannot
763					 * do this because interface is non-SG,
764					 * or because all the page slots are
765					 * busy. */
766					tcp_mark_push(tp, skb);
767					goto new_segment;
768				} else if (page) {
769					if (off == PAGE_SIZE) {
770						put_page(page);
771						TCP_PAGE(sk) = page = NULL;
772						off = 0;
773					}
774				} else
775					off = 0;
776
777				if (copy > PAGE_SIZE - off)
778					copy = PAGE_SIZE - off;
779
780				if (!sk_stream_wmem_schedule(sk, copy))
781					goto wait_for_memory;
782
783				if (!page) {
784					/* Allocate new cache page. */
785					if (!(page = sk_stream_alloc_page(sk)))
786						goto wait_for_memory;
787				}
788
789				/* Time to copy data. We are close to
790				 * the end! */
791				err = skb_copy_to_page(sk, from, skb, page,
792						       off, copy);
793				if (err) {
794					/* If this page was new, give it to the
795					 * socket so it does not get leaked.
796					 */
797					if (!TCP_PAGE(sk)) {
798						TCP_PAGE(sk) = page;
799						TCP_OFF(sk) = 0;
800					}
801					goto do_error;
802				}
803
804				/* Update the skb. */
805				if (merge) {
806					skb_shinfo(skb)->frags[i - 1].size +=
807									copy;
808				} else {
809					skb_fill_page_desc(skb, i, page, off, copy);
810					if (TCP_PAGE(sk)) {
811						get_page(page);
812					} else if (off + copy < PAGE_SIZE) {
813						get_page(page);
814						TCP_PAGE(sk) = page;
815					}
816				}
817
818				TCP_OFF(sk) = off + copy;
819			}
820
821			if (!copied)
822				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
823
824			tp->write_seq += copy;
825			TCP_SKB_CB(skb)->end_seq += copy;
826			skb_shinfo(skb)->tso_segs = 0;
827
828			from += copy;
829			copied += copy;
830			if ((seglen -= copy) == 0 && iovlen == 0)
831				goto out;
832
833			if (skb->len < mss_now || (flags & MSG_OOB))
834				continue;
835
836			if (forced_push(tp)) {
837				tcp_mark_push(tp, skb);
838				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
839			} else if (skb == sk->sk_send_head)
840				tcp_push_one(sk, mss_now);
841			continue;
842
843wait_for_sndbuf:
844			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
845wait_for_memory:
846			if (copied)
847				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
848
849			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
850				goto do_error;
851
852			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
853			size_goal = tp->xmit_size_goal;
854		}
855	}
856
857out:
858	if (copied)
859		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
860	TCP_CHECK_TIMER(sk);
861	release_sock(sk);
862	return copied;
863
864do_fault:
865	if (!skb->len) {
866		if (sk->sk_send_head == skb)
867			sk->sk_send_head = NULL;
868		__skb_unlink(skb, &sk->sk_write_queue);
869		sk_stream_free_skb(sk, skb);
870	}
871
872do_error:
873	if (copied)
874		goto out;
875out_err:
876	err = sk_stream_error(sk, flags, err);
877	TCP_CHECK_TIMER(sk);
878	release_sock(sk);
879	return err;
880}
881
882/*
883 *	Handle reading urgent data. BSD has very simple semantics for
884 *	this, no blocking and very strange errors 8)
885 */
886
887static int tcp_recv_urg(struct sock *sk, long timeo,
888			struct msghdr *msg, int len, int flags,
889			int *addr_len)
890{
891	struct tcp_sock *tp = tcp_sk(sk);
892
893	/* No URG data to read. */
894	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
895	    tp->urg_data == TCP_URG_READ)
896		return -EINVAL;	/* Yes this is right ! */
897
898	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
899		return -ENOTCONN;
900
901	if (tp->urg_data & TCP_URG_VALID) {
902		int err = 0;
903		char c = tp->urg_data;
904
905		if (!(flags & MSG_PEEK))
906			tp->urg_data = TCP_URG_READ;
907
908		/* Read urgent data. */
909		msg->msg_flags |= MSG_OOB;
910
911		if (len > 0) {
912			if (!(flags & MSG_TRUNC))
913				err = memcpy_toiovec(msg->msg_iov, &c, 1);
914			len = 1;
915		} else
916			msg->msg_flags |= MSG_TRUNC;
917
918		return err ? -EFAULT : len;
919	}
920
921	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
922		return 0;
923
924	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
925	 * the available implementations agree in this case:
926	 * this call should never block, independent of the
927	 * blocking state of the socket.
928	 * Mike <pall@rz.uni-karlsruhe.de>
929	 */
930	return -EAGAIN;
931}
932
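/*
 *	Usage note: an illustrative userspace sketch of the semantics coded
 *	above ("fd" is assumed to be a connected TCP socket without
 *	SO_OOBINLINE).  The call never blocks, whatever the socket's blocking
 *	state:
 *
 *		#include <sys/socket.h>
 *		#include <errno.h>
 *
 *		char c;
 *		ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 *		// n == 1              : the single byte of urgent data
 *		// n < 0, EINVAL       : no urgent data, already read, or OOB inline
 *		// n < 0, EAGAIN       : urgent pointer seen, data not yet arrived
 *		// n < 0, ENOTCONN / 0 : never connected, closed or shut down
 */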
933/* Clean up the receive buffer for full frames taken by the user,
934 * then send an ACK if necessary.  COPIED is the number of bytes
935 * tcp_recvmsg has given to the user so far; it speeds up the
936 * calculation of whether or not we must ACK for the sake of
937 * a window update.
938 */
939static void cleanup_rbuf(struct sock *sk, int copied)
940{
941	struct tcp_sock *tp = tcp_sk(sk);
942	int time_to_ack = 0;
943
944#if TCP_DEBUG
945	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
946
947	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
948#endif
949
950	if (inet_csk_ack_scheduled(sk)) {
951		const struct inet_connection_sock *icsk = inet_csk(sk);
952		   /* Delayed ACKs frequently hit locked sockets during bulk
953		    * receive. */
954		if (icsk->icsk_ack.blocked ||
955		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
956		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
957		    /*
958		     * If this read emptied the read buffer, we send an ACK when
959		     * the connection is not bidirectional, the user has drained
960		     * the receive buffer, and there was a small segment
961		     * in the queue.
962		     */
963		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
964		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
965			time_to_ack = 1;
966	}
967
968	/* We send an ACK if we can now advertise a non-zero window
969	 * which has been raised "significantly".
970	 *
971 * Even if the window is raised up to infinity, do not send a window open ACK
972 * in states where we will not receive more data. It is useless.
973	 */
974	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
975		__u32 rcv_window_now = tcp_receive_window(tp);
976
977		/* Optimize, __tcp_select_window() is not cheap. */
978		if (2*rcv_window_now <= tp->window_clamp) {
979			__u32 new_window = __tcp_select_window(sk);
980
981			/* Send ACK now, if this read freed lots of space
982			 * in our buffer. Certainly, new_window is the new window.
983			 * We can advertise it now if it is not less than the current one.
984			 * "Lots" means "at least twice" here.
985			 */
986			if (new_window && new_window >= 2 * rcv_window_now)
987				time_to_ack = 1;
988		}
989	}
990	if (time_to_ack)
991		tcp_send_ack(sk);
992}
993
994static void tcp_prequeue_process(struct sock *sk)
995{
996	struct sk_buff *skb;
997	struct tcp_sock *tp = tcp_sk(sk);
998
999	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1000
1001	/* RX process wants to run with disabled BHs, though it is not
1002	 * necessary */
1003	local_bh_disable();
1004	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1005		sk->sk_backlog_rcv(sk, skb);
1006	local_bh_enable();
1007
1008	/* Clear memory counter. */
1009	tp->ucopy.memory = 0;
1010}
1011
1012static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1013{
1014	struct sk_buff *skb;
1015	u32 offset;
1016
1017	skb_queue_walk(&sk->sk_receive_queue, skb) {
1018		offset = seq - TCP_SKB_CB(skb)->seq;
1019		if (skb->h.th->syn)
1020			offset--;
1021		if (offset < skb->len || skb->h.th->fin) {
1022			*off = offset;
1023			return skb;
1024		}
1025	}
1026	return NULL;
1027}
1028
1029/*
1030 * This routine provides an alternative to tcp_recvmsg() for routines
1031 * that would like to handle copying from skbuffs directly in 'sendfile'
1032 * fashion.
1033 * Note:
1034 *	- It is assumed that the socket was locked by the caller.
1035 *	- The routine does not block.
1036 *	- At present, there is no support for reading OOB data
1037 *	  or for 'peeking' the socket using this routine
1038 *	  (although both would be easy to implement).
1039 */
1040int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1041		  sk_read_actor_t recv_actor)
1042{
1043	struct sk_buff *skb;
1044	struct tcp_sock *tp = tcp_sk(sk);
1045	u32 seq = tp->copied_seq;
1046	u32 offset;
1047	int copied = 0;
1048
1049	if (sk->sk_state == TCP_LISTEN)
1050		return -ENOTCONN;
1051	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1052		if (offset < skb->len) {
1053			size_t used, len;
1054
1055			len = skb->len - offset;
1056			/* Stop reading if we hit a patch of urgent data */
1057			if (tp->urg_data) {
1058				u32 urg_offset = tp->urg_seq - seq;
1059				if (urg_offset < len)
1060					len = urg_offset;
1061				if (!len)
1062					break;
1063			}
1064			used = recv_actor(desc, skb, offset, len);
1065			if (used <= len) {
1066				seq += used;
1067				copied += used;
1068				offset += used;
1069			}
1070			if (offset != skb->len)
1071				break;
1072		}
1073		if (skb->h.th->fin) {
1074			sk_eat_skb(sk, skb);
1075			++seq;
1076			break;
1077		}
1078		sk_eat_skb(sk, skb);
1079		if (!desc->count)
1080			break;
1081	}
1082	tp->copied_seq = seq;
1083
1084	tcp_rcv_space_adjust(sk);
1085
1086	/* Clean up data we have read: This will do ACK frames. */
1087	if (copied)
1088		cleanup_rbuf(sk, copied);
1089	return copied;
1090}
1091
1092/*
1093 *	This routine copies from a sock struct into the user buffer.
1094 *
1095 *	Technical note: in 2.3 we work on a _locked_ socket, so that
1096 *	tricks with *seq access order and skb->users are not required.
1097 *	Probably, the code can be improved even further.
1098 */
1099
1100int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1101		size_t len, int nonblock, int flags, int *addr_len)
1102{
1103	struct tcp_sock *tp = tcp_sk(sk);
1104	int copied = 0;
1105	u32 peek_seq;
1106	u32 *seq;
1107	unsigned long used;
1108	int err;
1109	int target;		/* Read at least this many bytes */
1110	long timeo;
1111	struct task_struct *user_recv = NULL;
1112
1113	lock_sock(sk);
1114
1115	TCP_CHECK_TIMER(sk);
1116
1117	err = -ENOTCONN;
1118	if (sk->sk_state == TCP_LISTEN)
1119		goto out;
1120
1121	timeo = sock_rcvtimeo(sk, nonblock);
1122
1123	/* Urgent data needs to be handled specially. */
1124	if (flags & MSG_OOB)
1125		goto recv_urg;
1126
1127	seq = &tp->copied_seq;
1128	if (flags & MSG_PEEK) {
1129		peek_seq = tp->copied_seq;
1130		seq = &peek_seq;
1131	}
1132
1133	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1134
1135	do {
1136		struct sk_buff *skb;
1137		u32 offset;
1138
1139		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1140		if (tp->urg_data && tp->urg_seq == *seq) {
1141			if (copied)
1142				break;
1143			if (signal_pending(current)) {
1144				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1145				break;
1146			}
1147		}
1148
1149		/* Next get a buffer. */
1150
1151		skb = skb_peek(&sk->sk_receive_queue);
1152		do {
1153			if (!skb)
1154				break;
1155
1156			/* Now that we have two receive queues this
1157			 * shouldn't happen.
1158			 */
1159			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1160				printk(KERN_INFO "recvmsg bug: copied %X "
1161				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1162				break;
1163			}
1164			offset = *seq - TCP_SKB_CB(skb)->seq;
1165			if (skb->h.th->syn)
1166				offset--;
1167			if (offset < skb->len)
1168				goto found_ok_skb;
1169			if (skb->h.th->fin)
1170				goto found_fin_ok;
1171			BUG_TRAP(flags & MSG_PEEK);
1172			skb = skb->next;
1173		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1174
1175		/* Well, if we have backlog, try to process it now. */
1176
1177		if (copied >= target && !sk->sk_backlog.tail)
1178			break;
1179
1180		if (copied) {
1181			if (sk->sk_err ||
1182			    sk->sk_state == TCP_CLOSE ||
1183			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1184			    !timeo ||
1185			    signal_pending(current) ||
1186			    (flags & MSG_PEEK))
1187				break;
1188		} else {
1189			if (sock_flag(sk, SOCK_DONE))
1190				break;
1191
1192			if (sk->sk_err) {
1193				copied = sock_error(sk);
1194				break;
1195			}
1196
1197			if (sk->sk_shutdown & RCV_SHUTDOWN)
1198				break;
1199
1200			if (sk->sk_state == TCP_CLOSE) {
1201				if (!sock_flag(sk, SOCK_DONE)) {
1202					/* This occurs when the user tries to read
1203					 * from a never-connected socket.
1204					 */
1205					copied = -ENOTCONN;
1206					break;
1207				}
1208				break;
1209			}
1210
1211			if (!timeo) {
1212				copied = -EAGAIN;
1213				break;
1214			}
1215
1216			if (signal_pending(current)) {
1217				copied = sock_intr_errno(timeo);
1218				break;
1219			}
1220		}
1221
1222		cleanup_rbuf(sk, copied);
1223
1224		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1225			/* Install new reader */
1226			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1227				user_recv = current;
1228				tp->ucopy.task = user_recv;
1229				tp->ucopy.iov = msg->msg_iov;
1230			}
1231
1232			tp->ucopy.len = len;
1233
1234			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1235				 (flags & (MSG_PEEK | MSG_TRUNC)));
1236
1237			/* Ugly... If prequeue is not empty, we have to
1238			 * process it before releasing the socket; otherwise
1239			 * the order will be broken at the second iteration.
1240			 * A more elegant solution is required!!!
1241			 *
1242			 * Look: we have the following (pseudo)queues:
1243			 *
1244			 * 1. packets in flight
1245			 * 2. backlog
1246			 * 3. prequeue
1247			 * 4. receive_queue
1248			 *
1249			 * Each queue can be processed only if the next ones
1250			 * are empty. At this point we have empty receive_queue.
1251			 * But prequeue _can_ be not empty after 2nd iteration,
1252			 * when we jumped to start of loop because backlog
1253			 * processing added something to receive_queue.
1254			 * We cannot release_sock(), because backlog contains
1255			 * packets arrived _after_ prequeued ones.
1256			 *
1257			 * Shortly, algorithm is clear --- to process all
1258			 * the queues in order. We could do this more directly,
1259			 * requeueing packets from the backlog to the prequeue if
1260			 * it is not empty. That is more elegant, but eats cycles,
1261			 * unfortunately.
1262			 */
1263			if (!skb_queue_empty(&tp->ucopy.prequeue))
1264				goto do_prequeue;
1265
1266			/* __ Set realtime policy in scheduler __ */
1267		}
1268
1269		if (copied >= target) {
1270			/* Do not sleep, just process backlog. */
1271			release_sock(sk);
1272			lock_sock(sk);
1273		} else
1274			sk_wait_data(sk, &timeo);
1275
1276		if (user_recv) {
1277			int chunk;
1278
1279			/* __ Restore normal policy in scheduler __ */
1280
1281			if ((chunk = len - tp->ucopy.len) != 0) {
1282				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1283				len -= chunk;
1284				copied += chunk;
1285			}
1286
1287			if (tp->rcv_nxt == tp->copied_seq &&
1288			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1289do_prequeue:
1290				tcp_prequeue_process(sk);
1291
1292				if ((chunk = len - tp->ucopy.len) != 0) {
1293					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1294					len -= chunk;
1295					copied += chunk;
1296				}
1297			}
1298		}
1299		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1300			if (net_ratelimit())
1301				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1302				       current->comm, current->pid);
1303			peek_seq = tp->copied_seq;
1304		}
1305		continue;
1306
1307	found_ok_skb:
1308		/* Ok so how much can we use? */
1309		used = skb->len - offset;
1310		if (len < used)
1311			used = len;
1312
1313		/* Do we have urgent data here? */
1314		if (tp->urg_data) {
1315			u32 urg_offset = tp->urg_seq - *seq;
1316			if (urg_offset < used) {
1317				if (!urg_offset) {
1318					if (!sock_flag(sk, SOCK_URGINLINE)) {
1319						++*seq;
1320						offset++;
1321						used--;
1322						if (!used)
1323							goto skip_copy;
1324					}
1325				} else
1326					used = urg_offset;
1327			}
1328		}
1329
1330		if (!(flags & MSG_TRUNC)) {
1331			err = skb_copy_datagram_iovec(skb, offset,
1332						      msg->msg_iov, used);
1333			if (err) {
1334				/* Exception. Bailout! */
1335				if (!copied)
1336					copied = -EFAULT;
1337				break;
1338			}
1339		}
1340
1341		*seq += used;
1342		copied += used;
1343		len -= used;
1344
1345		tcp_rcv_space_adjust(sk);
1346
1347skip_copy:
1348		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1349			tp->urg_data = 0;
1350			tcp_fast_path_check(sk, tp);
1351		}
1352		if (used + offset < skb->len)
1353			continue;
1354
1355		if (skb->h.th->fin)
1356			goto found_fin_ok;
1357		if (!(flags & MSG_PEEK))
1358			sk_eat_skb(sk, skb);
1359		continue;
1360
1361	found_fin_ok:
1362		/* Process the FIN. */
1363		++*seq;
1364		if (!(flags & MSG_PEEK))
1365			sk_eat_skb(sk, skb);
1366		break;
1367	} while (len > 0);
1368
1369	if (user_recv) {
1370		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1371			int chunk;
1372
1373			tp->ucopy.len = copied > 0 ? len : 0;
1374
1375			tcp_prequeue_process(sk);
1376
1377			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1378				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1379				len -= chunk;
1380				copied += chunk;
1381			}
1382		}
1383
1384		tp->ucopy.task = NULL;
1385		tp->ucopy.len = 0;
1386	}
1387
1388	/* According to UNIX98, msg_name/msg_namelen are ignored
1389	 * on a connected socket. I was just happy when I found this 8) --ANK
1390	 */
1391
1392	/* Clean up data we have read: This will do ACK frames. */
1393	cleanup_rbuf(sk, copied);
1394
1395	TCP_CHECK_TIMER(sk);
1396	release_sock(sk);
1397	return copied;
1398
1399out:
1400	TCP_CHECK_TIMER(sk);
1401	release_sock(sk);
1402	return err;
1403
1404recv_urg:
1405	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1406	goto out;
1407}
1408
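/*
 *	Usage note: an illustrative userspace sketch of two flags handled
 *	above ("fd" is assumed connected; "buf" and "len" are a hypothetical
 *	caller buffer).  MSG_PEEK copies data without advancing copied_seq, so
 *	a later plain recv() sees the same bytes again; MSG_WAITALL raises the
 *	"target" so the call keeps sleeping until len bytes, EOF, a signal or
 *	an error.
 *
 *		#include <sys/socket.h>
 *
 *		ssize_t peeked = recv(fd, buf, len, MSG_PEEK);    // non-destructive
 *		ssize_t got    = recv(fd, buf, len, MSG_WAITALL); // wait for len bytes
 */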
1409/*
1410 *	State processing on a close. This implements the state shift for
1411 *	sending our FIN frame. Note that we only send a FIN for some
1412 *	states. A shutdown() may have already sent the FIN, or we may be
1413 *	closed.
1414 */
1415
1416static unsigned char new_state[16] = {
1417  /* current state:        new state:      action:	*/
1418  /* (Invalid)		*/ TCP_CLOSE,
1419  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1420  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1421  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1422  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1423  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1424  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1425  /* TCP_CLOSE		*/ TCP_CLOSE,
1426  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1427  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1428  /* TCP_LISTEN		*/ TCP_CLOSE,
1429  /* TCP_CLOSING	*/ TCP_CLOSING,
1430};
1431
1432static int tcp_close_state(struct sock *sk)
1433{
1434	int next = (int)new_state[sk->sk_state];
1435	int ns = next & TCP_STATE_MASK;
1436
1437	tcp_set_state(sk, ns);
1438
1439	return next & TCP_ACTION_FIN;
1440}
1441
1442/*
1443 *	Shutdown the sending side of a connection. Much like close except
1444 *	that we don't shut down the receive side or set_sock_flag(sk, SOCK_DEAD).
1445 */
1446
1447void tcp_shutdown(struct sock *sk, int how)
1448{
1449	/*	We need to grab some memory, and put together a FIN,
1450	 *	and then put it into the queue to be sent.
1451	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1452	 */
1453	if (!(how & SEND_SHUTDOWN))
1454		return;
1455
1456	/* If we've already sent a FIN, or it's a closed state, skip this. */
1457	if ((1 << sk->sk_state) &
1458	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1459	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1460		/* Clear out any half completed packets.  FIN if needed. */
1461		if (tcp_close_state(sk))
1462			tcp_send_fin(sk);
1463	}
1464}
1465
1466void tcp_close(struct sock *sk, long timeout)
1467{
1468	struct sk_buff *skb;
1469	int data_was_unread = 0;
1470
1471	lock_sock(sk);
1472	sk->sk_shutdown = SHUTDOWN_MASK;
1473
1474	if (sk->sk_state == TCP_LISTEN) {
1475		tcp_set_state(sk, TCP_CLOSE);
1476
1477		/* Special case. */
1478		inet_csk_listen_stop(sk);
1479
1480		goto adjudge_to_death;
1481	}
1482
1483	/*  We need to flush the recv. buffs.  We do this only on the
1484	 *  descriptor close, not protocol-sourced closes, because the
1485	 *  reader process may not have drained the data yet!
1486	 */
1487	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1488		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1489			  skb->h.th->fin;
1490		data_was_unread += len;
1491		__kfree_skb(skb);
1492	}
1493
1494	sk_stream_mem_reclaim(sk);
1495
1496	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1497	 * 3.10, we send a RST here because data was lost.  To
1498	 * witness the awful effects of the old behavior of always
1499	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1500	 * a bulk GET in an FTP client, suspend the process, wait
1501	 * for the client to advertise a zero window, then kill -9
1502	 * the FTP client, wheee...  Note: timeout is always zero
1503	 * in such a case.
1504	 */
1505	if (data_was_unread) {
1506		/* Unread data was tossed, zap the connection. */
1507		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1508		tcp_set_state(sk, TCP_CLOSE);
1509		tcp_send_active_reset(sk, GFP_KERNEL);
1510	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1511		/* Check zero linger _after_ checking for unread data. */
1512		sk->sk_prot->disconnect(sk, 0);
1513		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1514	} else if (tcp_close_state(sk)) {
1515		/* We FIN if the application ate all the data before
1516		 * zapping the connection.
1517		 */
1518
1519		/* RED-PEN. Formally speaking, we have broken TCP state
1520		 * machine. State transitions:
1521		 *
1522		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1523		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1524		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1525		 *
1526		 * are legal only when FIN has been sent (i.e. in window),
1527		 * rather than queued out of window. Purists blame.
1528		 *
1529		 * F.e. "RFC state" is ESTABLISHED,
1530		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1531		 *
1532		 * The visible deviations are that sometimes
1533		 * we enter the time-wait state when it is not really required
1534		 * (harmless), and do not send active resets when they are
1535		 * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1536		 * they look like CLOSING or LAST_ACK to Linux).
1537		 * Probably, I missed some more small holes.
1538		 * 						--ANK
1539		 */
1540		tcp_send_fin(sk);
1541	}
1542
1543	sk_stream_wait_close(sk, timeout);
1544
1545adjudge_to_death:
1546	/* It is the last release_sock in its life. It will remove backlog. */
1547	release_sock(sk);
1548
1549
1550	/* Now socket is owned by kernel and we acquire BH lock
1551	   to finish close. No need to check for user refs.
1552	 */
1553	local_bh_disable();
1554	bh_lock_sock(sk);
1555	BUG_TRAP(!sock_owned_by_user(sk));
1556
1557	sock_hold(sk);
1558	sock_orphan(sk);
1559
1560	/*	This is a (useful) BSD violation of the RFC. There is a
1561	 *	problem with TCP as specified, in that the other end could
1562	 *	keep a socket open forever with no application left at this end.
1563	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1564	 *	our end. If they send after that then tough - BUT it is long enough
1565	 *	that we won't repeat the old "4*rto = almost no time - whoops,
1566	 *	reset" mistake.
1567	 *
1568	 *	Nope, it was not mistake. It is really desired behaviour
1569	 *	f.e. on http servers, when such sockets are useless, but
1570	 *	consume significant resources. Let's do it with special
1571	 *	linger2	option.					--ANK
1572	 */
1573
1574	if (sk->sk_state == TCP_FIN_WAIT2) {
1575		struct tcp_sock *tp = tcp_sk(sk);
1576		if (tp->linger2 < 0) {
1577			tcp_set_state(sk, TCP_CLOSE);
1578			tcp_send_active_reset(sk, GFP_ATOMIC);
1579			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1580		} else {
1581			const int tmo = tcp_fin_time(sk);
1582
1583			if (tmo > TCP_TIMEWAIT_LEN) {
1584				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1585			} else {
1586				atomic_inc(sk->sk_prot->orphan_count);
1587				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1588				goto out;
1589			}
1590		}
1591	}
1592	if (sk->sk_state != TCP_CLOSE) {
1593		sk_stream_mem_reclaim(sk);
1594		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1595		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1596		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1597			if (net_ratelimit())
1598				printk(KERN_INFO "TCP: too many orphaned "
1599				       "sockets\n");
1600			tcp_set_state(sk, TCP_CLOSE);
1601			tcp_send_active_reset(sk, GFP_ATOMIC);
1602			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1603		}
1604	}
1605	atomic_inc(sk->sk_prot->orphan_count);
1606
1607	if (sk->sk_state == TCP_CLOSE)
1608		inet_csk_destroy_sock(sk);
1609	/* Otherwise, socket is reprieved until protocol close. */
1610
1611out:
1612	bh_unlock_sock(sk);
1613	local_bh_enable();
1614	sock_put(sk);
1615}
1616
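/*
 *	Usage note: the zero-linger branch above is what userspace reaches via
 *	SO_LINGER (illustrative sketch, "fd" assumed connected).  With
 *	l_onoff = 1 and l_linger = 0, close() discards queued data and aborts
 *	with a RST instead of a FIN - the RFC 793 ABORT call noted in the
 *	changelog at the top of this file.
 *
 *		#include <sys/socket.h>
 *		#include <unistd.h>
 *
 *		struct linger lin = { .l_onoff = 1, .l_linger = 0 };
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
 *		close(fd);	// sends RST, no TIME_WAIT
 */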
1617/* These states need RST on ABORT according to RFC793 */
1618
1619static inline int tcp_need_reset(int state)
1620{
1621	return (1 << state) &
1622	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1623		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1624}
1625
1626int tcp_disconnect(struct sock *sk, int flags)
1627{
1628	struct inet_sock *inet = inet_sk(sk);
1629	struct inet_connection_sock *icsk = inet_csk(sk);
1630	struct tcp_sock *tp = tcp_sk(sk);
1631	int err = 0;
1632	int old_state = sk->sk_state;
1633
1634	if (old_state != TCP_CLOSE)
1635		tcp_set_state(sk, TCP_CLOSE);
1636
1637	/* ABORT function of RFC793 */
1638	if (old_state == TCP_LISTEN) {
1639		inet_csk_listen_stop(sk);
1640	} else if (tcp_need_reset(old_state) ||
1641		   (tp->snd_nxt != tp->write_seq &&
1642		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1643		/* The last check adjusts for the discrepancy between Linux and
1644		 * the RFC states.
1645		 */
1646		tcp_send_active_reset(sk, gfp_any());
1647		sk->sk_err = ECONNRESET;
1648	} else if (old_state == TCP_SYN_SENT)
1649		sk->sk_err = ECONNRESET;
1650
1651	tcp_clear_xmit_timers(sk);
1652	__skb_queue_purge(&sk->sk_receive_queue);
1653	sk_stream_writequeue_purge(sk);
1654	__skb_queue_purge(&tp->out_of_order_queue);
1655
1656	inet->dport = 0;
1657
1658	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1659		inet_reset_saddr(sk);
1660
1661	sk->sk_shutdown = 0;
1662	sock_reset_flag(sk, SOCK_DONE);
1663	tp->srtt = 0;
1664	if ((tp->write_seq += tp->max_window + 2) == 0)
1665		tp->write_seq = 1;
1666	icsk->icsk_backoff = 0;
1667	tp->snd_cwnd = 2;
1668	icsk->icsk_probes_out = 0;
1669	tp->packets_out = 0;
1670	tp->snd_ssthresh = 0x7fffffff;
1671	tp->snd_cwnd_cnt = 0;
1672	tp->bytes_acked = 0;
1673	tcp_set_ca_state(sk, TCP_CA_Open);
1674	tcp_clear_retrans(tp);
1675	inet_csk_delack_init(sk);
1676	sk->sk_send_head = NULL;
1677	tp->rx_opt.saw_tstamp = 0;
1678	tcp_sack_reset(&tp->rx_opt);
1679	__sk_dst_reset(sk);
1680
1681	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1682
1683	sk->sk_error_report(sk);
1684	return err;
1685}
1686
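/*
 *	Usage note: userspace typically reaches tcp_disconnect() by calling
 *	connect() with sa_family = AF_UNSPEC, which returns the socket to
 *	TCP_CLOSE so it can be reused (illustrative sketch, "fd" assumed to be
 *	an existing TCP socket):
 *
 *		#include <sys/socket.h>
 *		#include <string.h>
 *
 *		struct sockaddr unspec;
 *
 *		memset(&unspec, 0, sizeof(unspec));
 *		unspec.sa_family = AF_UNSPEC;
 *		connect(fd, &unspec, sizeof(unspec));	// abort/disconnect
 */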
1687/*
1688 *	Socket option code for TCP.
1689 */
1690int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1691		   int optlen)
1692{
1693	struct tcp_sock *tp = tcp_sk(sk);
1694	struct inet_connection_sock *icsk = inet_csk(sk);
1695	int val;
1696	int err = 0;
1697
1698	if (level != SOL_TCP)
1699		return tp->af_specific->setsockopt(sk, level, optname,
1700						   optval, optlen);
1701
1702	/* This is a string value all the others are int's */
1703	if (optname == TCP_CONGESTION) {
1704		char name[TCP_CA_NAME_MAX];
1705
1706		if (optlen < 1)
1707			return -EINVAL;
1708
1709		val = strncpy_from_user(name, optval,
1710					min(TCP_CA_NAME_MAX-1, optlen));
1711		if (val < 0)
1712			return -EFAULT;
1713		name[val] = 0;
1714
1715		lock_sock(sk);
1716		err = tcp_set_congestion_control(sk, name);
1717		release_sock(sk);
1718		return err;
1719	}
1720
1721	if (optlen < sizeof(int))
1722		return -EINVAL;
1723
1724	if (get_user(val, (int __user *)optval))
1725		return -EFAULT;
1726
1727	lock_sock(sk);
1728
1729	switch (optname) {
1730	case TCP_MAXSEG:
1731		/* Values greater than interface MTU won't take effect. However
1732		 * at the point when this call is done we typically don't yet
1733		 * know which interface is going to be used */
1734		if (val < 8 || val > MAX_TCP_WINDOW) {
1735			err = -EINVAL;
1736			break;
1737		}
1738		tp->rx_opt.user_mss = val;
1739		break;
1740
1741	case TCP_NODELAY:
1742		if (val) {
1743			/* TCP_NODELAY is weaker than TCP_CORK, so that
1744			 * this option on corked socket is remembered, but
1745			 * it is not activated until cork is cleared.
1746			 *
1747			 * However, when TCP_NODELAY is set we make
1748			 * an explicit push, which overrides even TCP_CORK
1749			 * for currently queued segments.
1750			 */
1751			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1752			tcp_push_pending_frames(sk, tp);
1753		} else {
1754			tp->nonagle &= ~TCP_NAGLE_OFF;
1755		}
1756		break;
1757
1758	case TCP_CORK:
1759		/* When set indicates to always queue non-full frames.
1760		 * Later the user clears this option and we transmit
1761		 * any pending partial frames in the queue.  This is
1762		 * meant to be used alongside sendfile() to get properly
1763		 * filled frames when the user (for example) must write
1764		 * out headers with a write() call first and then use
1765		 * sendfile to send out the data parts.
1766		 *
1767		 * TCP_CORK can be set together with TCP_NODELAY and it is
1768		 * stronger than TCP_NODELAY.
1769		 */
1770		if (val) {
1771			tp->nonagle |= TCP_NAGLE_CORK;
1772		} else {
1773			tp->nonagle &= ~TCP_NAGLE_CORK;
1774			if (tp->nonagle&TCP_NAGLE_OFF)
1775				tp->nonagle |= TCP_NAGLE_PUSH;
1776			tcp_push_pending_frames(sk, tp);
1777		}
1778		break;
1779
1780	case TCP_KEEPIDLE:
1781		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1782			err = -EINVAL;
1783		else {
1784			tp->keepalive_time = val * HZ;
1785			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1786			    !((1 << sk->sk_state) &
1787			      (TCPF_CLOSE | TCPF_LISTEN))) {
1788				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1789				if (tp->keepalive_time > elapsed)
1790					elapsed = tp->keepalive_time - elapsed;
1791				else
1792					elapsed = 0;
1793				inet_csk_reset_keepalive_timer(sk, elapsed);
1794			}
1795		}
1796		break;
1797	case TCP_KEEPINTVL:
1798		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1799			err = -EINVAL;
1800		else
1801			tp->keepalive_intvl = val * HZ;
1802		break;
1803	case TCP_KEEPCNT:
1804		if (val < 1 || val > MAX_TCP_KEEPCNT)
1805			err = -EINVAL;
1806		else
1807			tp->keepalive_probes = val;
1808		break;
1809	case TCP_SYNCNT:
1810		if (val < 1 || val > MAX_TCP_SYNCNT)
1811			err = -EINVAL;
1812		else
1813			icsk->icsk_syn_retries = val;
1814		break;
1815
1816	case TCP_LINGER2:
1817		if (val < 0)
1818			tp->linger2 = -1;
1819		else if (val > sysctl_tcp_fin_timeout / HZ)
1820			tp->linger2 = 0;
1821		else
1822			tp->linger2 = val * HZ;
1823		break;
1824
1825	case TCP_DEFER_ACCEPT:
1826		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1827		if (val > 0) {
1828			/* Translate the value in seconds into a number of
1829			 * retransmits. */
1830			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1831			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1832				       icsk->icsk_accept_queue.rskq_defer_accept))
1833				icsk->icsk_accept_queue.rskq_defer_accept++;
1834			icsk->icsk_accept_queue.rskq_defer_accept++;
1835		}
1836		break;
1837
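	/* Worked example of the translation above, assuming TCP_TIMEOUT_INIT
	 * is 3*HZ (a 3 second initial RTO): a request of 10 seconds is checked
	 * against the backoff series 3, 6, 12, ... seconds.  The loop stops
	 * once 10 <= 12 with rskq_defer_accept == 2, and the final increment
	 * stores 3; getsockopt(TCP_DEFER_ACCEPT) later reports this back as
	 * 3 << (3 - 1) = 12 seconds.  Sketch (lsock is a placeholder for a
	 * listening socket):
	 *
	 *	int secs = 10;
	 *	setsockopt(lsock, IPPROTO_TCP, TCP_DEFER_ACCEPT,
	 *		   &secs, sizeof(secs));
	 */
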
1838	case TCP_WINDOW_CLAMP:
1839		if (!val) {
1840			if (sk->sk_state != TCP_CLOSE) {
1841				err = -EINVAL;
1842				break;
1843			}
1844			tp->window_clamp = 0;
1845		} else
1846			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1847						SOCK_MIN_RCVBUF / 2 : val;
1848		break;
1849
1850	case TCP_QUICKACK:
1851		if (!val) {
1852			icsk->icsk_ack.pingpong = 1;
1853		} else {
1854			icsk->icsk_ack.pingpong = 0;
1855			if ((1 << sk->sk_state) &
1856			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1857			    inet_csk_ack_scheduled(sk)) {
1858				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1859				cleanup_rbuf(sk, 1);
1860				if (!(val & 1))
1861					icsk->icsk_ack.pingpong = 1;
1862			}
1863		}
1864		break;
1865
1866	default:
1867		err = -ENOPROTOOPT;
1868		break;
1869	}
1870	release_sock(sk);
1871	return err;
1872}
1873
1874	/* Return information about the state of a TCP endpoint in API format. */
1875void tcp_get_info(struct sock *sk, struct tcp_info *info)
1876{
1877	struct tcp_sock *tp = tcp_sk(sk);
1878	const struct inet_connection_sock *icsk = inet_csk(sk);
1879	u32 now = tcp_time_stamp;
1880
1881	memset(info, 0, sizeof(*info));
1882
1883	info->tcpi_state = sk->sk_state;
1884	info->tcpi_ca_state = icsk->icsk_ca_state;
1885	info->tcpi_retransmits = icsk->icsk_retransmits;
1886	info->tcpi_probes = icsk->icsk_probes_out;
1887	info->tcpi_backoff = icsk->icsk_backoff;
1888
1889	if (tp->rx_opt.tstamp_ok)
1890		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1891	if (tp->rx_opt.sack_ok)
1892		info->tcpi_options |= TCPI_OPT_SACK;
1893	if (tp->rx_opt.wscale_ok) {
1894		info->tcpi_options |= TCPI_OPT_WSCALE;
1895		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1896		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1897	}
1898
1899	if (tp->ecn_flags&TCP_ECN_OK)
1900		info->tcpi_options |= TCPI_OPT_ECN;
1901
1902	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1903	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1904	info->tcpi_snd_mss = tp->mss_cache;
1905	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1906
1907	info->tcpi_unacked = tp->packets_out;
1908	info->tcpi_sacked = tp->sacked_out;
1909	info->tcpi_lost = tp->lost_out;
1910	info->tcpi_retrans = tp->retrans_out;
1911	info->tcpi_fackets = tp->fackets_out;
1912
1913	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1914	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1915	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1916
1917	info->tcpi_pmtu = tp->pmtu_cookie;
1918	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1919	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1920	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1921	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1922	info->tcpi_snd_cwnd = tp->snd_cwnd;
1923	info->tcpi_advmss = tp->advmss;
1924	info->tcpi_reordering = tp->reordering;
1925
1926	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1927	info->tcpi_rcv_space = tp->rcvq_space.space;
1928
1929	info->tcpi_total_retrans = tp->total_retrans;
1930}
1931
1932EXPORT_SYMBOL_GPL(tcp_get_info);
1933
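/* User-space consumers reach tcp_get_info() through getsockopt(TCP_INFO),
 * handled in the TCP_INFO case below, which copies out at most
 * sizeof(struct tcp_info) bytes.  A minimal sketch reading a few of the
 * fields filled in above; tcpi_rtt is in microseconds and tcpi_snd_cwnd
 * is in segments:
 *
 *	#include <linux/tcp.h>
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("srtt %u us, cwnd %u segs, retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 */
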
1934int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1935		   int __user *optlen)
1936{
1937	struct inet_connection_sock *icsk = inet_csk(sk);
1938	struct tcp_sock *tp = tcp_sk(sk);
1939	int val, len;
1940
1941	if (level != SOL_TCP)
1942		return tp->af_specific->getsockopt(sk, level, optname,
1943						   optval, optlen);
1944
1945	if (get_user(len, optlen))
1946		return -EFAULT;
1947
1948	len = min_t(unsigned int, len, sizeof(int));
1949
1950	if (len < 0)
1951		return -EINVAL;
1952
1953	switch (optname) {
1954	case TCP_MAXSEG:
1955		val = tp->mss_cache;
1956		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1957			val = tp->rx_opt.user_mss;
1958		break;
1959	case TCP_NODELAY:
1960		val = !!(tp->nonagle&TCP_NAGLE_OFF);
1961		break;
1962	case TCP_CORK:
1963		val = !!(tp->nonagle&TCP_NAGLE_CORK);
1964		break;
1965	case TCP_KEEPIDLE:
1966		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1967		break;
1968	case TCP_KEEPINTVL:
1969		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1970		break;
1971	case TCP_KEEPCNT:
1972		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1973		break;
1974	case TCP_SYNCNT:
1975		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1976		break;
1977	case TCP_LINGER2:
1978		val = tp->linger2;
1979		if (val >= 0)
1980			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1981		break;
1982	case TCP_DEFER_ACCEPT:
1983		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1984			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1985		break;
1986	case TCP_WINDOW_CLAMP:
1987		val = tp->window_clamp;
1988		break;
1989	case TCP_INFO: {
1990		struct tcp_info info;
1991
1992		if (get_user(len, optlen))
1993			return -EFAULT;
1994
1995		tcp_get_info(sk, &info);
1996
1997		len = min_t(unsigned int, len, sizeof(info));
1998		if (put_user(len, optlen))
1999			return -EFAULT;
2000		if (copy_to_user(optval, &info, len))
2001			return -EFAULT;
2002		return 0;
2003	}
2004	case TCP_QUICKACK:
2005		val = !icsk->icsk_ack.pingpong;
2006		break;
2007
2008	case TCP_CONGESTION:
2009		if (get_user(len, optlen))
2010			return -EFAULT;
2011		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2012		if (put_user(len, optlen))
2013			return -EFAULT;
2014		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2015			return -EFAULT;
2016		return 0;
2017	default:
2018		return -ENOPROTOOPT;
2019	}
2020
2021	if (put_user(len, optlen))
2022		return -EFAULT;
2023	if (copy_to_user(optval, &val, len))
2024		return -EFAULT;
2025	return 0;
2026}
2027
2028
2029extern void __skb_cb_too_small_for_tcp(int, int);
2030extern struct tcp_congestion_ops tcp_reno;
2031
2032static __initdata unsigned long thash_entries;
2033static int __init set_thash_entries(char *str)
2034{
2035	if (!str)
2036		return 0;
2037	thash_entries = simple_strtoul(str, &str, 0);
2038	return 1;
2039}
2040__setup("thash_entries=", set_thash_entries);
2041
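/* thash_entries= is a kernel boot parameter: passing, for example,
 * "thash_entries=131072" on the command line makes tcp_init() below ask
 * alloc_large_system_hash() for that many established-hash entries instead
 * of letting it size the table from available memory (131072 is just an
 * illustrative value).
 */
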
2042void __init tcp_init(void)
2043{
2044	struct sk_buff *skb = NULL;
2045	int order, i;
2046
2047	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2048		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2049					   sizeof(skb->cb));
2050
2051	tcp_hashinfo.bind_bucket_cachep =
2052		kmem_cache_create("tcp_bind_bucket",
2053				  sizeof(struct inet_bind_bucket), 0,
2054				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2055	if (!tcp_hashinfo.bind_bucket_cachep)
2056		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2057
2058	/* Size and allocate the main established and bind bucket
2059	 * hash tables.
2060	 *
2061	 * The methodology is similar to that of the buffer cache.
2062	 */
2063	tcp_hashinfo.ehash =
2064		alloc_large_system_hash("TCP established",
2065					sizeof(struct inet_ehash_bucket),
2066					thash_entries,
2067					(num_physpages >= 128 * 1024) ?
2068					13 : 15,
2069					HASH_HIGHMEM,
2070					&tcp_hashinfo.ehash_size,
2071					NULL,
2072					0);
2073	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2074	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2075		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2076		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2077	}
2078
2079	tcp_hashinfo.bhash =
2080		alloc_large_system_hash("TCP bind",
2081					sizeof(struct inet_bind_hashbucket),
2082					tcp_hashinfo.ehash_size,
2083					(num_physpages >= 128 * 1024) ?
2084					13 : 15,
2085					HASH_HIGHMEM,
2086					&tcp_hashinfo.bhash_size,
2087					NULL,
2088					64 * 1024);
2089	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2090	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2091		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2092		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2093	}
2094
2095	/* Try to be a bit smarter and adjust defaults depending
2096	 * on available memory.
2097	 */
2098	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2099			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2100			order++)
2101		;
2102	if (order >= 4) {
2103		sysctl_local_port_range[0] = 32768;
2104		sysctl_local_port_range[1] = 61000;
2105		tcp_death_row.sysctl_max_tw_buckets = 180000;
2106		sysctl_tcp_max_orphans = 4096 << (order - 4);
2107		sysctl_max_syn_backlog = 1024;
2108	} else if (order < 3) {
2109		sysctl_local_port_range[0] = 1024 * (3 - order);
2110		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2111		sysctl_tcp_max_orphans >>= (3 - order);
2112		sysctl_max_syn_backlog = 128;
2113	}
2114
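	/* Worked example of the sizing above, assuming 4 KiB pages and an
	 * 8 byte struct inet_bind_hashbucket (one spinlock plus one hlist
	 * head, no debug options): with bhash_size == 65536 the bind table
	 * occupies 512 KiB, the loop leaves order == 7, and since order >= 4
	 * the box gets the 32768-61000 ephemeral port range, 180000 time-wait
	 * buckets, 4096 << 3 == 32768 allowed orphans and a SYN backlog of
	 * 1024; sysctl_tcp_mem below then becomes { 768 << 7, 1024 << 7,
	 * 1536 << 7 } pages.
	 */
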
2115	sysctl_tcp_mem[0] =  768 << order;
2116	sysctl_tcp_mem[1] = 1024 << order;
2117	sysctl_tcp_mem[2] = 1536 << order;
2118
2119	if (order < 3) {
2120		sysctl_tcp_wmem[2] = 64 * 1024;
2121		sysctl_tcp_rmem[0] = PAGE_SIZE;
2122		sysctl_tcp_rmem[1] = 43689;
2123		sysctl_tcp_rmem[2] = 2 * 43689;
2124	}
2125
2126	printk(KERN_INFO "TCP: Hash tables configured "
2127	       "(established %d bind %d)\n",
2128	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2129
2130	tcp_register_congestion_control(&tcp_reno);
2131}
2132
2133EXPORT_SYMBOL(tcp_close);
2134EXPORT_SYMBOL(tcp_disconnect);
2135EXPORT_SYMBOL(tcp_getsockopt);
2136EXPORT_SYMBOL(tcp_ioctl);
2137EXPORT_SYMBOL(tcp_poll);
2138EXPORT_SYMBOL(tcp_read_sock);
2139EXPORT_SYMBOL(tcp_recvmsg);
2140EXPORT_SYMBOL(tcp_sendmsg);
2141EXPORT_SYMBOL(tcp_sendpage);
2142EXPORT_SYMBOL(tcp_setsockopt);
2143EXPORT_SYMBOL(tcp_shutdown);
2144EXPORT_SYMBOL(tcp_statistics);
2145