tcp.c revision c65f7f00c587828e3d50737805a78f74804972de
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken;
29 *					pointers passed were wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFCs. For other useful protocol
138 *					references see Comer and KA9Q NOS; for
139 *					a reference on the difference between
140 *					the specifications and how BSD works,
141 *					see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up of retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK	our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
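 *
 *	For reference, the usual transition paths are:
 *
 *	  active close:		ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 ->
 *				TIME_WAIT -> CLOSE
 *
 *	  passive close:	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 *	  simultaneous close:	FIN_WAIT1 -> CLOSING -> TIME_WAIT -> CLOSE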
248 */
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non-atomically.
296 * All of sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305	if (!tcp_memory_pressure) {
306		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307		tcp_memory_pressure = 1;
308	}
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317					       poll_table *wait)
318{
319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 *	Wait for a TCP event.
324 *
325 *	Note that we don't need to lock the socket, as the upper poll layers
326 *	take care of normal races (between the test and the event) and we don't
327 *	go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331	unsigned int mask;
332	struct sock *sk = sock->sk;
333	struct tcp_sock *tp = tcp_sk(sk);
334
335	poll_wait(file, sk->sk_sleep, wait);
336	if (sk->sk_state == TCP_LISTEN)
337		return tcp_listen_poll(sk, wait);
338
339	/* Socket is not locked. We are protected from async events
340	   by the poll logic; correct handling of state changes
341	   made by other threads is impossible in any case.
342	 */
343
344	mask = 0;
345	if (sk->sk_err)
346		mask = POLLERR;
347
348	/*
349	 * POLLHUP is certainly not done right. But poll() doesn't
350	 * have a notion of HUP in just one direction, and for a
351	 * socket the read side is more interesting.
352	 *
353	 * Some poll() documentation says that POLLHUP is incompatible
354	 * with the POLLOUT/POLLWRNORM flags, so somebody should check this
355	 * all. But careful, it tends to be safer to return too many
356	 * bits than too few, and you can easily break real applications
357	 * if you don't tell them that something has hung up!
358	 *
359	 * Check-me.
360	 *
361	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362	 * our fs/select.c). It means that after we received EOF,
363	 * poll always returns immediately, making it impossible to poll()
364	 * for write readiness in state CLOSE_WAIT. One evident solution is
365	 * to set POLLHUP if and only if shutdown has been made in both
366	 * directions. Actually, it is interesting to look at how Solaris
367	 * and DUX solve this dilemma. I would prefer POLLHUP to be maskable;
368	 * then we could set it on SND_SHUTDOWN. BTW the examples given
369	 * in Stevens' books assume exactly this behaviour, which explains
370	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371	 *
372	 * NOTE. A check for TCP_CLOSE has been added. The goal is to prevent
373	 * blocking on a fresh, not-connected or disconnected socket. --ANK
374	 */
375	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376		mask |= POLLHUP;
377	if (sk->sk_shutdown & RCV_SHUTDOWN)
378		mask |= POLLIN | POLLRDNORM;
379
380	/* Connected? */
381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382		/* Potential race condition. If the read of tp below is
383		 * reordered above the sk->sk_state check, we can be illegally
384		 * awakened in SYN_* states. */
385		if ((tp->rcv_nxt != tp->copied_seq) &&
386		    (tp->urg_seq != tp->copied_seq ||
387		     tp->rcv_nxt != tp->copied_seq + 1 ||
388		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389			mask |= POLLIN | POLLRDNORM;
390
391		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393				mask |= POLLOUT | POLLWRNORM;
394			} else {  /* send SIGIO later */
395				set_bit(SOCK_ASYNC_NOSPACE,
396					&sk->sk_socket->flags);
397				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399				/* Race breaker. If space is freed after
400				 * wspace test but before the flags are set,
401				 * IO signal will be lost.
402				 */
403				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404					mask |= POLLOUT | POLLWRNORM;
405			}
406		}
407
408		if (tp->urg_data & TCP_URG_VALID)
409			mask |= POLLPRI;
410	}
411	return mask;
412}
413
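/*
 *	ioctl() handlers: SIOCINQ returns the number of unread bytes queued
 *	(stopping at urgent data unless SO_OOBINLINE is set), SIOCATMARK
 *	tests whether the read pointer sits at the urgent mark, and SIOCOUTQ
 *	returns the number of bytes not yet acknowledged by the peer.
 *
 *	A minimal user-space sketch (fd is assumed to be a connected TCP
 *	socket; illustration only):
 *
 *		int unread;
 *		if (ioctl(fd, SIOCINQ, &unread) == 0)
 *			printf("%d bytes ready to read\n", unread);
 */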
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416	struct tcp_sock *tp = tcp_sk(sk);
417	int answ;
418
419	switch (cmd) {
420	case SIOCINQ:
421		if (sk->sk_state == TCP_LISTEN)
422			return -EINVAL;
423
424		lock_sock(sk);
425		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426			answ = 0;
427		else if (sock_flag(sk, SOCK_URGINLINE) ||
428			 !tp->urg_data ||
429			 before(tp->urg_seq, tp->copied_seq) ||
430			 !before(tp->urg_seq, tp->rcv_nxt)) {
431			answ = tp->rcv_nxt - tp->copied_seq;
432
433			/* Subtract 1, if FIN is in queue. */
434			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435				answ -=
436		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437		} else
438			answ = tp->urg_seq - tp->copied_seq;
439		release_sock(sk);
440		break;
441	case SIOCATMARK:
442		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443		break;
444	case SIOCOUTQ:
445		if (sk->sk_state == TCP_LISTEN)
446			return -EINVAL;
447
448		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449			answ = 0;
450		else
451			answ = tp->write_seq - tp->snd_una;
452		break;
453	default:
454		return -ENOIOCTLCMD;
455	};
456
457	return put_user(answ, (int __user *)arg);
458}
459
460
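/*
 *	Move a socket into the listening state: allocate the SYN queue,
 *	mark the socket TCP_LISTEN and hash it once the local port has
 *	been validated by get_port().
 */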
461int tcp_listen_start(struct sock *sk)
462{
463	struct inet_sock *inet = inet_sk(sk);
464	struct tcp_sock *tp = tcp_sk(sk);
465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467	if (rc != 0)
468		return rc;
469
470	sk->sk_max_ack_backlog = 0;
471	sk->sk_ack_backlog = 0;
472	tcp_delack_init(tp);
473
474	/* There is a race window here: we announce ourselves as listening,
475	 * but this transition is not yet validated by get_port().
476	 * It is OK, because this socket enters the hash table only
477	 * after validation is complete.
478	 */
479	sk->sk_state = TCP_LISTEN;
480	if (!sk->sk_prot->get_port(sk, inet->num)) {
481		inet->sport = htons(inet->num);
482
483		sk_dst_reset(sk);
484		sk->sk_prot->hash(sk);
485
486		return 0;
487	}
488
489	sk->sk_state = TCP_CLOSE;
490	reqsk_queue_destroy(&tp->accept_queue);
491	return -EADDRINUSE;
492}
493
494/*
495 *	This routine closes sockets which have been at least partially
496 *	opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501	struct tcp_sock *tp = tcp_sk(sk);
502	struct listen_sock *lopt;
503	struct request_sock *acc_req;
504	struct request_sock *req;
505	int i;
506
507	tcp_delete_keepalive_timer(sk);
508
509	/* make all the listen_opt local to us */
510	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513	if (lopt->qlen) {
514		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515			while ((req = lopt->syn_table[i]) != NULL) {
516				lopt->syn_table[i] = req->dl_next;
517				lopt->qlen--;
518				reqsk_free(req);
519
520		/* Following the specs, it would be better either to send a FIN
521		 * (and enter FIN-WAIT-1; that is a normal close)
522		 * or to send an active reset (abort).
523		 * Certainly, it is pretty dangerous during a SYN flood, but that
524		 * is a bad justification for our negligence 8)
525		 * To be honest, we are not able to implement either
526		 * of the variants now.			--ANK
527		 */
528			}
529		}
530	}
531	BUG_TRAP(!lopt->qlen);
532
533	kfree(lopt);
534
535	while ((req = acc_req) != NULL) {
536		struct sock *child = req->sk;
537
538		acc_req = req->dl_next;
539
540		local_bh_disable();
541		bh_lock_sock(child);
542		BUG_TRAP(!sock_owned_by_user(child));
543		sock_hold(child);
544
545		tcp_disconnect(child, O_NONBLOCK);
546
547		sock_orphan(child);
548
549		atomic_inc(&tcp_orphan_count);
550
551		tcp_destroy_sock(child);
552
553		bh_unlock_sock(child);
554		local_bh_enable();
555		sock_put(child);
556
557		sk_acceptq_removed(sk);
558		__reqsk_free(req);
559	}
560	BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{
565	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566	tp->pushed_seq = tp->write_seq;
567}
568
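/* Force a push once the unpushed data exceeds half of the largest
 * window the peer has ever advertised.
 */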
569static inline int forced_push(struct tcp_sock *tp)
570{
571	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572}
573
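/* Queue a freshly allocated skb at the tail of the write queue, charge
 * it to the socket and make it the send head if nothing else is pending.
 */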
574static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575			      struct sk_buff *skb)
576{
577	skb->csum = 0;
578	TCP_SKB_CB(skb)->seq = tp->write_seq;
579	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581	TCP_SKB_CB(skb)->sacked = 0;
582	skb_header_release(skb);
583	__skb_queue_tail(&sk->sk_write_queue, skb);
584	sk_charge_skb(sk, skb);
585	if (!sk->sk_send_head)
586		sk->sk_send_head = skb;
587	else if (tp->nonagle&TCP_NAGLE_PUSH)
588		tp->nonagle &= ~TCP_NAGLE_PUSH;
589}
590
591static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592				struct sk_buff *skb)
593{
594	if (flags & MSG_OOB) {
595		tp->urg_mode = 1;
596		tp->snd_up = tp->write_seq;
597		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598	}
599}
600
601static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602			    int mss_now, int nonagle)
603{
604	if (sk->sk_send_head) {
605		struct sk_buff *skb = sk->sk_write_queue.prev;
606		if (!(flags & MSG_MORE) || forced_push(tp))
607			tcp_mark_push(tp, skb);
608		tcp_mark_urg(tp, flags, skb);
609		__tcp_push_pending_frames(sk, tp, mss_now,
610					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611	}
612}
613
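/*
 *	Transmit data described by an array of pages without copying it:
 *	page references are attached to skbs as fragments, so the route
 *	must support scatter/gather and hardware checksumming (the caller,
 *	tcp_sendpage(), verifies this before getting here).
 */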
614static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615			 size_t psize, int flags)
616{
617	struct tcp_sock *tp = tcp_sk(sk);
618	int mss_now;
619	int err;
620	ssize_t copied;
621	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623	/* Wait for a connection to finish. */
624	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626			goto out_err;
627
628	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631	copied = 0;
632
633	err = -EPIPE;
634	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635		goto do_error;
636
637	while (psize > 0) {
638		struct sk_buff *skb = sk->sk_write_queue.prev;
639		struct page *page = pages[poffset / PAGE_SIZE];
640		int copy, i, can_coalesce;
641		int offset = poffset % PAGE_SIZE;
642		int size = min_t(size_t, psize, PAGE_SIZE - offset);
643
644		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645new_segment:
646			if (!sk_stream_memory_free(sk))
647				goto wait_for_sndbuf;
648
649			skb = sk_stream_alloc_pskb(sk, 0, 0,
650						   sk->sk_allocation);
651			if (!skb)
652				goto wait_for_memory;
653
654			skb_entail(sk, tp, skb);
655			copy = mss_now;
656		}
657
658		if (copy > size)
659			copy = size;
660
661		i = skb_shinfo(skb)->nr_frags;
662		can_coalesce = skb_can_coalesce(skb, i, page, offset);
663		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664			tcp_mark_push(tp, skb);
665			goto new_segment;
666		}
667		if (sk->sk_forward_alloc < copy &&
668		    !sk_stream_mem_schedule(sk, copy, 0))
669			goto wait_for_memory;
670
671		if (can_coalesce) {
672			skb_shinfo(skb)->frags[i - 1].size += copy;
673		} else {
674			get_page(page);
675			skb_fill_page_desc(skb, i, page, offset, copy);
676		}
677
678		skb->len += copy;
679		skb->data_len += copy;
680		skb->truesize += copy;
681		sk->sk_wmem_queued += copy;
682		sk->sk_forward_alloc -= copy;
683		skb->ip_summed = CHECKSUM_HW;
684		tp->write_seq += copy;
685		TCP_SKB_CB(skb)->end_seq += copy;
686		skb_shinfo(skb)->tso_segs = 0;
687
688		if (!copied)
689			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690
691		copied += copy;
692		poffset += copy;
693		if (!(psize -= copy))
694			goto out;
695
696		if (skb->len != mss_now || (flags & MSG_OOB))
697			continue;
698
699		if (forced_push(tp)) {
700			tcp_mark_push(tp, skb);
701			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702		} else if (skb == sk->sk_send_head)
703			tcp_push_one(sk, mss_now);
704		continue;
705
706wait_for_sndbuf:
707		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708wait_for_memory:
709		if (copied)
710			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711
712		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713			goto do_error;
714
715		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716	}
717
718out:
719	if (copied)
720		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721	return copied;
722
723do_error:
724	if (copied)
725		goto out;
726out_err:
727	return sk_stream_error(sk, flags, err);
728}
729
730ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731		     size_t size, int flags)
732{
733	ssize_t res;
734	struct sock *sk = sock->sk;
735
736#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737
738	if (!(sk->sk_route_caps & NETIF_F_SG) ||
739	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740		return sock_no_sendpage(sock, page, offset, size, flags);
741
742#undef TCP_ZC_CSUM_FLAGS
743
744	lock_sock(sk);
745	TCP_CHECK_TIMER(sk);
746	res = do_tcp_sendpages(sk, &page, offset, size, flags);
747	TCP_CHECK_TIMER(sk);
748	release_sock(sk);
749	return res;
750}
751
752#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
753#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
754
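/* Size of the linear part of a new skb: a full MSS when the route cannot
 * do scatter/gather, otherwise 0 so that data lands in page fragments.
 */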
755static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{
757	int tmp = tp->mss_cache_std;
758
759	if (sk->sk_route_caps & NETIF_F_SG)
760		tmp = 0;
761
762	return tmp;
763}
764
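/*
 *	Copy data from user space into the socket write queue.  Data goes
 *	into the linear area while there is tailroom, then into page
 *	fragments when the device can do scatter/gather; segments are
 *	pushed out once they reach a full MSS or forced_push() fires.
 */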
765int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
766		size_t size)
767{
768	struct iovec *iov;
769	struct tcp_sock *tp = tcp_sk(sk);
770	struct sk_buff *skb;
771	int iovlen, flags;
772	int mss_now;
773	int err, copied;
774	long timeo;
775
776	lock_sock(sk);
777	TCP_CHECK_TIMER(sk);
778
779	flags = msg->msg_flags;
780	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
781
782	/* Wait for a connection to finish. */
783	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
784		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
785			goto out_err;
786
787	/* This should be in poll */
788	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
789
790	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
791
792	/* Ok commence sending. */
793	iovlen = msg->msg_iovlen;
794	iov = msg->msg_iov;
795	copied = 0;
796
797	err = -EPIPE;
798	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
799		goto do_error;
800
801	while (--iovlen >= 0) {
802		int seglen = iov->iov_len;
803		unsigned char __user *from = iov->iov_base;
804
805		iov++;
806
807		while (seglen > 0) {
808			int copy;
809
810			skb = sk->sk_write_queue.prev;
811
812			if (!sk->sk_send_head ||
813			    (copy = mss_now - skb->len) <= 0) {
814
815new_segment:
816				/* Allocate a new segment. If the interface is SG,
817				 * allocate an skb fitting into a single page.
818				 */
819				if (!sk_stream_memory_free(sk))
820					goto wait_for_sndbuf;
821
822				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
823							   0, sk->sk_allocation);
824				if (!skb)
825					goto wait_for_memory;
826
827				/*
828				 * Check whether we can use HW checksum.
829				 */
830				if (sk->sk_route_caps &
831				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
832				     NETIF_F_HW_CSUM))
833					skb->ip_summed = CHECKSUM_HW;
834
835				skb_entail(sk, tp, skb);
836				copy = mss_now;
837			}
838
839			/* Try to append data to the end of skb. */
840			if (copy > seglen)
841				copy = seglen;
842
843			/* Where to copy to? */
844			if (skb_tailroom(skb) > 0) {
845				/* We have some space in skb head. Superb! */
846				if (copy > skb_tailroom(skb))
847					copy = skb_tailroom(skb);
848				if ((err = skb_add_data(skb, from, copy)) != 0)
849					goto do_fault;
850			} else {
851				int merge = 0;
852				int i = skb_shinfo(skb)->nr_frags;
853				struct page *page = TCP_PAGE(sk);
854				int off = TCP_OFF(sk);
855
856				if (skb_can_coalesce(skb, i, page, off) &&
857				    off != PAGE_SIZE) {
858					/* We can extend the last page
859					 * fragment. */
860					merge = 1;
861				} else if (i == MAX_SKB_FRAGS ||
862					   (!i &&
863					   !(sk->sk_route_caps & NETIF_F_SG))) {
864					/* Need to add new fragment and cannot
865					 * do this because interface is non-SG,
866					 * or because all the page slots are
867					 * busy. */
868					tcp_mark_push(tp, skb);
869					goto new_segment;
870				} else if (page) {
871					if (off == PAGE_SIZE) {
872						put_page(page);
873						TCP_PAGE(sk) = page = NULL;
874					}
875				}
876
877				if (!page) {
878					/* Allocate new cache page. */
879					if (!(page = sk_stream_alloc_page(sk)))
880						goto wait_for_memory;
881					off = 0;
882				}
883
884				if (copy > PAGE_SIZE - off)
885					copy = PAGE_SIZE - off;
886
887				/* Time to copy data. We are close to
888				 * the end! */
889				err = skb_copy_to_page(sk, from, skb, page,
890						       off, copy);
891				if (err) {
892					/* If this page was new, give it to the
893					 * socket so it does not get leaked.
894					 */
895					if (!TCP_PAGE(sk)) {
896						TCP_PAGE(sk) = page;
897						TCP_OFF(sk) = 0;
898					}
899					goto do_error;
900				}
901
902				/* Update the skb. */
903				if (merge) {
904					skb_shinfo(skb)->frags[i - 1].size +=
905									copy;
906				} else {
907					skb_fill_page_desc(skb, i, page, off, copy);
908					if (TCP_PAGE(sk)) {
909						get_page(page);
910					} else if (off + copy < PAGE_SIZE) {
911						get_page(page);
912						TCP_PAGE(sk) = page;
913					}
914				}
915
916				TCP_OFF(sk) = off + copy;
917			}
918
919			if (!copied)
920				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
921
922			tp->write_seq += copy;
923			TCP_SKB_CB(skb)->end_seq += copy;
924			skb_shinfo(skb)->tso_segs = 0;
925
926			from += copy;
927			copied += copy;
928			if ((seglen -= copy) == 0 && iovlen == 0)
929				goto out;
930
931			if (skb->len != mss_now || (flags & MSG_OOB))
932				continue;
933
934			if (forced_push(tp)) {
935				tcp_mark_push(tp, skb);
936				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
937			} else if (skb == sk->sk_send_head)
938				tcp_push_one(sk, mss_now);
939			continue;
940
941wait_for_sndbuf:
942			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
943wait_for_memory:
944			if (copied)
945				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
946
947			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
948				goto do_error;
949
950			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
951		}
952	}
953
954out:
955	if (copied)
956		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
957	TCP_CHECK_TIMER(sk);
958	release_sock(sk);
959	return copied;
960
961do_fault:
962	if (!skb->len) {
963		if (sk->sk_send_head == skb)
964			sk->sk_send_head = NULL;
965		__skb_unlink(skb, skb->list);
966		sk_stream_free_skb(sk, skb);
967	}
968
969do_error:
970	if (copied)
971		goto out;
972out_err:
973	err = sk_stream_error(sk, flags, err);
974	TCP_CHECK_TIMER(sk);
975	release_sock(sk);
976	return err;
977}
978
979/*
980 *	Handle reading urgent data. BSD has very simple semantics for
981 *	this, no blocking and very strange errors 8)
982 */
983
984static int tcp_recv_urg(struct sock *sk, long timeo,
985			struct msghdr *msg, int len, int flags,
986			int *addr_len)
987{
988	struct tcp_sock *tp = tcp_sk(sk);
989
990	/* No URG data to read. */
991	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
992	    tp->urg_data == TCP_URG_READ)
993		return -EINVAL;	/* Yes this is right ! */
994
995	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
996		return -ENOTCONN;
997
998	if (tp->urg_data & TCP_URG_VALID) {
999		int err = 0;
1000		char c = tp->urg_data;
1001
1002		if (!(flags & MSG_PEEK))
1003			tp->urg_data = TCP_URG_READ;
1004
1005		/* Read urgent data. */
1006		msg->msg_flags |= MSG_OOB;
1007
1008		if (len > 0) {
1009			if (!(flags & MSG_TRUNC))
1010				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1011			len = 1;
1012		} else
1013			msg->msg_flags |= MSG_TRUNC;
1014
1015		return err ? -EFAULT : len;
1016	}
1017
1018	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1019		return 0;
1020
1021	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1022	 * the available implementations agree in this case:
1023	 * this call should never block, independent of the
1024	 * blocking state of the socket.
1025	 * Mike <pall@rz.uni-karlsruhe.de>
1026	 */
1027	return -EAGAIN;
1028}
1029
1030/* Clean up the receive buffer for full frames taken by the user,
1031 * then send an ACK if necessary.  COPIED is the number of bytes
1032 * tcp_recvmsg has given to the user so far; it speeds up the
1033 * calculation of whether or not we must ACK for the sake of
1034 * a window update.
1035 */
1036static void cleanup_rbuf(struct sock *sk, int copied)
1037{
1038	struct tcp_sock *tp = tcp_sk(sk);
1039	int time_to_ack = 0;
1040
1041#if TCP_DEBUG
1042	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1043
1044	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1045#endif
1046
1047	if (tcp_ack_scheduled(tp)) {
1048		   /* Delayed ACKs frequently hit locked sockets during bulk
1049		    * receive. */
1050		if (tp->ack.blocked ||
1051		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1052		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1053		    /*
1054		     * If this read emptied the read buffer, we send an ACK when
1055		     * the connection is not bidirectional, the user has drained
1056		     * the receive buffer and there was a small segment
1057		     * in the queue.
1058		     */
1059		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1060		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1061			time_to_ack = 1;
1062	}
1063
1064	/* We send an ACK if we can now advertise a non-zero window
1065	 * which has been raised "significantly".
1066	 *
1067	 * Even if window raised up to infinity, do not send window open ACK
1068	 * in states, where we will not receive more. It is useless.
1069	 */
1070	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1071		__u32 rcv_window_now = tcp_receive_window(tp);
1072
1073		/* Optimize, __tcp_select_window() is not cheap. */
1074		if (2*rcv_window_now <= tp->window_clamp) {
1075			__u32 new_window = __tcp_select_window(sk);
1076
1077			/* Send ACK now if this read freed lots of space
1078			 * in our buffer. new_window is the window we could advertise
1079			 * now; do so only if it is not less than the current one.
1080			 * "Lots" means "at least twice" here.
1081			 */
1082			if (new_window && new_window >= 2 * rcv_window_now)
1083				time_to_ack = 1;
1084		}
1085	}
1086	if (time_to_ack)
1087		tcp_send_ack(sk);
1088}
1089
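/* Run the packets that were prequeued for user-context processing
 * through the normal receive path (sk_backlog_rcv).
 */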
1090static void tcp_prequeue_process(struct sock *sk)
1091{
1092	struct sk_buff *skb;
1093	struct tcp_sock *tp = tcp_sk(sk);
1094
1095	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1096
1097	/* RX process wants to run with disabled BHs, though it is not
1098	 * necessary */
1099	local_bh_disable();
1100	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1101		sk->sk_backlog_rcv(sk, skb);
1102	local_bh_enable();
1103
1104	/* Clear memory counter. */
1105	tp->ucopy.memory = 0;
1106}
1107
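/* Find the skb in the receive queue that covers sequence number 'seq'
 * and report the offset of that byte within it.  A SYN consumes one
 * sequence number but no data, hence the adjustment; an skb carrying a
 * FIN is returned even when the offset points past its data.
 */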
1108static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1109{
1110	struct sk_buff *skb;
1111	u32 offset;
1112
1113	skb_queue_walk(&sk->sk_receive_queue, skb) {
1114		offset = seq - TCP_SKB_CB(skb)->seq;
1115		if (skb->h.th->syn)
1116			offset--;
1117		if (offset < skb->len || skb->h.th->fin) {
1118			*off = offset;
1119			return skb;
1120		}
1121	}
1122	return NULL;
1123}
1124
1125/*
1126 * This routine provides an alternative to tcp_recvmsg() for routines
1127 * that would like to handle copying from skbuffs directly in 'sendfile'
1128 * fashion.
1129 * Note:
1130 *	- It is assumed that the socket was locked by the caller.
1131 *	- The routine does not block.
1132 *	- At present, there is no support for reading OOB data
1133 *	  or for 'peeking' the socket using this routine
1134 *	  (although both would be easy to implement).
1135 */
1136int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1137		  sk_read_actor_t recv_actor)
1138{
1139	struct sk_buff *skb;
1140	struct tcp_sock *tp = tcp_sk(sk);
1141	u32 seq = tp->copied_seq;
1142	u32 offset;
1143	int copied = 0;
1144
1145	if (sk->sk_state == TCP_LISTEN)
1146		return -ENOTCONN;
1147	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1148		if (offset < skb->len) {
1149			size_t used, len;
1150
1151			len = skb->len - offset;
1152			/* Stop reading if we hit a patch of urgent data */
1153			if (tp->urg_data) {
1154				u32 urg_offset = tp->urg_seq - seq;
1155				if (urg_offset < len)
1156					len = urg_offset;
1157				if (!len)
1158					break;
1159			}
1160			used = recv_actor(desc, skb, offset, len);
1161			if (used <= len) {
1162				seq += used;
1163				copied += used;
1164				offset += used;
1165			}
1166			if (offset != skb->len)
1167				break;
1168		}
1169		if (skb->h.th->fin) {
1170			sk_eat_skb(sk, skb);
1171			++seq;
1172			break;
1173		}
1174		sk_eat_skb(sk, skb);
1175		if (!desc->count)
1176			break;
1177	}
1178	tp->copied_seq = seq;
1179
1180	tcp_rcv_space_adjust(sk);
1181
1182	/* Clean up data we have read: This will do ACK frames. */
1183	if (copied)
1184		cleanup_rbuf(sk, copied);
1185	return copied;
1186}
1187
1188/*
1189 *	This routine copies from a sock struct into the user buffer.
1190 *
1191 *	Technical note: in 2.3 we work on a _locked_ socket, so that
1192 *	tricks with *seq access order and skb->users are not required.
1193 *	Probably, the code can easily be improved even more.
1194 */
1195
1196int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1197		size_t len, int nonblock, int flags, int *addr_len)
1198{
1199	struct tcp_sock *tp = tcp_sk(sk);
1200	int copied = 0;
1201	u32 peek_seq;
1202	u32 *seq;
1203	unsigned long used;
1204	int err;
1205	int target;		/* Read at least this many bytes */
1206	long timeo;
1207	struct task_struct *user_recv = NULL;
1208
1209	lock_sock(sk);
1210
1211	TCP_CHECK_TIMER(sk);
1212
1213	err = -ENOTCONN;
1214	if (sk->sk_state == TCP_LISTEN)
1215		goto out;
1216
1217	timeo = sock_rcvtimeo(sk, nonblock);
1218
1219	/* Urgent data needs to be handled specially. */
1220	if (flags & MSG_OOB)
1221		goto recv_urg;
1222
1223	seq = &tp->copied_seq;
1224	if (flags & MSG_PEEK) {
1225		peek_seq = tp->copied_seq;
1226		seq = &peek_seq;
1227	}
1228
1229	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1230
1231	do {
1232		struct sk_buff *skb;
1233		u32 offset;
1234
1235		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1236		if (tp->urg_data && tp->urg_seq == *seq) {
1237			if (copied)
1238				break;
1239			if (signal_pending(current)) {
1240				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1241				break;
1242			}
1243		}
1244
1245		/* Next get a buffer. */
1246
1247		skb = skb_peek(&sk->sk_receive_queue);
1248		do {
1249			if (!skb)
1250				break;
1251
1252			/* Now that we have two receive queues this
1253			 * shouldn't happen.
1254			 */
1255			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1256				printk(KERN_INFO "recvmsg bug: copied %X "
1257				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1258				break;
1259			}
1260			offset = *seq - TCP_SKB_CB(skb)->seq;
1261			if (skb->h.th->syn)
1262				offset--;
1263			if (offset < skb->len)
1264				goto found_ok_skb;
1265			if (skb->h.th->fin)
1266				goto found_fin_ok;
1267			BUG_TRAP(flags & MSG_PEEK);
1268			skb = skb->next;
1269		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1270
1271		/* Well, if we have backlog, try to process it now. */
1272
1273		if (copied >= target && !sk->sk_backlog.tail)
1274			break;
1275
1276		if (copied) {
1277			if (sk->sk_err ||
1278			    sk->sk_state == TCP_CLOSE ||
1279			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1280			    !timeo ||
1281			    signal_pending(current) ||
1282			    (flags & MSG_PEEK))
1283				break;
1284		} else {
1285			if (sock_flag(sk, SOCK_DONE))
1286				break;
1287
1288			if (sk->sk_err) {
1289				copied = sock_error(sk);
1290				break;
1291			}
1292
1293			if (sk->sk_shutdown & RCV_SHUTDOWN)
1294				break;
1295
1296			if (sk->sk_state == TCP_CLOSE) {
1297				if (!sock_flag(sk, SOCK_DONE)) {
1298					/* This occurs when the user tries to read
1299					 * from a never-connected socket.
1300					 */
1301					copied = -ENOTCONN;
1302					break;
1303				}
1304				break;
1305			}
1306
1307			if (!timeo) {
1308				copied = -EAGAIN;
1309				break;
1310			}
1311
1312			if (signal_pending(current)) {
1313				copied = sock_intr_errno(timeo);
1314				break;
1315			}
1316		}
1317
1318		cleanup_rbuf(sk, copied);
1319
1320		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1321			/* Install new reader */
1322			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1323				user_recv = current;
1324				tp->ucopy.task = user_recv;
1325				tp->ucopy.iov = msg->msg_iov;
1326			}
1327
1328			tp->ucopy.len = len;
1329
1330			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1331				 (flags & (MSG_PEEK | MSG_TRUNC)));
1332
1333			/* Ugly... If the prequeue is not empty, we have to
1334			 * process it before releasing the socket, otherwise
1335			 * ordering will be broken on the second iteration.
1336			 * A more elegant solution is required!!!
1337			 *
1338			 * Look: we have the following (pseudo)queues:
1339			 *
1340			 * 1. packets in flight
1341			 * 2. backlog
1342			 * 3. prequeue
1343			 * 4. receive_queue
1344			 *
1345			 * Each queue can be processed only if the next ones
1346			 * are empty. At this point we have empty receive_queue.
1347			 * But prequeue _can_ be not empty after 2nd iteration,
1348			 * when we jumped to start of loop because backlog
1349			 * processing added something to receive_queue.
1350			 * We cannot release_sock(), because backlog contains
1351			 * packets arrived _after_ prequeued ones.
1352			 *
1353			 * In short, the algorithm is clear --- process all
1354			 * the queues in order. We could do it more directly,
1355			 * requeueing packets from the backlog to the prequeue if it
1356			 * is not empty. That would be more elegant, but eats cycles,
1357			 * unfortunately.
1358			 */
1359			if (skb_queue_len(&tp->ucopy.prequeue))
1360				goto do_prequeue;
1361
1362			/* __ Set realtime policy in scheduler __ */
1363		}
1364
1365		if (copied >= target) {
1366			/* Do not sleep, just process backlog. */
1367			release_sock(sk);
1368			lock_sock(sk);
1369		} else
1370			sk_wait_data(sk, &timeo);
1371
1372		if (user_recv) {
1373			int chunk;
1374
1375			/* __ Restore normal policy in scheduler __ */
1376
1377			if ((chunk = len - tp->ucopy.len) != 0) {
1378				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1379				len -= chunk;
1380				copied += chunk;
1381			}
1382
1383			if (tp->rcv_nxt == tp->copied_seq &&
1384			    skb_queue_len(&tp->ucopy.prequeue)) {
1385do_prequeue:
1386				tcp_prequeue_process(sk);
1387
1388				if ((chunk = len - tp->ucopy.len) != 0) {
1389					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1390					len -= chunk;
1391					copied += chunk;
1392				}
1393			}
1394		}
1395		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1396			if (net_ratelimit())
1397				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1398				       current->comm, current->pid);
1399			peek_seq = tp->copied_seq;
1400		}
1401		continue;
1402
1403	found_ok_skb:
1404		/* Ok so how much can we use? */
1405		used = skb->len - offset;
1406		if (len < used)
1407			used = len;
1408
1409		/* Do we have urgent data here? */
1410		if (tp->urg_data) {
1411			u32 urg_offset = tp->urg_seq - *seq;
1412			if (urg_offset < used) {
1413				if (!urg_offset) {
1414					if (!sock_flag(sk, SOCK_URGINLINE)) {
1415						++*seq;
1416						offset++;
1417						used--;
1418						if (!used)
1419							goto skip_copy;
1420					}
1421				} else
1422					used = urg_offset;
1423			}
1424		}
1425
1426		if (!(flags & MSG_TRUNC)) {
1427			err = skb_copy_datagram_iovec(skb, offset,
1428						      msg->msg_iov, used);
1429			if (err) {
1430				/* Exception. Bailout! */
1431				if (!copied)
1432					copied = -EFAULT;
1433				break;
1434			}
1435		}
1436
1437		*seq += used;
1438		copied += used;
1439		len -= used;
1440
1441		tcp_rcv_space_adjust(sk);
1442
1443skip_copy:
1444		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1445			tp->urg_data = 0;
1446			tcp_fast_path_check(sk, tp);
1447		}
1448		if (used + offset < skb->len)
1449			continue;
1450
1451		if (skb->h.th->fin)
1452			goto found_fin_ok;
1453		if (!(flags & MSG_PEEK))
1454			sk_eat_skb(sk, skb);
1455		continue;
1456
1457	found_fin_ok:
1458		/* Process the FIN. */
1459		++*seq;
1460		if (!(flags & MSG_PEEK))
1461			sk_eat_skb(sk, skb);
1462		break;
1463	} while (len > 0);
1464
1465	if (user_recv) {
1466		if (skb_queue_len(&tp->ucopy.prequeue)) {
1467			int chunk;
1468
1469			tp->ucopy.len = copied > 0 ? len : 0;
1470
1471			tcp_prequeue_process(sk);
1472
1473			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1474				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1475				len -= chunk;
1476				copied += chunk;
1477			}
1478		}
1479
1480		tp->ucopy.task = NULL;
1481		tp->ucopy.len = 0;
1482	}
1483
1484	/* According to UNIX98, msg_name/msg_namelen are ignored
1485	 * on a connected socket. I was just happy when I found this 8) --ANK
1486	 */
1487
1488	/* Clean up data we have read: This will do ACK frames. */
1489	cleanup_rbuf(sk, copied);
1490
1491	TCP_CHECK_TIMER(sk);
1492	release_sock(sk);
1493	return copied;
1494
1495out:
1496	TCP_CHECK_TIMER(sk);
1497	release_sock(sk);
1498	return err;
1499
1500recv_urg:
1501	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1502	goto out;
1503}
1504
1505/*
1506 *	State processing on a close. This implements the state shift for
1507 *	sending our FIN frame. Note that we only send a FIN for some
1508 *	states. A shutdown() may have already sent the FIN, or we may be
1509 *	closed.
1510 */
1511
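/* Each entry below encodes the next state in the low bits, plus
 * TCP_ACTION_FIN when a FIN must be transmitted; tcp_close_state()
 * masks the two apart.  For example, new_state[TCP_ESTABLISHED] is
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, so an active close moves to
 * FIN_WAIT1 and queues a FIN.
 */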
1512static unsigned char new_state[16] = {
1513  /* current state:        new state:      action:	*/
1514  /* (Invalid)		*/ TCP_CLOSE,
1515  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1517  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1518  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1519  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1520  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1521  /* TCP_CLOSE		*/ TCP_CLOSE,
1522  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1523  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1524  /* TCP_LISTEN		*/ TCP_CLOSE,
1525  /* TCP_CLOSING	*/ TCP_CLOSING,
1526};
1527
1528static int tcp_close_state(struct sock *sk)
1529{
1530	int next = (int)new_state[sk->sk_state];
1531	int ns = next & TCP_STATE_MASK;
1532
1533	tcp_set_state(sk, ns);
1534
1535	return next & TCP_ACTION_FIN;
1536}
1537
1538/*
1539 *	Shutdown the sending side of a connection. Much like close except
1540 *	that we don't shut down the receive side or mark the socket SOCK_DEAD.
1541 */
1542
1543void tcp_shutdown(struct sock *sk, int how)
1544{
1545	/*	We need to grab some memory, and put together a FIN,
1546	 *	and then put it into the queue to be sent.
1547	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1548	 */
1549	if (!(how & SEND_SHUTDOWN))
1550		return;
1551
1552	/* If we've already sent a FIN, or it's a closed state, skip this. */
1553	if ((1 << sk->sk_state) &
1554	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1555	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1556		/* Clear out any half completed packets.  FIN if needed. */
1557		if (tcp_close_state(sk))
1558			tcp_send_fin(sk);
1559	}
1560}
1561
1562/*
1563 * At this point, there should be no process reference to this
1564 * socket, and thus no user references at all.  Therefore we
1565 * can assume the socket waitqueue is inactive and nobody will
1566 * try to jump onto it.
1567 */
1568void tcp_destroy_sock(struct sock *sk)
1569{
1570	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1571	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1572
1573	/* It cannot be in hash table! */
1574	BUG_TRAP(sk_unhashed(sk));
1575
1576	/* If inet_sk(sk)->num is not 0, it must be bound */
1577	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1578
1579	sk->sk_prot->destroy(sk);
1580
1581	sk_stream_kill_queues(sk);
1582
1583	xfrm_sk_free_policy(sk);
1584
1585#ifdef INET_REFCNT_DEBUG
1586	if (atomic_read(&sk->sk_refcnt) != 1) {
1587		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1588		       sk, atomic_read(&sk->sk_refcnt));
1589	}
1590#endif
1591
1592	atomic_dec(&tcp_orphan_count);
1593	sock_put(sk);
1594}
1595
1596void tcp_close(struct sock *sk, long timeout)
1597{
1598	struct sk_buff *skb;
1599	int data_was_unread = 0;
1600
1601	lock_sock(sk);
1602	sk->sk_shutdown = SHUTDOWN_MASK;
1603
1604	if (sk->sk_state == TCP_LISTEN) {
1605		tcp_set_state(sk, TCP_CLOSE);
1606
1607		/* Special case. */
1608		tcp_listen_stop(sk);
1609
1610		goto adjudge_to_death;
1611	}
1612
1613	/*  We need to flush the recv. buffs.  We do this only on the
1614	 *  descriptor close, not protocol-sourced closes, because the
1615	 *  reader process may not have drained the data yet!
1616	 */
1617	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1618		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1619			  skb->h.th->fin;
1620		data_was_unread += len;
1621		__kfree_skb(skb);
1622	}
1623
1624	sk_stream_mem_reclaim(sk);
1625
1626	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1627	 * 3.10, we send a RST here because data was lost.  To
1628	 * witness the awful effects of the old behavior of always
1629	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1630	 * a bulk GET in an FTP client, suspend the process, wait
1631	 * for the client to advertise a zero window, then kill -9
1632	 * the FTP client, wheee...  Note: timeout is always zero
1633	 * in such a case.
1634	 */
1635	if (data_was_unread) {
1636		/* Unread data was tossed, zap the connection. */
1637		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1638		tcp_set_state(sk, TCP_CLOSE);
1639		tcp_send_active_reset(sk, GFP_KERNEL);
1640	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1641		/* Check zero linger _after_ checking for unread data. */
1642		sk->sk_prot->disconnect(sk, 0);
1643		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1644	} else if (tcp_close_state(sk)) {
1645		/* We FIN if the application ate all the data before
1646		 * zapping the connection.
1647		 */
1648
1649		/* RED-PEN. Formally speaking, we have broken TCP state
1650		 * machine. State transitions:
1651		 *
1652		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1653		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1654		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1655		 *
1656		 * are legal only when FIN has been sent (i.e. in window),
1657		 * rather than queued out of window. Purists blame.
1658		 *
1659		 * F.e. "RFC state" is ESTABLISHED,
1660		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1661		 *
1662		 * The visible deviations are that we sometimes
1663		 * enter the time-wait state when it is not really required
1664		 * (harmless), and do not send active resets when they are
1665		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1666		 * they look like CLOSING or LAST_ACK to Linux).
1667		 * Probably, I missed some more holelets.
1668		 * 						--ANK
1669		 */
1670		tcp_send_fin(sk);
1671	}
1672
1673	sk_stream_wait_close(sk, timeout);
1674
1675adjudge_to_death:
1676	/* It is the last release_sock in its life. It will remove backlog. */
1677	release_sock(sk);
1678
1679
1680	/* Now socket is owned by kernel and we acquire BH lock
1681	   to finish close. No need to check for user refs.
1682	 */
1683	local_bh_disable();
1684	bh_lock_sock(sk);
1685	BUG_TRAP(!sock_owned_by_user(sk));
1686
1687	sock_hold(sk);
1688	sock_orphan(sk);
1689
1690	/*	This is a (useful) BSD-style violation of the RFC. There is a
1691	 *	problem with TCP as specified, in that the other end could
1692	 *	keep a socket open forever with no application left at this end.
1693	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1694	 *	our end. If they send after that then tough - BUT this is long
1695	 *	enough that we avoid the old "4*rto = almost no time - whoops,
1696	 *	reset" mistake.
1697	 *
1698	 *	Nope, it was not mistake. It is really desired behaviour
1699	 *	f.e. on http servers, when such sockets are useless, but
1700	 *	consume significant resources. Let's do it with special
1701	 *	linger2	option.					--ANK
1702	 */
1703
1704	if (sk->sk_state == TCP_FIN_WAIT2) {
1705		struct tcp_sock *tp = tcp_sk(sk);
1706		if (tp->linger2 < 0) {
1707			tcp_set_state(sk, TCP_CLOSE);
1708			tcp_send_active_reset(sk, GFP_ATOMIC);
1709			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1710		} else {
1711			int tmo = tcp_fin_time(tp);
1712
1713			if (tmo > TCP_TIMEWAIT_LEN) {
1714				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1715			} else {
1716				atomic_inc(&tcp_orphan_count);
1717				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1718				goto out;
1719			}
1720		}
1721	}
1722	if (sk->sk_state != TCP_CLOSE) {
1723		sk_stream_mem_reclaim(sk);
1724		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1725		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1726		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1727			if (net_ratelimit())
1728				printk(KERN_INFO "TCP: too many orphaned "
1729				       "sockets\n");
1730			tcp_set_state(sk, TCP_CLOSE);
1731			tcp_send_active_reset(sk, GFP_ATOMIC);
1732			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1733		}
1734	}
1735	atomic_inc(&tcp_orphan_count);
1736
1737	if (sk->sk_state == TCP_CLOSE)
1738		tcp_destroy_sock(sk);
1739	/* Otherwise, socket is reprieved until protocol close. */
1740
1741out:
1742	bh_unlock_sock(sk);
1743	local_bh_enable();
1744	sock_put(sk);
1745}
1746
1747/* These states need RST on ABORT according to RFC793 */
1748
1749static inline int tcp_need_reset(int state)
1750{
1751	return (1 << state) &
1752	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1753		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1754}
1755
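/*
 *	Abort the connection (the ABORT call of RFC 793): send a reset if
 *	the current state requires one, purge every queue and timer, and
 *	return the socket to a clean, unconnected state so it can be
 *	reused.
 */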
1756int tcp_disconnect(struct sock *sk, int flags)
1757{
1758	struct inet_sock *inet = inet_sk(sk);
1759	struct tcp_sock *tp = tcp_sk(sk);
1760	int err = 0;
1761	int old_state = sk->sk_state;
1762
1763	if (old_state != TCP_CLOSE)
1764		tcp_set_state(sk, TCP_CLOSE);
1765
1766	/* ABORT function of RFC793 */
1767	if (old_state == TCP_LISTEN) {
1768		tcp_listen_stop(sk);
1769	} else if (tcp_need_reset(old_state) ||
1770		   (tp->snd_nxt != tp->write_seq &&
1771		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1772		/* The last check adjusts for the discrepancy between Linux
1773		 * and RFC states.
1774		 */
1775		tcp_send_active_reset(sk, gfp_any());
1776		sk->sk_err = ECONNRESET;
1777	} else if (old_state == TCP_SYN_SENT)
1778		sk->sk_err = ECONNRESET;
1779
1780	tcp_clear_xmit_timers(sk);
1781	__skb_queue_purge(&sk->sk_receive_queue);
1782	sk_stream_writequeue_purge(sk);
1783	__skb_queue_purge(&tp->out_of_order_queue);
1784
1785	inet->dport = 0;
1786
1787	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1788		inet_reset_saddr(sk);
1789
1790	sk->sk_shutdown = 0;
1791	sock_reset_flag(sk, SOCK_DONE);
1792	tp->srtt = 0;
1793	if ((tp->write_seq += tp->max_window + 2) == 0)
1794		tp->write_seq = 1;
1795	tp->backoff = 0;
1796	tp->snd_cwnd = 2;
1797	tp->probes_out = 0;
1798	tp->packets_out = 0;
1799	tp->snd_ssthresh = 0x7fffffff;
1800	tp->snd_cwnd_cnt = 0;
1801	tcp_set_ca_state(tp, TCP_CA_Open);
1802	tcp_clear_retrans(tp);
1803	tcp_delack_init(tp);
1804	sk->sk_send_head = NULL;
1805	tp->rx_opt.saw_tstamp = 0;
1806	tcp_sack_reset(&tp->rx_opt);
1807	__sk_dst_reset(sk);
1808
1809	BUG_TRAP(!inet->num || tp->bind_hash);
1810
1811	sk->sk_error_report(sk);
1812	return err;
1813}
1814
1815/*
1816 *	Wait for an incoming connection, avoid race
1817 *	conditions. This must be called with the socket locked.
1818 */
1819static int wait_for_connect(struct sock *sk, long timeo)
1820{
1821	struct tcp_sock *tp = tcp_sk(sk);
1822	DEFINE_WAIT(wait);
1823	int err;
1824
1825	/*
1826	 * True wake-one mechanism for incoming connections: only
1827	 * one process gets woken up, not the 'whole herd'.
1828	 * Since we do not 'race & poll' for established sockets
1829	 * anymore, the common case will execute the loop only once.
1830	 *
1831	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1832	 * after any current non-exclusive waiters, and we know that
1833	 * it will always _stay_ after any new non-exclusive waiters
1834	 * because all non-exclusive waiters are added at the
1835	 * beginning of the wait-queue. As such, it's ok to "drop"
1836	 * our exclusiveness temporarily when we get woken up without
1837	 * having to remove and re-insert us on the wait queue.
1838	 */
1839	for (;;) {
1840		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1841					  TASK_INTERRUPTIBLE);
1842		release_sock(sk);
1843		if (reqsk_queue_empty(&tp->accept_queue))
1844			timeo = schedule_timeout(timeo);
1845		lock_sock(sk);
1846		err = 0;
1847		if (!reqsk_queue_empty(&tp->accept_queue))
1848			break;
1849		err = -EINVAL;
1850		if (sk->sk_state != TCP_LISTEN)
1851			break;
1852		err = sock_intr_errno(timeo);
1853		if (signal_pending(current))
1854			break;
1855		err = -EAGAIN;
1856		if (!timeo)
1857			break;
1858	}
1859	finish_wait(sk->sk_sleep, &wait);
1860	return err;
1861}
1862
1863/*
1864 *	This will accept the next outstanding connection.
1865 */
1866
1867struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1868{
1869	struct tcp_sock *tp = tcp_sk(sk);
1870	struct sock *newsk;
1871	int error;
1872
1873	lock_sock(sk);
1874
1875	/* We need to make sure that this socket is listening,
1876	 * and that it has something pending.
1877	 */
1878	error = -EINVAL;
1879	if (sk->sk_state != TCP_LISTEN)
1880		goto out_err;
1881
1882	/* Find already established connection */
1883	if (reqsk_queue_empty(&tp->accept_queue)) {
1884		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1885
1886		/* If this is a non-blocking socket, don't sleep */
1887		error = -EAGAIN;
1888		if (!timeo)
1889			goto out_err;
1890
1891		error = wait_for_connect(sk, timeo);
1892		if (error)
1893			goto out_err;
1894	}
1895
1896	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1897	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1898out:
1899	release_sock(sk);
1900	return newsk;
1901out_err:
1902	newsk = NULL;
1903	*err = error;
1904	goto out;
1905}
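/*
 * Illustrative user-space counterpart of the paths above: on a listening
 * socket marked O_NONBLOCK the timeout from sock_rcvtimeo() is zero, so an
 * empty accept queue surfaces as EAGAIN instead of a sleep in
 * wait_for_connect().  A sketch with a made-up helper name and trimmed
 * error handling:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	static int try_accept(int listen_fd)
 *	{
 *		int fd = accept(listen_fd, NULL, NULL);
 *
 *		if (fd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *			return -1;	// nothing queued yet; poll() and retry
 *		return fd;
 *	}
 */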
1906
1907/*
1908 *	Socket option code for TCP.
1909 */
1910int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1911		   int optlen)
1912{
1913	struct tcp_sock *tp = tcp_sk(sk);
1914	int val;
1915	int err = 0;
1916
1917	if (level != SOL_TCP)
1918		return tp->af_specific->setsockopt(sk, level, optname,
1919						   optval, optlen);
1920
1921	/* This is a string value; all the other options are ints */
1922	if (optname == TCP_CONGESTION) {
1923		char name[TCP_CA_NAME_MAX];
1924
1925		if (optlen < 1)
1926			return -EINVAL;
1927
1928		val = strncpy_from_user(name, optval,
1929					min(TCP_CA_NAME_MAX-1, optlen));
1930		if (val < 0)
1931			return -EFAULT;
1932		name[val] = 0;
1933
1934		lock_sock(sk);
1935		err = tcp_set_congestion_control(tp, name);
1936		release_sock(sk);
1937		return err;
1938	}
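	/*
	 * Illustrative user-space use of the string option handled above;
	 * "reno" is always registered by tcp_init(), other names depend on
	 * which congestion control modules are loaded, and TCP_CONGESTION
	 * may need defining by hand if the installed headers predate it.
	 * The helper name is made up:
	 *
	 *	#include <netinet/in.h>
	 *	#include <netinet/tcp.h>
	 *	#include <sys/socket.h>
	 *	#include <string.h>
	 *
	 *	static int use_reno(int fd)
	 *	{
	 *		const char name[] = "reno";
	 *
	 *		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
	 *				  name, strlen(name));
	 *	}
	 */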
1939
1940	if (optlen < sizeof(int))
1941		return -EINVAL;
1942
1943	if (get_user(val, (int __user *)optval))
1944		return -EFAULT;
1945
1946	lock_sock(sk);
1947
1948	switch (optname) {
1949	case TCP_MAXSEG:
1950		/* Values greater than the interface MTU won't take effect.
1951		 * However, at the point when this call is made we typically
1952		 * don't yet know which interface is going to be used. */
1953		if (val < 8 || val > MAX_TCP_WINDOW) {
1954			err = -EINVAL;
1955			break;
1956		}
1957		tp->rx_opt.user_mss = val;
1958		break;
1959
1960	case TCP_NODELAY:
1961		if (val) {
1962			/* TCP_NODELAY is weaker than TCP_CORK, so setting
1963			 * this option on a corked socket is remembered, but
1964			 * it is not activated until the cork is cleared.
1965			 *
1966			 * However, when TCP_NODELAY is set we make
1967			 * an explicit push, which overrides even TCP_CORK
1968			 * for currently queued segments.
1969			 */
1970			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1971			tcp_push_pending_frames(sk, tp);
1972		} else {
1973			tp->nonagle &= ~TCP_NAGLE_OFF;
1974		}
1975		break;
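		/*
		 * Illustrative user-space sketch for the TCP_NODELAY case
		 * above: setting the option disables Nagle and explicitly
		 * pushes anything already queued.  The helper name is made up:
		 *
		 *	#include <netinet/in.h>
		 *	#include <netinet/tcp.h>
		 *	#include <sys/socket.h>
		 *
		 *	static int set_nodelay(int fd, int on)
		 *	{
		 *		return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
		 *				  &on, sizeof(on));
		 *	}
		 */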
1976
1977	case TCP_CORK:
1978		/* When set, this tells TCP to always queue non-full frames.
1979		 * Later the user clears this option and we transmit
1980		 * any pending partial frames in the queue.  This is
1981		 * meant to be used alongside sendfile() to get properly
1982		 * filled frames when the user (for example) must write
1983		 * out headers with a write() call first and then use
1984		 * sendfile to send out the data parts.
1985		 *
1986		 * TCP_CORK can be set together with TCP_NODELAY and it is
1987		 * stronger than TCP_NODELAY.
1988		 */
1989		if (val) {
1990			tp->nonagle |= TCP_NAGLE_CORK;
1991		} else {
1992			tp->nonagle &= ~TCP_NAGLE_CORK;
1993			if (tp->nonagle&TCP_NAGLE_OFF)
1994				tp->nonagle |= TCP_NAGLE_PUSH;
1995			tcp_push_pending_frames(sk, tp);
1996		}
1997		break;
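		/*
		 * Illustrative user-space sketch of the pattern the comment
		 * above describes: cork, write the headers, sendfile() the
		 * body, then uncork so the final partial frame is pushed out.
		 * The helper and its parameters are made up:
		 *
		 *	#include <netinet/in.h>
		 *	#include <netinet/tcp.h>
		 *	#include <sys/socket.h>
		 *	#include <sys/sendfile.h>
		 *	#include <unistd.h>
		 *
		 *	static void send_response(int sock, int file_fd,
		 *				  const char *hdr, size_t hdr_len,
		 *				  size_t body_len)
		 *	{
		 *		int on = 1, off = 0;
		 *		off_t offset = 0;
		 *
		 *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *		write(sock, hdr, hdr_len);
		 *		sendfile(sock, file_fd, &offset, body_len);
		 *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 *	}
		 */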
1998
1999	case TCP_KEEPIDLE:
2000		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2001			err = -EINVAL;
2002		else {
2003			tp->keepalive_time = val * HZ;
2004			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2005			    !((1 << sk->sk_state) &
2006			      (TCPF_CLOSE | TCPF_LISTEN))) {
2007				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2008				if (tp->keepalive_time > elapsed)
2009					elapsed = tp->keepalive_time - elapsed;
2010				else
2011					elapsed = 0;
2012				tcp_reset_keepalive_timer(sk, elapsed);
2013			}
2014		}
2015		break;
2016	case TCP_KEEPINTVL:
2017		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2018			err = -EINVAL;
2019		else
2020			tp->keepalive_intvl = val * HZ;
2021		break;
2022	case TCP_KEEPCNT:
2023		if (val < 1 || val > MAX_TCP_KEEPCNT)
2024			err = -EINVAL;
2025		else
2026			tp->keepalive_probes = val;
2027		break;
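	/*
	 * Illustrative user-space sketch tying the three keepalive knobs
	 * above together; SO_KEEPALIVE must also be enabled for them to
	 * matter.  The helper name is made up:
	 *
	 *	#include <netinet/in.h>
	 *	#include <netinet/tcp.h>
	 *	#include <sys/socket.h>
	 *
	 *	static void tune_keepalive(int fd, int idle, int intvl, int cnt)
	 *	{
	 *		int on = 1;
	 *
	 *		setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 *	}
	 */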
2028	case TCP_SYNCNT:
2029		if (val < 1 || val > MAX_TCP_SYNCNT)
2030			err = -EINVAL;
2031		else
2032			tp->syn_retries = val;
2033		break;
2034
2035	case TCP_LINGER2:
2036		if (val < 0)
2037			tp->linger2 = -1;
2038		else if (val > sysctl_tcp_fin_timeout / HZ)
2039			tp->linger2 = 0;
2040		else
2041			tp->linger2 = val * HZ;
2042		break;
2043
2044	case TCP_DEFER_ACCEPT:
2045		tp->defer_accept = 0;
2046		if (val > 0) {
2047			/* Translate value in seconds to number of
2048			 * retransmits */
2049			while (tp->defer_accept < 32 &&
2050			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2051				       tp->defer_accept))
2052				tp->defer_accept++;
2053			tp->defer_accept++;
2054		}
2055		break;
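	/*
	 * Worked example of the translation above, assuming the usual
	 * TCP_TIMEOUT_INIT of 3 * HZ (so TCP_TIMEOUT_INIT / HZ == 3):
	 *
	 *	val = 30 seconds
	 *	thresholds checked: 3, 6, 12, 24, 48
	 *	30 > 3, 6, 12, 24 but not > 48  ->  loop leaves defer_accept == 4
	 *	final increment                 ->  defer_accept == 5
	 *
	 * tcp_getsockopt() below reports this back as
	 * (TCP_TIMEOUT_INIT / HZ) << (defer_accept - 1) == 3 << 4 == 48 seconds,
	 * i.e. the requested value rounded up to the retransmit schedule.
	 */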
2056
2057	case TCP_WINDOW_CLAMP:
2058		if (!val) {
2059			if (sk->sk_state != TCP_CLOSE) {
2060				err = -EINVAL;
2061				break;
2062			}
2063			tp->window_clamp = 0;
2064		} else
2065			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2066						SOCK_MIN_RCVBUF / 2 : val;
2067		break;
2068
2069	case TCP_QUICKACK:
2070		if (!val) {
2071			tp->ack.pingpong = 1;
2072		} else {
2073			tp->ack.pingpong = 0;
2074			if ((1 << sk->sk_state) &
2075			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2076			    tcp_ack_scheduled(tp)) {
2077				tp->ack.pending |= TCP_ACK_PUSHED;
2078				cleanup_rbuf(sk, 1);
2079				if (!(val & 1))
2080					tp->ack.pingpong = 1;
2081			}
2082		}
2083		break;
2084
2085	default:
2086		err = -ENOPROTOOPT;
2087		break;
2088	};
2089	release_sock(sk);
2090	return err;
2091}
2092
2093	/* Return information about the state of a TCP endpoint in API format. */
2094void tcp_get_info(struct sock *sk, struct tcp_info *info)
2095{
2096	struct tcp_sock *tp = tcp_sk(sk);
2097	u32 now = tcp_time_stamp;
2098
2099	memset(info, 0, sizeof(*info));
2100
2101	info->tcpi_state = sk->sk_state;
2102	info->tcpi_ca_state = tp->ca_state;
2103	info->tcpi_retransmits = tp->retransmits;
2104	info->tcpi_probes = tp->probes_out;
2105	info->tcpi_backoff = tp->backoff;
2106
2107	if (tp->rx_opt.tstamp_ok)
2108		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2109	if (tp->rx_opt.sack_ok)
2110		info->tcpi_options |= TCPI_OPT_SACK;
2111	if (tp->rx_opt.wscale_ok) {
2112		info->tcpi_options |= TCPI_OPT_WSCALE;
2113		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2114		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2115	}
2116
2117	if (tp->ecn_flags&TCP_ECN_OK)
2118		info->tcpi_options |= TCPI_OPT_ECN;
2119
2120	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2121	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2122	info->tcpi_snd_mss = tp->mss_cache_std;
2123	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2124
2125	info->tcpi_unacked = tp->packets_out;
2126	info->tcpi_sacked = tp->sacked_out;
2127	info->tcpi_lost = tp->lost_out;
2128	info->tcpi_retrans = tp->retrans_out;
2129	info->tcpi_fackets = tp->fackets_out;
2130
2131	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2132	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2133	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2134
2135	info->tcpi_pmtu = tp->pmtu_cookie;
2136	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2137	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2138	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2139	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2140	info->tcpi_snd_cwnd = tp->snd_cwnd;
2141	info->tcpi_advmss = tp->advmss;
2142	info->tcpi_reordering = tp->reordering;
2143
2144	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2145	info->tcpi_rcv_space = tp->rcvq_space.space;
2146
2147	info->tcpi_total_retrans = tp->total_retrans;
2148}
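/*
 * Illustrative user-space reader for the structure filled in above; the
 * struct tcp_info layout comes from the installed <netinet/tcp.h> (or
 * <linux/tcp.h>), and the helper name is made up:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void dump_rtt(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (!getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len))
 *			printf("rtt %u us, cwnd %u\n",
 *			       info.tcpi_rtt, info.tcpi_snd_cwnd);
 *	}
 */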
2149
2150EXPORT_SYMBOL_GPL(tcp_get_info);
2151
2152int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2153		   int __user *optlen)
2154{
2155	struct tcp_sock *tp = tcp_sk(sk);
2156	int val, len;
2157
2158	if (level != SOL_TCP)
2159		return tp->af_specific->getsockopt(sk, level, optname,
2160						   optval, optlen);
2161
2162	if (get_user(len, optlen))
2163		return -EFAULT;
2164
2165	len = min_t(unsigned int, len, sizeof(int));
2166
2167	if (len < 0)
2168		return -EINVAL;
2169
2170	switch (optname) {
2171	case TCP_MAXSEG:
2172		val = tp->mss_cache_std;
2173		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2174			val = tp->rx_opt.user_mss;
2175		break;
2176	case TCP_NODELAY:
2177		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2178		break;
2179	case TCP_CORK:
2180		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2181		break;
2182	case TCP_KEEPIDLE:
2183		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2184		break;
2185	case TCP_KEEPINTVL:
2186		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2187		break;
2188	case TCP_KEEPCNT:
2189		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2190		break;
2191	case TCP_SYNCNT:
2192		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2193		break;
2194	case TCP_LINGER2:
2195		val = tp->linger2;
2196		if (val >= 0)
2197			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2198		break;
2199	case TCP_DEFER_ACCEPT:
2200		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2201					       (tp->defer_accept - 1));
2202		break;
2203	case TCP_WINDOW_CLAMP:
2204		val = tp->window_clamp;
2205		break;
2206	case TCP_INFO: {
2207		struct tcp_info info;
2208
2209		if (get_user(len, optlen))
2210			return -EFAULT;
2211
2212		tcp_get_info(sk, &info);
2213
2214		len = min_t(unsigned int, len, sizeof(info));
2215		if (put_user(len, optlen))
2216			return -EFAULT;
2217		if (copy_to_user(optval, &info, len))
2218			return -EFAULT;
2219		return 0;
2220	}
2221	case TCP_QUICKACK:
2222		val = !tp->ack.pingpong;
2223		break;
2224
2225	case TCP_CONGESTION:
2226		if (get_user(len, optlen))
2227			return -EFAULT;
2228		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2229		if (put_user(len, optlen))
2230			return -EFAULT;
2231		if (copy_to_user(optval, tp->ca_ops->name, len))
2232			return -EFAULT;
2233		return 0;
2234	default:
2235		return -ENOPROTOOPT;
2236	};
2237
2238	if (put_user(len, optlen))
2239		return -EFAULT;
2240	if (copy_to_user(optval, &val, len))
2241		return -EFAULT;
2242	return 0;
2243}
2244
2245
2246extern void __skb_cb_too_small_for_tcp(int, int);
2247extern struct tcp_congestion_ops tcp_reno;
2248
2249static __initdata unsigned long thash_entries;
2250static int __init set_thash_entries(char *str)
2251{
2252	if (!str)
2253		return 0;
2254	thash_entries = simple_strtoul(str, &str, 0);
2255	return 1;
2256}
2257__setup("thash_entries=", set_thash_entries);
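/*
 * With the __setup() hook above, the established-hash size can be pinned
 * from the kernel command line instead of being derived from memory size,
 * e.g. booting with "thash_entries=131072" (the value here is just an
 * illustrative request, rounded by alloc_large_system_hash() below).
 */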
2258
2259void __init tcp_init(void)
2260{
2261	struct sk_buff *skb = NULL;
2262	int order, i;
2263
2264	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2265		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2266					   sizeof(skb->cb));
2267
2268	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2269					      sizeof(struct tcp_bind_bucket),
2270					      0, SLAB_HWCACHE_ALIGN,
2271					      NULL, NULL);
2272	if (!tcp_bucket_cachep)
2273		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2274
2275	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2276						sizeof(struct tcp_tw_bucket),
2277						0, SLAB_HWCACHE_ALIGN,
2278						NULL, NULL);
2279	if (!tcp_timewait_cachep)
2280		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2281
2282	/* Size and allocate the main established and bind bucket
2283	 * hash tables.
2284	 *
2285	 * The methodology is similar to that of the buffer cache.
2286	 */
2287	tcp_ehash = (struct tcp_ehash_bucket *)
2288		alloc_large_system_hash("TCP established",
2289					sizeof(struct tcp_ehash_bucket),
2290					thash_entries,
2291					(num_physpages >= 128 * 1024) ?
2292						(25 - PAGE_SHIFT) :
2293						(27 - PAGE_SHIFT),
2294					HASH_HIGHMEM,
2295					&tcp_ehash_size,
2296					NULL,
2297					0);
2298	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2299	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2300		rwlock_init(&tcp_ehash[i].lock);
2301		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2302	}
2303
2304	tcp_bhash = (struct tcp_bind_hashbucket *)
2305		alloc_large_system_hash("TCP bind",
2306					sizeof(struct tcp_bind_hashbucket),
2307					tcp_ehash_size,
2308					(num_physpages >= 128 * 1024) ?
2309						(25 - PAGE_SHIFT) :
2310						(27 - PAGE_SHIFT),
2311					HASH_HIGHMEM,
2312					&tcp_bhash_size,
2313					NULL,
2314					64 * 1024);
2315	tcp_bhash_size = 1 << tcp_bhash_size;
2316	for (i = 0; i < tcp_bhash_size; i++) {
2317		spin_lock_init(&tcp_bhash[i].lock);
2318		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2319	}
2320
2321	/* Try to be a bit smarter and adjust defaults depending
2322	 * on available memory.
2323	 */
2324	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2325			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2326			order++)
2327		;
2328	if (order >= 4) {
2329		sysctl_local_port_range[0] = 32768;
2330		sysctl_local_port_range[1] = 61000;
2331		sysctl_tcp_max_tw_buckets = 180000;
2332		sysctl_tcp_max_orphans = 4096 << (order - 4);
2333		sysctl_max_syn_backlog = 1024;
2334	} else if (order < 3) {
2335		sysctl_local_port_range[0] = 1024 * (3 - order);
2336		sysctl_tcp_max_tw_buckets >>= (3 - order);
2337		sysctl_tcp_max_orphans >>= (3 - order);
2338		sysctl_max_syn_backlog = 128;
2339	}
2340	tcp_port_rover = sysctl_local_port_range[0] - 1;
2341
2342	sysctl_tcp_mem[0] =  768 << order;
2343	sysctl_tcp_mem[1] = 1024 << order;
2344	sysctl_tcp_mem[2] = 1536 << order;
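	/*
	 * Worked example of the scaling above, assuming the loop computed
	 * order == 4:
	 *
	 *	sysctl_tcp_mem[0] =  768 << 4 == 12288 pages
	 *	sysctl_tcp_mem[1] = 1024 << 4 == 16384 pages
	 *	sysctl_tcp_mem[2] = 1536 << 4 == 24576 pages
	 *
	 * i.e. roughly 48/64/96 MB of TCP buffer memory with 4 KB pages.
	 */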
2345
2346	if (order < 3) {
2347		sysctl_tcp_wmem[2] = 64 * 1024;
2348		sysctl_tcp_rmem[0] = PAGE_SIZE;
2349		sysctl_tcp_rmem[1] = 43689;
2350		sysctl_tcp_rmem[2] = 2 * 43689;
2351	}
2352
2353	printk(KERN_INFO "TCP: Hash tables configured "
2354	       "(established %d bind %d)\n",
2355	       tcp_ehash_size << 1, tcp_bhash_size);
2356
2357	tcp_register_congestion_control(&tcp_reno);
2358}
2359
2360EXPORT_SYMBOL(tcp_accept);
2361EXPORT_SYMBOL(tcp_close);
2362EXPORT_SYMBOL(tcp_destroy_sock);
2363EXPORT_SYMBOL(tcp_disconnect);
2364EXPORT_SYMBOL(tcp_getsockopt);
2365EXPORT_SYMBOL(tcp_ioctl);
2366EXPORT_SYMBOL(tcp_poll);
2367EXPORT_SYMBOL(tcp_read_sock);
2368EXPORT_SYMBOL(tcp_recvmsg);
2369EXPORT_SYMBOL(tcp_sendmsg);
2370EXPORT_SYMBOL(tcp_sendpage);
2371EXPORT_SYMBOL(tcp_setsockopt);
2372EXPORT_SYMBOL(tcp_shutdown);
2373EXPORT_SYMBOL(tcp_statistics);
2374EXPORT_SYMBOL(tcp_timewait_cachep);
2375