tcp.c revision e7626486c3c4ce456b11a7944edf164ef76fc599
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed where wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up with retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK		our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
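/*
 * Illustrative user-space sketch (not part of this file): one way for an
 * application to observe the states described above is the TCP_INFO socket
 * option; this assumes the running kernel and the installed headers expose
 * TCP_INFO and struct tcp_info (the exact header to include can vary).
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* TCP_INFO, struct tcp_info */

static void print_tcp_state(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("tcpi_state=%u\n", info.tcpi_state);	/* e.g. TCP_ESTABLISHED */
}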
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep;
277
278atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280int sysctl_tcp_mem[3];
281int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283
284EXPORT_SYMBOL(sysctl_tcp_mem);
285EXPORT_SYMBOL(sysctl_tcp_rmem);
286EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288atomic_t tcp_memory_allocated;	/* Current allocated memory. */
289atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
290
291EXPORT_SYMBOL(tcp_memory_allocated);
292EXPORT_SYMBOL(tcp_sockets_allocated);
293
294/*
295 * Pressure flag: try to collapse.
296 * Technical note: it is used by multiple contexts non atomically.
297 * All of sk_stream_mem_schedule() is of this nature: accounting
298 * is strict, actions are advisory and have some latency.
299 */
300int tcp_memory_pressure;
301
302EXPORT_SYMBOL(tcp_memory_pressure);
303
304void tcp_enter_memory_pressure(void)
305{
306	if (!tcp_memory_pressure) {
307		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308		tcp_memory_pressure = 1;
309	}
310}
311
312EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314/*
315 * LISTEN is a special case for poll..
316 */
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318					       poll_table *wait)
319{
320	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321}
322
323/*
324 *	Wait for a TCP event.
325 *
326 *	Note that we don't need to lock the socket, as the upper poll layers
327 *	take care of normal races (between the test and the event) and we don't
328 *	go look at any of the socket buffers directly.
329 */
330unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331{
332	unsigned int mask;
333	struct sock *sk = sock->sk;
334	struct tcp_sock *tp = tcp_sk(sk);
335
336	poll_wait(file, sk->sk_sleep, wait);
337	if (sk->sk_state == TCP_LISTEN)
338		return tcp_listen_poll(sk, wait);
339
340	/* Socket is not locked. We are protected from async events
341	   by poll logic and correct handling of state changes
342	   made by other threads is impossible in any case.
343	 */
344
345	mask = 0;
346	if (sk->sk_err)
347		mask = POLLERR;
348
349	/*
350	 * POLLHUP is certainly not done right. But poll() doesn't
351	 * have a notion of HUP in just one direction, and for a
352	 * socket the read side is more interesting.
353	 *
354	 * Some poll() documentation says that POLLHUP is incompatible
355	 * with the POLLOUT/POLLWR flags, so somebody should check this
356	 * all. But careful, it tends to be safer to return too many
357	 * bits than too few, and you can easily break real applications
358	 * if you don't tell them that something has hung up!
359	 *
360	 * Check-me.
361	 *
362	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
363	 * our fs/select.c). It means that after we received EOF,
364	 * poll always returns immediately, making it impossible to poll() for
365	 * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
366	 * if and only if shutdown has been made in both directions.
367	 * Actually, it is interesting to look at how Solaris and DUX
368	 * solve this dilemma. I would prefer it if POLLHUP were maskable,
369	 * then we could set it on SND_SHUTDOWN. BTW the examples given
370	 * in Stevens' books assume exactly this behaviour, which explains
371	 * why POLLHUP is incompatible with POLLOUT.	--ANK
372	 *
373	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374	 * blocking on fresh not-connected or disconnected socket. --ANK
375	 */
376	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377		mask |= POLLHUP;
378	if (sk->sk_shutdown & RCV_SHUTDOWN)
379		mask |= POLLIN | POLLRDNORM;
380
381	/* Connected? */
382	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
383		/* Potential race condition. If the read of tp below
384		 * escapes above the read of sk->sk_state, we can be illegally
385		 * awakened in SYN_* states. */
386		if ((tp->rcv_nxt != tp->copied_seq) &&
387		    (tp->urg_seq != tp->copied_seq ||
388		     tp->rcv_nxt != tp->copied_seq + 1 ||
389		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390			mask |= POLLIN | POLLRDNORM;
391
392		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394				mask |= POLLOUT | POLLWRNORM;
395			} else {  /* send SIGIO later */
396				set_bit(SOCK_ASYNC_NOSPACE,
397					&sk->sk_socket->flags);
398				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399
400				/* Race breaker. If space is freed after
401				 * wspace test but before the flags are set,
402				 * IO signal will be lost.
403				 */
404				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405					mask |= POLLOUT | POLLWRNORM;
406			}
407		}
408
409		if (tp->urg_data & TCP_URG_VALID)
410			mask |= POLLPRI;
411	}
412	return mask;
413}
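/*
 * Illustrative user-space sketch (not part of this file): how the mask built
 * by tcp_poll() above is typically consumed.  POLLIN covers both data and a
 * pending FIN, POLLHUP is reported once both directions are shut down, and
 * POLLPRI signals urgent data.
 */
#include <poll.h>

static short wait_on_tcp_socket(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };

	/* POLLERR and POLLHUP are reported even if not requested in .events. */
	if (poll(&pfd, 1, timeout_ms) <= 0)
		return 0;		/* timed out (or poll() itself failed) */
	return pfd.revents;		/* POLLIN: data or FIN, POLLOUT: buffer
					 * space, POLLPRI: urgent byte pending,
					 * POLLHUP: both directions shut down */
}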
414
415int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416{
417	struct tcp_sock *tp = tcp_sk(sk);
418	int answ;
419
420	switch (cmd) {
421	case SIOCINQ:
422		if (sk->sk_state == TCP_LISTEN)
423			return -EINVAL;
424
425		lock_sock(sk);
426		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427			answ = 0;
428		else if (sock_flag(sk, SOCK_URGINLINE) ||
429			 !tp->urg_data ||
430			 before(tp->urg_seq, tp->copied_seq) ||
431			 !before(tp->urg_seq, tp->rcv_nxt)) {
432			answ = tp->rcv_nxt - tp->copied_seq;
433
434			/* Subtract 1, if FIN is in queue. */
435			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436				answ -=
437		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438		} else
439			answ = tp->urg_seq - tp->copied_seq;
440		release_sock(sk);
441		break;
442	case SIOCATMARK:
443		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444		break;
445	case SIOCOUTQ:
446		if (sk->sk_state == TCP_LISTEN)
447			return -EINVAL;
448
449		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450			answ = 0;
451		else
452			answ = tp->write_seq - tp->snd_una;
453		break;
454	default:
455		return -ENOIOCTLCMD;
456	};
457
458	return put_user(answ, (int __user *)arg);
459}
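/*
 * Illustrative user-space sketch (not part of this file): the three ioctls
 * handled above, as issued by an application.  SIOCINQ, SIOCOUTQ and
 * SIOCATMARK all come from <linux/sockios.h> on Linux; they are not
 * standardized elsewhere.
 */
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ, SIOCATMARK */

static void tcp_queue_sizes(int fd)
{
	int inq = 0, outq = 0, at_mark = 0;

	ioctl(fd, SIOCINQ, &inq);	/* bytes readable without blocking */
	ioctl(fd, SIOCOUTQ, &outq);	/* unsent + unacked bytes (write_seq - snd_una) */
	ioctl(fd, SIOCATMARK, &at_mark);/* non-zero if the next byte is the urgent mark */
}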
460
461
462int tcp_listen_start(struct sock *sk)
463{
464	struct inet_sock *inet = inet_sk(sk);
465	struct tcp_sock *tp = tcp_sk(sk);
466	struct tcp_listen_opt *lopt;
467
468	sk->sk_max_ack_backlog = 0;
469	sk->sk_ack_backlog = 0;
470	tp->accept_queue = tp->accept_queue_tail = NULL;
471	rwlock_init(&tp->syn_wait_lock);
472	tcp_delack_init(tp);
473
474	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475	if (!lopt)
476		return -ENOMEM;
477
478	memset(lopt, 0, sizeof(struct tcp_listen_opt));
479	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481			break;
482	get_random_bytes(&lopt->hash_rnd, 4);
483
484	write_lock_bh(&tp->syn_wait_lock);
485	tp->listen_opt = lopt;
486	write_unlock_bh(&tp->syn_wait_lock);
487
488	/* There is a race window here: we announce ourselves listening,
489	 * but this transition is still not validated by get_port().
490	 * It is OK, because this socket enters the hash table only
491	 * after validation is complete.
492	 */
493	sk->sk_state = TCP_LISTEN;
494	if (!sk->sk_prot->get_port(sk, inet->num)) {
495		inet->sport = htons(inet->num);
496
497		sk_dst_reset(sk);
498		sk->sk_prot->hash(sk);
499
500		return 0;
501	}
502
503	sk->sk_state = TCP_CLOSE;
504	write_lock_bh(&tp->syn_wait_lock);
505	tp->listen_opt = NULL;
506	write_unlock_bh(&tp->syn_wait_lock);
507	kfree(lopt);
508	return -EADDRINUSE;
509}
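/*
 * Illustrative sketch (not part of this file): the max_qlen_log loop above
 * just rounds sysctl_max_syn_backlog up to a power of two, with a floor of
 * 2^6 = 64, so queue-length checks can be done with shifts.  The helper name
 * below is made up for illustration.
 */
static unsigned int syn_backlog_qlen_log(unsigned int max_syn_backlog)
{
	unsigned int log = 6;			/* minimum: 64 slots */

	while ((1U << log) < max_syn_backlog)	/* same rounding as the for-loop */
		log++;
	return log;				/* becomes lopt->max_qlen_log */
}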
510
511/*
512 *	This routine closes sockets which have been at least partially
513 *	opened, but not yet accepted.
514 */
515
516static void tcp_listen_stop (struct sock *sk)
517{
518	struct tcp_sock *tp = tcp_sk(sk);
519	struct tcp_listen_opt *lopt = tp->listen_opt;
520	struct open_request *acc_req = tp->accept_queue;
521	struct open_request *req;
522	int i;
523
524	tcp_delete_keepalive_timer(sk);
525
526	/* make all the listen_opt local to us */
527	write_lock_bh(&tp->syn_wait_lock);
528	tp->listen_opt = NULL;
529	write_unlock_bh(&tp->syn_wait_lock);
530	tp->accept_queue = tp->accept_queue_tail = NULL;
531
532	if (lopt->qlen) {
533		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534			while ((req = lopt->syn_table[i]) != NULL) {
535				lopt->syn_table[i] = req->dl_next;
536				lopt->qlen--;
537				tcp_openreq_free(req);
538
539		/* Following the specs, it would be better either to send a FIN
540		 * (and enter FIN-WAIT-1, the normal close)
541		 * or to send an active reset (abort).
542		 * Certainly, it is pretty dangerous during a SYN flood, but that
543		 * is a bad justification for our negligence 8)
544		 * To be honest, we are not able to implement either
545		 * of the variants now.			--ANK
546		 */
547			}
548		}
549	}
550	BUG_TRAP(!lopt->qlen);
551
552	kfree(lopt);
553
554	while ((req = acc_req) != NULL) {
555		struct sock *child = req->sk;
556
557		acc_req = req->dl_next;
558
559		local_bh_disable();
560		bh_lock_sock(child);
561		BUG_TRAP(!sock_owned_by_user(child));
562		sock_hold(child);
563
564		tcp_disconnect(child, O_NONBLOCK);
565
566		sock_orphan(child);
567
568		atomic_inc(&tcp_orphan_count);
569
570		tcp_destroy_sock(child);
571
572		bh_unlock_sock(child);
573		local_bh_enable();
574		sock_put(child);
575
576		sk_acceptq_removed(sk);
577		tcp_openreq_fastfree(req);
578	}
579	BUG_TRAP(!sk->sk_ack_backlog);
580}
581
582static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
583{
584	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
585	tp->pushed_seq = tp->write_seq;
586}
587
588static inline int forced_push(struct tcp_sock *tp)
589{
590	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
591}
592
593static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
594			      struct sk_buff *skb)
595{
596	skb->csum = 0;
597	TCP_SKB_CB(skb)->seq = tp->write_seq;
598	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
599	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
600	TCP_SKB_CB(skb)->sacked = 0;
601	skb_header_release(skb);
602	__skb_queue_tail(&sk->sk_write_queue, skb);
603	sk_charge_skb(sk, skb);
604	if (!sk->sk_send_head)
605		sk->sk_send_head = skb;
606	else if (tp->nonagle&TCP_NAGLE_PUSH)
607		tp->nonagle &= ~TCP_NAGLE_PUSH;
608}
609
610static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
611				struct sk_buff *skb)
612{
613	if (flags & MSG_OOB) {
614		tp->urg_mode = 1;
615		tp->snd_up = tp->write_seq;
616		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
617	}
618}
619
620static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
621			    int mss_now, int nonagle)
622{
623	if (sk->sk_send_head) {
624		struct sk_buff *skb = sk->sk_write_queue.prev;
625		if (!(flags & MSG_MORE) || forced_push(tp))
626			tcp_mark_push(tp, skb);
627		tcp_mark_urg(tp, flags, skb);
628		__tcp_push_pending_frames(sk, tp, mss_now,
629					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
630	}
631}
632
633static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
634			 size_t psize, int flags)
635{
636	struct tcp_sock *tp = tcp_sk(sk);
637	int mss_now;
638	int err;
639	ssize_t copied;
640	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
641
642	/* Wait for a connection to finish. */
643	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
644		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
645			goto out_err;
646
647	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
648
649	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
650	copied = 0;
651
652	err = -EPIPE;
653	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
654		goto do_error;
655
656	while (psize > 0) {
657		struct sk_buff *skb = sk->sk_write_queue.prev;
658		struct page *page = pages[poffset / PAGE_SIZE];
659		int copy, i, can_coalesce;
660		int offset = poffset % PAGE_SIZE;
661		int size = min_t(size_t, psize, PAGE_SIZE - offset);
662
663		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
664new_segment:
665			if (!sk_stream_memory_free(sk))
666				goto wait_for_sndbuf;
667
668			skb = sk_stream_alloc_pskb(sk, 0, 0,
669						   sk->sk_allocation);
670			if (!skb)
671				goto wait_for_memory;
672
673			skb_entail(sk, tp, skb);
674			copy = mss_now;
675		}
676
677		if (copy > size)
678			copy = size;
679
680		i = skb_shinfo(skb)->nr_frags;
681		can_coalesce = skb_can_coalesce(skb, i, page, offset);
682		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
683			tcp_mark_push(tp, skb);
684			goto new_segment;
685		}
686		if (sk->sk_forward_alloc < copy &&
687		    !sk_stream_mem_schedule(sk, copy, 0))
688			goto wait_for_memory;
689
690		if (can_coalesce) {
691			skb_shinfo(skb)->frags[i - 1].size += copy;
692		} else {
693			get_page(page);
694			skb_fill_page_desc(skb, i, page, offset, copy);
695		}
696
697		skb->len += copy;
698		skb->data_len += copy;
699		skb->truesize += copy;
700		sk->sk_wmem_queued += copy;
701		sk->sk_forward_alloc -= copy;
702		skb->ip_summed = CHECKSUM_HW;
703		tp->write_seq += copy;
704		TCP_SKB_CB(skb)->end_seq += copy;
705		skb_shinfo(skb)->tso_segs = 0;
706
707		if (!copied)
708			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
709
710		copied += copy;
711		poffset += copy;
712		if (!(psize -= copy))
713			goto out;
714
715		if (skb->len != mss_now || (flags & MSG_OOB))
716			continue;
717
718		if (forced_push(tp)) {
719			tcp_mark_push(tp, skb);
720			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
721		} else if (skb == sk->sk_send_head)
722			tcp_push_one(sk, mss_now);
723		continue;
724
725wait_for_sndbuf:
726		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
727wait_for_memory:
728		if (copied)
729			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
730
731		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
732			goto do_error;
733
734		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
735	}
736
737out:
738	if (copied)
739		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
740	return copied;
741
742do_error:
743	if (copied)
744		goto out;
745out_err:
746	return sk_stream_error(sk, flags, err);
747}
748
749ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
750		     size_t size, int flags)
751{
752	ssize_t res;
753	struct sock *sk = sock->sk;
754
755#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
756
757	if (!(sk->sk_route_caps & NETIF_F_SG) ||
758	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
759		return sock_no_sendpage(sock, page, offset, size, flags);
760
761#undef TCP_ZC_CSUM_FLAGS
762
763	lock_sock(sk);
764	TCP_CHECK_TIMER(sk);
765	res = do_tcp_sendpages(sk, &page, offset, size, flags);
766	TCP_CHECK_TIMER(sk);
767	release_sock(sk);
768	return res;
769}
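/*
 * Illustrative user-space sketch (not part of this file): do_tcp_sendpages()
 * is reached via sendfile(2) on a TCP socket when the route supports
 * scatter-gather and checksum offload; otherwise tcp_sendpage() above falls
 * back to sock_no_sendpage().
 */
#include <sys/sendfile.h>

static off_t send_file_over_tcp(int sock_fd, int file_fd, size_t count)
{
	off_t offset = 0;
	ssize_t sent;

	/* sendfile() may send less than asked for; loop until done or error. */
	while (count > 0 && (sent = sendfile(sock_fd, file_fd, &offset, count)) > 0)
		count -= sent;
	return offset;		/* bytes consumed from the file so far */
}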
770
771#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
772#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
773
774static inline int select_size(struct sock *sk, struct tcp_sock *tp)
775{
776	int tmp = tp->mss_cache_std;
777
778	if (sk->sk_route_caps & NETIF_F_SG) {
779		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
780
781		if (tmp >= pgbreak &&
782		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
783			tmp = pgbreak;
784	}
785	return tmp;
786}
787
788int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
789		size_t size)
790{
791	struct iovec *iov;
792	struct tcp_sock *tp = tcp_sk(sk);
793	struct sk_buff *skb;
794	int iovlen, flags;
795	int mss_now;
796	int err, copied;
797	long timeo;
798
799	lock_sock(sk);
800	TCP_CHECK_TIMER(sk);
801
802	flags = msg->msg_flags;
803	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
804
805	/* Wait for a connection to finish. */
806	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
807		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
808			goto out_err;
809
810	/* This should be in poll */
811	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
812
813	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
814
815	/* Ok commence sending. */
816	iovlen = msg->msg_iovlen;
817	iov = msg->msg_iov;
818	copied = 0;
819
820	err = -EPIPE;
821	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
822		goto do_error;
823
824	while (--iovlen >= 0) {
825		int seglen = iov->iov_len;
826		unsigned char __user *from = iov->iov_base;
827
828		iov++;
829
830		while (seglen > 0) {
831			int copy;
832
833			skb = sk->sk_write_queue.prev;
834
835			if (!sk->sk_send_head ||
836			    (copy = mss_now - skb->len) <= 0) {
837
838new_segment:
839				/* Allocate new segment. If the interface is SG,
840				 * allocate skb fitting to single page.
841				 */
842				if (!sk_stream_memory_free(sk))
843					goto wait_for_sndbuf;
844
845				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
846							   0, sk->sk_allocation);
847				if (!skb)
848					goto wait_for_memory;
849
850				/*
851				 * Check whether we can use HW checksum.
852				 */
853				if (sk->sk_route_caps &
854				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
855				     NETIF_F_HW_CSUM))
856					skb->ip_summed = CHECKSUM_HW;
857
858				skb_entail(sk, tp, skb);
859				copy = mss_now;
860			}
861
862			/* Try to append data to the end of skb. */
863			if (copy > seglen)
864				copy = seglen;
865
866			/* Where to copy to? */
867			if (skb_tailroom(skb) > 0) {
868				/* We have some space in skb head. Superb! */
869				if (copy > skb_tailroom(skb))
870					copy = skb_tailroom(skb);
871				if ((err = skb_add_data(skb, from, copy)) != 0)
872					goto do_fault;
873			} else {
874				int merge = 0;
875				int i = skb_shinfo(skb)->nr_frags;
876				struct page *page = TCP_PAGE(sk);
877				int off = TCP_OFF(sk);
878
879				if (skb_can_coalesce(skb, i, page, off) &&
880				    off != PAGE_SIZE) {
881					/* We can extend the last page
882					 * fragment. */
883					merge = 1;
884				} else if (i == MAX_SKB_FRAGS ||
885					   (!i &&
886					   !(sk->sk_route_caps & NETIF_F_SG))) {
887					/* Need to add new fragment and cannot
888					 * do this because interface is non-SG,
889					 * or because all the page slots are
890					 * busy. */
891					tcp_mark_push(tp, skb);
892					goto new_segment;
893				} else if (page) {
894					/* If page is cached, align
895					 * offset to L1 cache boundary
896					 */
897					off = (off + L1_CACHE_BYTES - 1) &
898					      ~(L1_CACHE_BYTES - 1);
899					if (off == PAGE_SIZE) {
900						put_page(page);
901						TCP_PAGE(sk) = page = NULL;
902					}
903				}
904
905				if (!page) {
906					/* Allocate new cache page. */
907					if (!(page = sk_stream_alloc_page(sk)))
908						goto wait_for_memory;
909					off = 0;
910				}
911
912				if (copy > PAGE_SIZE - off)
913					copy = PAGE_SIZE - off;
914
915				/* Time to copy data. We are close to
916				 * the end! */
917				err = skb_copy_to_page(sk, from, skb, page,
918						       off, copy);
919				if (err) {
920					/* If this page was new, give it to the
921					 * socket so it does not get leaked.
922					 */
923					if (!TCP_PAGE(sk)) {
924						TCP_PAGE(sk) = page;
925						TCP_OFF(sk) = 0;
926					}
927					goto do_error;
928				}
929
930				/* Update the skb. */
931				if (merge) {
932					skb_shinfo(skb)->frags[i - 1].size +=
933									copy;
934				} else {
935					skb_fill_page_desc(skb, i, page, off, copy);
936					if (TCP_PAGE(sk)) {
937						get_page(page);
938					} else if (off + copy < PAGE_SIZE) {
939						get_page(page);
940						TCP_PAGE(sk) = page;
941					}
942				}
943
944				TCP_OFF(sk) = off + copy;
945			}
946
947			if (!copied)
948				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
949
950			tp->write_seq += copy;
951			TCP_SKB_CB(skb)->end_seq += copy;
952			skb_shinfo(skb)->tso_segs = 0;
953
954			from += copy;
955			copied += copy;
956			if ((seglen -= copy) == 0 && iovlen == 0)
957				goto out;
958
959			if (skb->len != mss_now || (flags & MSG_OOB))
960				continue;
961
962			if (forced_push(tp)) {
963				tcp_mark_push(tp, skb);
964				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
965			} else if (skb == sk->sk_send_head)
966				tcp_push_one(sk, mss_now);
967			continue;
968
969wait_for_sndbuf:
970			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
971wait_for_memory:
972			if (copied)
973				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
974
975			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
976				goto do_error;
977
978			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
979		}
980	}
981
982out:
983	if (copied)
984		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
985	TCP_CHECK_TIMER(sk);
986	release_sock(sk);
987	return copied;
988
989do_fault:
990	if (!skb->len) {
991		if (sk->sk_send_head == skb)
992			sk->sk_send_head = NULL;
993		__skb_unlink(skb, skb->list);
994		sk_stream_free_skb(sk, skb);
995	}
996
997do_error:
998	if (copied)
999		goto out;
1000out_err:
1001	err = sk_stream_error(sk, flags, err);
1002	TCP_CHECK_TIMER(sk);
1003	release_sock(sk);
1004	return err;
1005}
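/*
 * Illustrative user-space sketch (not part of this file): the MSG_MORE
 * handling in tcp_push() above lets an application batch a header and a body
 * into as few segments as possible without toggling TCP_CORK.  Error handling
 * is omitted for brevity.
 */
#include <sys/socket.h>

static void send_header_then_body(int fd, const void *hdr, size_t hlen,
				  const void *body, size_t blen)
{
	send(fd, hdr, hlen, MSG_MORE);	/* hold back: more data is coming */
	send(fd, body, blen, 0);	/* completes the message, pushes frames */
}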
1006
1007/*
1008 *	Handle reading urgent data. BSD has very simple semantics for
1009 *	this, no blocking and very strange errors 8)
1010 */
1011
1012static int tcp_recv_urg(struct sock *sk, long timeo,
1013			struct msghdr *msg, int len, int flags,
1014			int *addr_len)
1015{
1016	struct tcp_sock *tp = tcp_sk(sk);
1017
1018	/* No URG data to read. */
1019	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1020	    tp->urg_data == TCP_URG_READ)
1021		return -EINVAL;	/* Yes this is right ! */
1022
1023	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1024		return -ENOTCONN;
1025
1026	if (tp->urg_data & TCP_URG_VALID) {
1027		int err = 0;
1028		char c = tp->urg_data;
1029
1030		if (!(flags & MSG_PEEK))
1031			tp->urg_data = TCP_URG_READ;
1032
1033		/* Read urgent data. */
1034		msg->msg_flags |= MSG_OOB;
1035
1036		if (len > 0) {
1037			if (!(flags & MSG_TRUNC))
1038				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1039			len = 1;
1040		} else
1041			msg->msg_flags |= MSG_TRUNC;
1042
1043		return err ? -EFAULT : len;
1044	}
1045
1046	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1047		return 0;
1048
1049	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1050	 * the available implementations agree in this case:
1051	 * this call should never block, independent of the
1052	 * blocking state of the socket.
1053	 * Mike <pall@rz.uni-karlsruhe.de>
1054	 */
1055	return -EAGAIN;
1056}
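/*
 * Illustrative user-space sketch (not part of this file): reading the single
 * byte of urgent data handled by tcp_recv_urg() above.  As the comment notes,
 * the call never blocks; with SO_OOBINLINE set it fails with EINVAL instead.
 */
#include <sys/socket.h>

static ssize_t read_urgent_byte(int fd, char *out)
{
	/* 1 on success, 0 at EOF, -1 with errno EAGAIN/EINVAL/... otherwise. */
	return recv(fd, out, 1, MSG_OOB);
}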
1057
1058/* Clean up the receive buffer for full frames taken by the user,
1059 * then send an ACK if necessary.  COPIED is the number of bytes
1060 * tcp_recvmsg has given to the user so far, it speeds up the
1061 * calculation of whether or not we must ACK for the sake of
1062 * a window update.
1063 */
1064static void cleanup_rbuf(struct sock *sk, int copied)
1065{
1066	struct tcp_sock *tp = tcp_sk(sk);
1067	int time_to_ack = 0;
1068
1069#if TCP_DEBUG
1070	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1071
1072	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1073#endif
1074
1075	if (tcp_ack_scheduled(tp)) {
1076		   /* Delayed ACKs frequently hit locked sockets during bulk
1077		    * receive. */
1078		if (tp->ack.blocked ||
1079		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1080		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1081		    /*
1082		     * We send an ACK if this read emptied the receive buffer,
1083		     * the connection is not bidirectional, the user drained the
1084		     * receive buffer, and there was a small segment
1085		     * in the queue.
1086		     */
1087		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1088		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1089			time_to_ack = 1;
1090	}
1091
1092	/* We send an ACK if we can now advertise a non-zero window
1093	 * which has been raised "significantly".
1094	 *
1095	 * Even if window raised up to infinity, do not send window open ACK
1096	 * in states, where we will not receive more. It is useless.
1097	 */
1098	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1099		__u32 rcv_window_now = tcp_receive_window(tp);
1100
1101		/* Optimize, __tcp_select_window() is not cheap. */
1102		if (2*rcv_window_now <= tp->window_clamp) {
1103			__u32 new_window = __tcp_select_window(sk);
1104
1105			/* Send an ACK now if this read freed lots of space
1106			 * in our buffer. We can advertise the newly computed window
1107			 * now, provided it is not smaller than the current one.
1108			 * "Lots" means "at least twice" here.
1109			 */
1110			if (new_window && new_window >= 2 * rcv_window_now)
1111				time_to_ack = 1;
1112		}
1113	}
1114	if (time_to_ack)
1115		tcp_send_ack(sk);
1116}
1117
1118static void tcp_prequeue_process(struct sock *sk)
1119{
1120	struct sk_buff *skb;
1121	struct tcp_sock *tp = tcp_sk(sk);
1122
1123	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1124
1125	/* RX process wants to run with disabled BHs, though it is not
1126	 * necessary */
1127	local_bh_disable();
1128	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1129		sk->sk_backlog_rcv(sk, skb);
1130	local_bh_enable();
1131
1132	/* Clear memory counter. */
1133	tp->ucopy.memory = 0;
1134}
1135
1136static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1137{
1138	struct sk_buff *skb;
1139	u32 offset;
1140
1141	skb_queue_walk(&sk->sk_receive_queue, skb) {
1142		offset = seq - TCP_SKB_CB(skb)->seq;
1143		if (skb->h.th->syn)
1144			offset--;
1145		if (offset < skb->len || skb->h.th->fin) {
1146			*off = offset;
1147			return skb;
1148		}
1149	}
1150	return NULL;
1151}
1152
1153/*
1154 * This routine provides an alternative to tcp_recvmsg() for routines
1155 * that would like to handle copying from skbuffs directly in 'sendfile'
1156 * fashion.
1157 * Note:
1158 *	- It is assumed that the socket was locked by the caller.
1159 *	- The routine does not block.
1160 *	- At present, there is no support for reading OOB data
1161 *	  or for 'peeking' the socket using this routine
1162 *	  (although both would be easy to implement).
1163 */
1164int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1165		  sk_read_actor_t recv_actor)
1166{
1167	struct sk_buff *skb;
1168	struct tcp_sock *tp = tcp_sk(sk);
1169	u32 seq = tp->copied_seq;
1170	u32 offset;
1171	int copied = 0;
1172
1173	if (sk->sk_state == TCP_LISTEN)
1174		return -ENOTCONN;
1175	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1176		if (offset < skb->len) {
1177			size_t used, len;
1178
1179			len = skb->len - offset;
1180			/* Stop reading if we hit a patch of urgent data */
1181			if (tp->urg_data) {
1182				u32 urg_offset = tp->urg_seq - seq;
1183				if (urg_offset < len)
1184					len = urg_offset;
1185				if (!len)
1186					break;
1187			}
1188			used = recv_actor(desc, skb, offset, len);
1189			if (used <= len) {
1190				seq += used;
1191				copied += used;
1192				offset += used;
1193			}
1194			if (offset != skb->len)
1195				break;
1196		}
1197		if (skb->h.th->fin) {
1198			sk_eat_skb(sk, skb);
1199			++seq;
1200			break;
1201		}
1202		sk_eat_skb(sk, skb);
1203		if (!desc->count)
1204			break;
1205	}
1206	tp->copied_seq = seq;
1207
1208	tcp_rcv_space_adjust(sk);
1209
1210	/* Clean up data we have read: This will do ACK frames. */
1211	if (copied)
1212		cleanup_rbuf(sk, copied);
1213	return copied;
1214}
1215
1216/*
1217 *	This routine copies from a sock struct into the user buffer.
1218 *
1219 *	Technical note: in 2.3 we work on _locked_ socket, so that
1220 *	tricks with *seq access order and skb->users are not required.
1221 *	Probably, code can be easily improved even more.
1222 */
1223
1224int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1225		size_t len, int nonblock, int flags, int *addr_len)
1226{
1227	struct tcp_sock *tp = tcp_sk(sk);
1228	int copied = 0;
1229	u32 peek_seq;
1230	u32 *seq;
1231	unsigned long used;
1232	int err;
1233	int target;		/* Read at least this many bytes */
1234	long timeo;
1235	struct task_struct *user_recv = NULL;
1236
1237	lock_sock(sk);
1238
1239	TCP_CHECK_TIMER(sk);
1240
1241	err = -ENOTCONN;
1242	if (sk->sk_state == TCP_LISTEN)
1243		goto out;
1244
1245	timeo = sock_rcvtimeo(sk, nonblock);
1246
1247	/* Urgent data needs to be handled specially. */
1248	if (flags & MSG_OOB)
1249		goto recv_urg;
1250
1251	seq = &tp->copied_seq;
1252	if (flags & MSG_PEEK) {
1253		peek_seq = tp->copied_seq;
1254		seq = &peek_seq;
1255	}
1256
1257	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1258
1259	do {
1260		struct sk_buff *skb;
1261		u32 offset;
1262
1263		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1264		if (tp->urg_data && tp->urg_seq == *seq) {
1265			if (copied)
1266				break;
1267			if (signal_pending(current)) {
1268				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1269				break;
1270			}
1271		}
1272
1273		/* Next get a buffer. */
1274
1275		skb = skb_peek(&sk->sk_receive_queue);
1276		do {
1277			if (!skb)
1278				break;
1279
1280			/* Now that we have two receive queues this
1281			 * shouldn't happen.
1282			 */
1283			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1284				printk(KERN_INFO "recvmsg bug: copied %X "
1285				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1286				break;
1287			}
1288			offset = *seq - TCP_SKB_CB(skb)->seq;
1289			if (skb->h.th->syn)
1290				offset--;
1291			if (offset < skb->len)
1292				goto found_ok_skb;
1293			if (skb->h.th->fin)
1294				goto found_fin_ok;
1295			BUG_TRAP(flags & MSG_PEEK);
1296			skb = skb->next;
1297		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1298
1299		/* Well, if we have backlog, try to process it now. */
1300
1301		if (copied >= target && !sk->sk_backlog.tail)
1302			break;
1303
1304		if (copied) {
1305			if (sk->sk_err ||
1306			    sk->sk_state == TCP_CLOSE ||
1307			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1308			    !timeo ||
1309			    signal_pending(current) ||
1310			    (flags & MSG_PEEK))
1311				break;
1312		} else {
1313			if (sock_flag(sk, SOCK_DONE))
1314				break;
1315
1316			if (sk->sk_err) {
1317				copied = sock_error(sk);
1318				break;
1319			}
1320
1321			if (sk->sk_shutdown & RCV_SHUTDOWN)
1322				break;
1323
1324			if (sk->sk_state == TCP_CLOSE) {
1325				if (!sock_flag(sk, SOCK_DONE)) {
1326					/* This occurs when user tries to read
1327					 * from a never-connected socket.
1328					 */
1329					copied = -ENOTCONN;
1330					break;
1331				}
1332				break;
1333			}
1334
1335			if (!timeo) {
1336				copied = -EAGAIN;
1337				break;
1338			}
1339
1340			if (signal_pending(current)) {
1341				copied = sock_intr_errno(timeo);
1342				break;
1343			}
1344		}
1345
1346		cleanup_rbuf(sk, copied);
1347
1348		if (tp->ucopy.task == user_recv) {
1349			/* Install new reader */
1350			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351				user_recv = current;
1352				tp->ucopy.task = user_recv;
1353				tp->ucopy.iov = msg->msg_iov;
1354			}
1355
1356			tp->ucopy.len = len;
1357
1358			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1359				 (flags & (MSG_PEEK | MSG_TRUNC)));
1360
1361			/* Ugly... If the prequeue is not empty, we have to
1362			 * process it before releasing the socket, otherwise
1363			 * ordering will be broken on the second iteration.
1364			 * A more elegant solution is required!!!
1365			 *
1366			 * Look: we have the following (pseudo)queues:
1367			 *
1368			 * 1. packets in flight
1369			 * 2. backlog
1370			 * 3. prequeue
1371			 * 4. receive_queue
1372			 *
1373			 * Each queue can be processed only if the next ones
1374			 * are empty. At this point we have empty receive_queue.
1375			 * But prequeue _can_ be not empty after 2nd iteration,
1376			 * when we jumped to start of loop because backlog
1377			 * processing added something to receive_queue.
1378			 * We cannot release_sock(), because backlog contains
1379			 * packets arrived _after_ prequeued ones.
1380			 *
1381			 * In short, the algorithm is clear --- process all
1382			 * the queues in order. We could do it more directly,
1383			 * requeueing packets from the backlog to the prequeue if
1384			 * it is not empty. It is more elegant, but eats cycles,
1385			 * unfortunately.
1386			 */
1387			if (skb_queue_len(&tp->ucopy.prequeue))
1388				goto do_prequeue;
1389
1390			/* __ Set realtime policy in scheduler __ */
1391		}
1392
1393		if (copied >= target) {
1394			/* Do not sleep, just process backlog. */
1395			release_sock(sk);
1396			lock_sock(sk);
1397		} else
1398			sk_wait_data(sk, &timeo);
1399
1400		if (user_recv) {
1401			int chunk;
1402
1403			/* __ Restore normal policy in scheduler __ */
1404
1405			if ((chunk = len - tp->ucopy.len) != 0) {
1406				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1407				len -= chunk;
1408				copied += chunk;
1409			}
1410
1411			if (tp->rcv_nxt == tp->copied_seq &&
1412			    skb_queue_len(&tp->ucopy.prequeue)) {
1413do_prequeue:
1414				tcp_prequeue_process(sk);
1415
1416				if ((chunk = len - tp->ucopy.len) != 0) {
1417					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1418					len -= chunk;
1419					copied += chunk;
1420				}
1421			}
1422		}
1423		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1424			if (net_ratelimit())
1425				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1426				       current->comm, current->pid);
1427			peek_seq = tp->copied_seq;
1428		}
1429		continue;
1430
1431	found_ok_skb:
1432		/* Ok so how much can we use? */
1433		used = skb->len - offset;
1434		if (len < used)
1435			used = len;
1436
1437		/* Do we have urgent data here? */
1438		if (tp->urg_data) {
1439			u32 urg_offset = tp->urg_seq - *seq;
1440			if (urg_offset < used) {
1441				if (!urg_offset) {
1442					if (!sock_flag(sk, SOCK_URGINLINE)) {
1443						++*seq;
1444						offset++;
1445						used--;
1446						if (!used)
1447							goto skip_copy;
1448					}
1449				} else
1450					used = urg_offset;
1451			}
1452		}
1453
1454		if (!(flags & MSG_TRUNC)) {
1455			err = skb_copy_datagram_iovec(skb, offset,
1456						      msg->msg_iov, used);
1457			if (err) {
1458				/* Exception. Bailout! */
1459				if (!copied)
1460					copied = -EFAULT;
1461				break;
1462			}
1463		}
1464
1465		*seq += used;
1466		copied += used;
1467		len -= used;
1468
1469		tcp_rcv_space_adjust(sk);
1470
1471skip_copy:
1472		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1473			tp->urg_data = 0;
1474			tcp_fast_path_check(sk, tp);
1475		}
1476		if (used + offset < skb->len)
1477			continue;
1478
1479		if (skb->h.th->fin)
1480			goto found_fin_ok;
1481		if (!(flags & MSG_PEEK))
1482			sk_eat_skb(sk, skb);
1483		continue;
1484
1485	found_fin_ok:
1486		/* Process the FIN. */
1487		++*seq;
1488		if (!(flags & MSG_PEEK))
1489			sk_eat_skb(sk, skb);
1490		break;
1491	} while (len > 0);
1492
1493	if (user_recv) {
1494		if (skb_queue_len(&tp->ucopy.prequeue)) {
1495			int chunk;
1496
1497			tp->ucopy.len = copied > 0 ? len : 0;
1498
1499			tcp_prequeue_process(sk);
1500
1501			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1502				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1503				len -= chunk;
1504				copied += chunk;
1505			}
1506		}
1507
1508		tp->ucopy.task = NULL;
1509		tp->ucopy.len = 0;
1510	}
1511
1512	/* According to UNIX98, msg_name/msg_namelen are ignored
1513	 * on a connected socket. I was just happy when I found this 8) --ANK
1514	 */
1515
1516	/* Clean up data we have read: This will do ACK frames. */
1517	cleanup_rbuf(sk, copied);
1518
1519	TCP_CHECK_TIMER(sk);
1520	release_sock(sk);
1521	return copied;
1522
1523out:
1524	TCP_CHECK_TIMER(sk);
1525	release_sock(sk);
1526	return err;
1527
1528recv_urg:
1529	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1530	goto out;
1531}
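/*
 * Illustrative user-space sketch (not part of this file): the "target" logic
 * in tcp_recvmsg() above means MSG_WAITALL keeps waiting for the full length
 * (barring errors, signals and EOF), while MSG_PEEK copies data without
 * advancing copied_seq.
 */
#include <sys/socket.h>

static ssize_t peek_then_read(int fd, void *buf, size_t len)
{
	ssize_t n;

	n = recv(fd, buf, len, MSG_PEEK);	/* look without consuming */
	if (n <= 0)
		return n;
	return recv(fd, buf, (size_t)n, MSG_WAITALL);	/* now consume it all */
}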
1532
1533/*
1534 *	State processing on a close. This implements the state shift for
1535 *	sending our FIN frame. Note that we only send a FIN for some
1536 *	states. A shutdown() may have already sent the FIN, or we may be
1537 *	closed.
1538 */
1539
1540static unsigned char new_state[16] = {
1541  /* current state:        new state:      action:	*/
1542  /* (Invalid)		*/ TCP_CLOSE,
1543  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1544  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1545  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1546  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1547  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1548  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1549  /* TCP_CLOSE		*/ TCP_CLOSE,
1550  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1551  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1552  /* TCP_LISTEN		*/ TCP_CLOSE,
1553  /* TCP_CLOSING	*/ TCP_CLOSING,
1554};
1555
1556static int tcp_close_state(struct sock *sk)
1557{
1558	int next = (int)new_state[sk->sk_state];
1559	int ns = next & TCP_STATE_MASK;
1560
1561	tcp_set_state(sk, ns);
1562
1563	return next & TCP_ACTION_FIN;
1564}
1565
1566/*
1567 *	Shutdown the sending side of a connection. Much like close except
1568 *	that we don't shut down the receive side or mark the socket SOCK_DEAD.
1569 */
1570
1571void tcp_shutdown(struct sock *sk, int how)
1572{
1573	/*	We need to grab some memory, and put together a FIN,
1574	 *	and then put it into the queue to be sent.
1575	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1576	 */
1577	if (!(how & SEND_SHUTDOWN))
1578		return;
1579
1580	/* If we've already sent a FIN, or it's a closed state, skip this. */
1581	if ((1 << sk->sk_state) &
1582	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1583	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1584		/* Clear out any half completed packets.  FIN if needed. */
1585		if (tcp_close_state(sk))
1586			tcp_send_fin(sk);
1587	}
1588}
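/*
 * Illustrative user-space sketch (not part of this file): a half-close with
 * shutdown(SHUT_WR) sends the FIN via tcp_shutdown() above while leaving the
 * receive side open, so the peer's remaining data (and its FIN) can still be
 * read.
 */
#include <sys/socket.h>
#include <unistd.h>

static void half_close_and_drain(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);			/* we are done sending */
	while (read(fd, buf, sizeof(buf)) > 0)	/* drain until the peer's FIN */
		;
	close(fd);
}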
1589
1590/*
1591 * At this point, there should be no process reference to this
1592 * socket, and thus no user references at all.  Therefore we
1593 * can assume the socket waitqueue is inactive and nobody will
1594 * try to jump onto it.
1595 */
1596void tcp_destroy_sock(struct sock *sk)
1597{
1598	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1599	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1600
1601	/* It cannot be in hash table! */
1602	BUG_TRAP(sk_unhashed(sk));
1603
1604	/* If it has a non-zero inet_sk(sk)->num, it must be bound. */
1605	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1606
1607	sk->sk_prot->destroy(sk);
1608
1609	sk_stream_kill_queues(sk);
1610
1611	xfrm_sk_free_policy(sk);
1612
1613#ifdef INET_REFCNT_DEBUG
1614	if (atomic_read(&sk->sk_refcnt) != 1) {
1615		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1616		       sk, atomic_read(&sk->sk_refcnt));
1617	}
1618#endif
1619
1620	atomic_dec(&tcp_orphan_count);
1621	sock_put(sk);
1622}
1623
1624void tcp_close(struct sock *sk, long timeout)
1625{
1626	struct sk_buff *skb;
1627	int data_was_unread = 0;
1628
1629	lock_sock(sk);
1630	sk->sk_shutdown = SHUTDOWN_MASK;
1631
1632	if (sk->sk_state == TCP_LISTEN) {
1633		tcp_set_state(sk, TCP_CLOSE);
1634
1635		/* Special case. */
1636		tcp_listen_stop(sk);
1637
1638		goto adjudge_to_death;
1639	}
1640
1641	/*  We need to flush the recv. buffs.  We do this only on the
1642	 *  descriptor close, not protocol-sourced closes, because the
1643	 *  reader process may not have drained the data yet!
1644	 */
1645	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1646		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1647			  skb->h.th->fin;
1648		data_was_unread += len;
1649		__kfree_skb(skb);
1650	}
1651
1652	sk_stream_mem_reclaim(sk);
1653
1654	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1655	 * 3.10, we send a RST here because data was lost.  To
1656	 * witness the awful effects of the old behavior of always
1657	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1658	 * a bulk GET in an FTP client, suspend the process, wait
1659	 * for the client to advertise a zero window, then kill -9
1660	 * the FTP client, wheee...  Note: timeout is always zero
1661	 * in such a case.
1662	 */
1663	if (data_was_unread) {
1664		/* Unread data was tossed, zap the connection. */
1665		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1666		tcp_set_state(sk, TCP_CLOSE);
1667		tcp_send_active_reset(sk, GFP_KERNEL);
1668	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1669		/* Check zero linger _after_ checking for unread data. */
1670		sk->sk_prot->disconnect(sk, 0);
1671		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1672	} else if (tcp_close_state(sk)) {
1673		/* We FIN if the application ate all the data before
1674		 * zapping the connection.
1675		 */
1676
1677		/* RED-PEN. Formally speaking, we have broken TCP state
1678		 * machine. State transitions:
1679		 *
1680		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1681		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1682		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1683		 *
1684		 * are legal only when FIN has been sent (i.e. in window),
1685		 * rather than queued out of window. Purists may complain.
1686		 *
1687		 * F.e. "RFC state" is ESTABLISHED,
1688		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1689		 *
1690		 * The visible deviations are that we sometimes
1691		 * enter the time-wait state when it is not really required
1692		 * (harmless), and do not send active resets when the specs
1693		 * require them (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1694		 * they look like CLOSING or LAST_ACK to Linux).
1695		 * Probably, I missed some more holelets.
1696		 * 						--ANK
1697		 */
1698		tcp_send_fin(sk);
1699	}
1700
1701	sk_stream_wait_close(sk, timeout);
1702
1703adjudge_to_death:
1704	/* It is the last release_sock in its life. It will remove backlog. */
1705	release_sock(sk);
1706
1707
1708	/* Now socket is owned by kernel and we acquire BH lock
1709	   to finish close. No need to check for user refs.
1710	 */
1711	local_bh_disable();
1712	bh_lock_sock(sk);
1713	BUG_TRAP(!sock_owned_by_user(sk));
1714
1715	sock_hold(sk);
1716	sock_orphan(sk);
1717
1718	/*	This is a (useful) BSD violation of the RFC. There is a
1719	 *	problem with TCP as specified, in that the other end could
1720	 *	keep a socket open forever with no application left on this end.
1721	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1722	 *	our end. If they send after that then tough - BUT: long enough
1723	 *	that we won't make the old 4*rto = almost no time - whoops
1724	 *	reset mistake.
1725	 *
1726	 *	Nope, it was not a mistake. It is really the desired behaviour
1727	 *	f.e. on http servers, when such sockets are useless, but
1728	 *	consume significant resources. Let's do it with special
1729	 *	linger2	option.					--ANK
1730	 */
1731
1732	if (sk->sk_state == TCP_FIN_WAIT2) {
1733		struct tcp_sock *tp = tcp_sk(sk);
1734		if (tp->linger2 < 0) {
1735			tcp_set_state(sk, TCP_CLOSE);
1736			tcp_send_active_reset(sk, GFP_ATOMIC);
1737			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1738		} else {
1739			int tmo = tcp_fin_time(tp);
1740
1741			if (tmo > TCP_TIMEWAIT_LEN) {
1742				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1743			} else {
1744				atomic_inc(&tcp_orphan_count);
1745				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1746				goto out;
1747			}
1748		}
1749	}
1750	if (sk->sk_state != TCP_CLOSE) {
1751		sk_stream_mem_reclaim(sk);
1752		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1753		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1754		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1755			if (net_ratelimit())
1756				printk(KERN_INFO "TCP: too many orphaned "
1757				       "sockets\n");
1758			tcp_set_state(sk, TCP_CLOSE);
1759			tcp_send_active_reset(sk, GFP_ATOMIC);
1760			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1761		}
1762	}
1763	atomic_inc(&tcp_orphan_count);
1764
1765	if (sk->sk_state == TCP_CLOSE)
1766		tcp_destroy_sock(sk);
1767	/* Otherwise, socket is reprieved until protocol close. */
1768
1769out:
1770	bh_unlock_sock(sk);
1771	local_bh_enable();
1772	sock_put(sk);
1773}
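/*
 * Illustrative user-space sketch (not part of this file): SO_LINGER with
 * l_onoff = 1 and l_linger = 0 takes the zero-linger branch in tcp_close()
 * above, so close() aborts the connection with a RST instead of the normal
 * FIN handshake (the RFC 793 ABORT call mentioned in the changelog), and no
 * TIME_WAIT state is entered.
 */
#include <sys/socket.h>
#include <unistd.h>

static void abortive_close(int fd)
{
	struct linger lin = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
	close(fd);		/* sends RST if the connection is live */
}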
1774
1775/* These states need RST on ABORT according to RFC793 */
1776
1777static inline int tcp_need_reset(int state)
1778{
1779	return (1 << state) &
1780	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1781		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1782}
1783
1784int tcp_disconnect(struct sock *sk, int flags)
1785{
1786	struct inet_sock *inet = inet_sk(sk);
1787	struct tcp_sock *tp = tcp_sk(sk);
1788	int err = 0;
1789	int old_state = sk->sk_state;
1790
1791	if (old_state != TCP_CLOSE)
1792		tcp_set_state(sk, TCP_CLOSE);
1793
1794	/* ABORT function of RFC793 */
1795	if (old_state == TCP_LISTEN) {
1796		tcp_listen_stop(sk);
1797	} else if (tcp_need_reset(old_state) ||
1798		   (tp->snd_nxt != tp->write_seq &&
1799		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1800		/* The last check adjusts for the discrepancy between Linux and RFC
1801		 * states
1802		 */
1803		tcp_send_active_reset(sk, gfp_any());
1804		sk->sk_err = ECONNRESET;
1805	} else if (old_state == TCP_SYN_SENT)
1806		sk->sk_err = ECONNRESET;
1807
1808	tcp_clear_xmit_timers(sk);
1809	__skb_queue_purge(&sk->sk_receive_queue);
1810	sk_stream_writequeue_purge(sk);
1811	__skb_queue_purge(&tp->out_of_order_queue);
1812
1813	inet->dport = 0;
1814
1815	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1816		inet_reset_saddr(sk);
1817
1818	sk->sk_shutdown = 0;
1819	sock_reset_flag(sk, SOCK_DONE);
1820	tp->srtt = 0;
1821	if ((tp->write_seq += tp->max_window + 2) == 0)
1822		tp->write_seq = 1;
1823	tp->backoff = 0;
1824	tp->snd_cwnd = 2;
1825	tp->probes_out = 0;
1826	tp->packets_out = 0;
1827	tp->snd_ssthresh = 0x7fffffff;
1828	tp->snd_cwnd_cnt = 0;
1829	tcp_set_ca_state(tp, TCP_CA_Open);
1830	tcp_clear_retrans(tp);
1831	tcp_delack_init(tp);
1832	sk->sk_send_head = NULL;
1833	tp->rx_opt.saw_tstamp = 0;
1834	tcp_sack_reset(&tp->rx_opt);
1835	__sk_dst_reset(sk);
1836
1837	BUG_TRAP(!inet->num || tp->bind_hash);
1838
1839	sk->sk_error_report(sk);
1840	return err;
1841}
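/*
 *	Illustrative sketch (not from this source tree): one user-visible way
 *	to reach tcp_disconnect() is connect() with an AF_UNSPEC address on an
 *	already connected TCP socket, which inet_stream_connect() turns into a
 *	call to sk->sk_prot->disconnect().  A minimal user-space sketch,
 *	assuming an existing connected socket "fd":
 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <string.h>
#include <sys/socket.h>

static int tcp_unconnect(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;	/* dissolves the association */
	return connect(fd, &sa, sizeof(sa));
}
#endif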
1842
1843/*
1844 *	Wait for an incoming connection, avoid race
1845 *	conditions. This must be called with the socket locked.
1846 */
1847static int wait_for_connect(struct sock *sk, long timeo)
1848{
1849	struct tcp_sock *tp = tcp_sk(sk);
1850	DEFINE_WAIT(wait);
1851	int err;
1852
1853	/*
1854	 * True wake-one mechanism for incoming connections: only
1855	 * one process gets woken up, not the 'whole herd'.
1856	 * Since we do not 'race & poll' for established sockets
1857	 * anymore, the common case will execute the loop only once.
1858	 *
1859	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1860	 * after any current non-exclusive waiters, and we know that
1861	 * it will always _stay_ after any new non-exclusive waiters
1862	 * because all non-exclusive waiters are added at the
1863	 * beginning of the wait-queue. As such, it's ok to "drop"
1864	 * our exclusiveness temporarily when we get woken up without
1865	 * having to remove and re-insert us on the wait queue.
1866	 */
1867	for (;;) {
1868		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869					  TASK_INTERRUPTIBLE);
1870		release_sock(sk);
1871		if (!tp->accept_queue)
1872			timeo = schedule_timeout(timeo);
1873		lock_sock(sk);
1874		err = 0;
1875		if (tp->accept_queue)
1876			break;
1877		err = -EINVAL;
1878		if (sk->sk_state != TCP_LISTEN)
1879			break;
1880		err = sock_intr_errno(timeo);
1881		if (signal_pending(current))
1882			break;
1883		err = -EAGAIN;
1884		if (!timeo)
1885			break;
1886	}
1887	finish_wait(sk->sk_sleep, &wait);
1888	return err;
1889}
1890
1891/*
1892 *	This will accept the next outstanding connection.
1893 */
1894
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{
1897	struct tcp_sock *tp = tcp_sk(sk);
1898	struct open_request *req;
1899	struct sock *newsk;
1900	int error;
1901
1902	lock_sock(sk);
1903
1904	/* We need to make sure that this socket is listening,
1905	 * and that it has something pending.
1906	 */
1907	error = -EINVAL;
1908	if (sk->sk_state != TCP_LISTEN)
1909		goto out;
1910
1911	/* Find already established connection */
1912	if (!tp->accept_queue) {
1913		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914
1915		/* If this is a non-blocking socket, don't sleep */
1916		error = -EAGAIN;
1917		if (!timeo)
1918			goto out;
1919
1920		error = wait_for_connect(sk, timeo);
1921		if (error)
1922			goto out;
1923	}
1924
1925	req = tp->accept_queue;
1926	if ((tp->accept_queue = req->dl_next) == NULL)
1927		tp->accept_queue_tail = NULL;
1928
1929	newsk = req->sk;
1930	sk_acceptq_removed(sk);
1931	tcp_openreq_fastfree(req);
1932	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933	release_sock(sk);
1934	return newsk;
1935
1936out:
1937	release_sock(sk);
1938	*err = error;
1939	return NULL;
1940}
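/*
 *	Illustrative sketch (not from this source tree): the -EAGAIN path
 *	above is what a non-blocking accept() sees when tp->accept_queue is
 *	empty; a blocking accept() sleeps in wait_for_connect() instead.  A
 *	minimal user-space sketch, assuming a listening socket "lfd" with
 *	O_NONBLOCK set:
 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <errno.h>
#include <sys/socket.h>

static int try_accept(int lfd)
{
	int cfd = accept(lfd, NULL, NULL);

	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return -1;	/* nothing queued yet, try again later */
	return cfd;
}
#endif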
1941
1942/*
1943 *	Socket option code for TCP.
1944 */
1945int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946		   int optlen)
1947{
1948	struct tcp_sock *tp = tcp_sk(sk);
1949	int val;
1950	int err = 0;
1951
1952	if (level != SOL_TCP)
1953		return tp->af_specific->setsockopt(sk, level, optname,
1954						   optval, optlen);
1955
1956	if (optlen < sizeof(int))
1957		return -EINVAL;
1958
1959	if (get_user(val, (int __user *)optval))
1960		return -EFAULT;
1961
1962	lock_sock(sk);
1963
1964	switch (optname) {
1965	case TCP_MAXSEG:
1966		/* Values greater than the interface MTU won't take effect.
1967		 * However, at the point when this call is made we typically
1968		 * don't yet know which interface is going to be used. */
1969		if (val < 8 || val > MAX_TCP_WINDOW) {
1970			err = -EINVAL;
1971			break;
1972		}
1973		tp->rx_opt.user_mss = val;
1974		break;
1975
1976	case TCP_NODELAY:
1977		if (val) {
1978			/* TCP_NODELAY is weaker than TCP_CORK, so setting
1979			 * this option on a corked socket is remembered, but
1980			 * it is not activated until the cork is cleared.
1981			 *
1982			 * However, when TCP_NODELAY is set we make an
1983			 * explicit push, which overrides even TCP_CORK for
1984			 * currently queued segments.
1985			 */
1986			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987			tcp_push_pending_frames(sk, tp);
1988		} else {
1989			tp->nonagle &= ~TCP_NAGLE_OFF;
1990		}
1991		break;
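	/*
	 *	Illustrative sketch (not from this source tree): a user-space
	 *	caller that wants small writes pushed out immediately, using
	 *	IPPROTO_TCP as the option level (equivalent to SOL_TCP):
	 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int disable_nagle(int fd)
{
	int one = 1;

	/* takes the val != 0 branch above: Nagle off plus an explicit push */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}
#endif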
1992
1993	case TCP_CORK:
1994		/* When set, this tells TCP to always queue non-full frames.
1995		 * Later the user clears the option and we transmit any
1996		 * pending partial frames in the queue.  This is meant to be
1997		 * used together with sendfile() to get properly filled frames
1998		 * when the user (for example) must write out headers with a
1999		 * write() call first and then use sendfile() to send out the
2000		 * data parts (see the user-space sketch after this case).
2001		 *
2002		 * TCP_CORK can be set together with TCP_NODELAY and it is
2003		 * stronger than TCP_NODELAY.
2004		 */
2005		if (val) {
2006			tp->nonagle |= TCP_NAGLE_CORK;
2007		} else {
2008			tp->nonagle &= ~TCP_NAGLE_CORK;
2009			if (tp->nonagle&TCP_NAGLE_OFF)
2010				tp->nonagle |= TCP_NAGLE_PUSH;
2011			tcp_push_pending_frames(sk, tp);
2012		}
2013		break;
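	/*
	 *	Illustrative sketch (not from this source tree): the
	 *	write()-then-sendfile() pattern described above, assuming a
	 *	connected socket "fd" and an open file "filefd" of length
	 *	"filelen", with the Linux sendfile() signature:
	 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static int corked_reply(int fd, const char *hdr, size_t hdrlen,
			int filefd, size_t filelen)
{
	int on = 1, off = 0;
	off_t pos = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(fd, hdr, hdrlen);			/* queued, not pushed */
	sendfile(fd, filefd, &pos, filelen);	/* still corked */
	/* clearing TCP_CORK pushes the final, possibly partial, frame */
	return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}
#endif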
2014
2015	case TCP_KEEPIDLE:
2016		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017			err = -EINVAL;
2018		else {
2019			tp->keepalive_time = val * HZ;
2020			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021			    !((1 << sk->sk_state) &
2022			      (TCPF_CLOSE | TCPF_LISTEN))) {
2023				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024				if (tp->keepalive_time > elapsed)
2025					elapsed = tp->keepalive_time - elapsed;
2026				else
2027					elapsed = 0;
2028				tcp_reset_keepalive_timer(sk, elapsed);
2029			}
2030		}
2031		break;
2032	case TCP_KEEPINTVL:
2033		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034			err = -EINVAL;
2035		else
2036			tp->keepalive_intvl = val * HZ;
2037		break;
2038	case TCP_KEEPCNT:
2039		if (val < 1 || val > MAX_TCP_KEEPCNT)
2040			err = -EINVAL;
2041		else
2042			tp->keepalive_probes = val;
2043		break;
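	/*
	 *	Illustrative sketch (not from this source tree): the three
	 *	keepalive cases above take values in seconds (the two time
	 *	values are converted to jiffies with val * HZ) and only matter
	 *	once SO_KEEPALIVE is enabled on the socket:
	 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* probe an idle peer after 2 minutes, every 30s, give up after 4 misses */
static void tune_keepalive(int fd)
{
	int on = 1, idle = 120, intvl = 30, cnt = 4;

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}
#endif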
2044	case TCP_SYNCNT:
2045		if (val < 1 || val > MAX_TCP_SYNCNT)
2046			err = -EINVAL;
2047		else
2048			tp->syn_retries = val;
2049		break;
2050
2051	case TCP_LINGER2:
2052		if (val < 0)
2053			tp->linger2 = -1;
2054		else if (val > sysctl_tcp_fin_timeout / HZ)
2055			tp->linger2 = 0;
2056		else
2057			tp->linger2 = val * HZ;
2058		break;
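	/*
	 *	Illustrative sketch (not from this source tree): per the case
	 *	above, a negative value makes orphaned FIN_WAIT2 sockets be
	 *	reset at once in tcp_close(), a value above the tcp_fin_timeout
	 *	sysctl falls back to that sysctl (linger2 = 0), and anything
	 *	else is the FIN_WAIT2 lifetime in seconds:
	 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void shorten_fin_wait2(int fd)
{
	int secs = 5;	/* keep FIN_WAIT2 around for at most ~5 seconds */

	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &secs, sizeof(secs));
}
#endif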
2059
2060	case TCP_DEFER_ACCEPT:
2061		tp->defer_accept = 0;
2062		if (val > 0) {
2063			/* Translate value in seconds to number of
2064			 * retransmits */
2065			while (tp->defer_accept < 32 &&
2066			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2067				       tp->defer_accept))
2068				tp->defer_accept++;
2069			tp->defer_accept++;
2070		}
2071		break;
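	/*
	 *	Worked example (not in the original source), assuming
	 *	TCP_TIMEOUT_INIT is 3*HZ as in kernels of this era: a request
	 *	of val = 10 seconds loops while 10 > 3 and 10 > 6, stops at
	 *	10 > 12, so defer_accept reaches 2 and the final increment
	 *	makes it 3 retransmission rounds.  The getsockopt() path below
	 *	reports this back as (3 << (3 - 1)) = 12 seconds.
	 */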
2072
2073	case TCP_WINDOW_CLAMP:
2074		if (!val) {
2075			if (sk->sk_state != TCP_CLOSE) {
2076				err = -EINVAL;
2077				break;
2078			}
2079			tp->window_clamp = 0;
2080		} else
2081			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082						SOCK_MIN_RCVBUF / 2 : val;
2083		break;
2084
2085	case TCP_QUICKACK:
2086		if (!val) {
2087			tp->ack.pingpong = 1;
2088		} else {
2089			tp->ack.pingpong = 0;
2090			if ((1 << sk->sk_state) &
2091			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092			    tcp_ack_scheduled(tp)) {
2093				tp->ack.pending |= TCP_ACK_PUSHED;
2094				cleanup_rbuf(sk, 1);
2095				if (!(val & 1))
2096					tp->ack.pingpong = 1;
2097			}
2098		}
2099		break;
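	/*
	 *	Illustrative sketch (not from this source tree): per the case
	 *	above, a non-zero value leaves delayed-ACK (pingpong) mode and,
	 *	if an ACK is already scheduled on an established socket, pushes
	 *	it out at once; an even value then drops straight back into
	 *	pingpong mode, making the effect one-shot:
	 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void ack_now(int fd)
{
	int one = 1;

	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
}
#endif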
2100
2101	default:
2102		err = -ENOPROTOOPT;
2103		break;
2104	}
2105	release_sock(sk);
2106	return err;
2107}
2108
2109/* Return information about state of tcp endpoint in API format. */
2110void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111{
2112	struct tcp_sock *tp = tcp_sk(sk);
2113	u32 now = tcp_time_stamp;
2114
2115	memset(info, 0, sizeof(*info));
2116
2117	info->tcpi_state = sk->sk_state;
2118	info->tcpi_ca_state = tp->ca_state;
2119	info->tcpi_retransmits = tp->retransmits;
2120	info->tcpi_probes = tp->probes_out;
2121	info->tcpi_backoff = tp->backoff;
2122
2123	if (tp->rx_opt.tstamp_ok)
2124		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125	if (tp->rx_opt.sack_ok)
2126		info->tcpi_options |= TCPI_OPT_SACK;
2127	if (tp->rx_opt.wscale_ok) {
2128		info->tcpi_options |= TCPI_OPT_WSCALE;
2129		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131	}
2132
2133	if (tp->ecn_flags&TCP_ECN_OK)
2134		info->tcpi_options |= TCPI_OPT_ECN;
2135
2136	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138	info->tcpi_snd_mss = tp->mss_cache_std;
2139	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140
2141	info->tcpi_unacked = tp->packets_out;
2142	info->tcpi_sacked = tp->sacked_out;
2143	info->tcpi_lost = tp->lost_out;
2144	info->tcpi_retrans = tp->retrans_out;
2145	info->tcpi_fackets = tp->fackets_out;
2146
2147	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150
2151	info->tcpi_pmtu = tp->pmtu_cookie;
2152	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2154	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2155	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156	info->tcpi_snd_cwnd = tp->snd_cwnd;
2157	info->tcpi_advmss = tp->advmss;
2158	info->tcpi_reordering = tp->reordering;
2159
2160	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2161	info->tcpi_rcv_space = tp->rcvq_space.space;
2162
2163	info->tcpi_total_retrans = tp->total_retrans;
2164}
2165
2166EXPORT_SYMBOL_GPL(tcp_get_info);
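/*
 *	Illustrative sketch (not from this source tree): user space reads the
 *	structure filled in above through getsockopt(TCP_INFO); struct
 *	tcp_info comes from <linux/tcp.h> or a sufficiently recent
 *	<netinet/tcp.h>.  Note that tcpi_rtt and tcpi_rttvar are reported in
 *	microseconds:
 */
#if 0	/* illustrative user-space sketch, not compiled with tcp.c */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (!getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len))
		printf("rtt %u us, cwnd %u, total retrans %u\n",
		       info.tcpi_rtt, info.tcpi_snd_cwnd,
		       info.tcpi_total_retrans);
}
#endif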
2167
2168int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169		   int __user *optlen)
2170{
2171	struct tcp_sock *tp = tcp_sk(sk);
2172	int val, len;
2173
2174	if (level != SOL_TCP)
2175		return tp->af_specific->getsockopt(sk, level, optname,
2176						   optval, optlen);
2177
2178	if (get_user(len, optlen))
2179		return -EFAULT;
2180
2181	len = min_t(unsigned int, len, sizeof(int));
2182
2183	if (len < 0)
2184		return -EINVAL;
2185
2186	switch (optname) {
2187	case TCP_MAXSEG:
2188		val = tp->mss_cache_std;
2189		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190			val = tp->rx_opt.user_mss;
2191		break;
2192	case TCP_NODELAY:
2193		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2194		break;
2195	case TCP_CORK:
2196		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2197		break;
2198	case TCP_KEEPIDLE:
2199		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200		break;
2201	case TCP_KEEPINTVL:
2202		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203		break;
2204	case TCP_KEEPCNT:
2205		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206		break;
2207	case TCP_SYNCNT:
2208		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209		break;
2210	case TCP_LINGER2:
2211		val = tp->linger2;
2212		if (val >= 0)
2213			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214		break;
2215	case TCP_DEFER_ACCEPT:
2216		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217					       (tp->defer_accept - 1));
2218		break;
2219	case TCP_WINDOW_CLAMP:
2220		val = tp->window_clamp;
2221		break;
2222	case TCP_INFO: {
2223		struct tcp_info info;
2224
2225		if (get_user(len, optlen))
2226			return -EFAULT;
2227
2228		tcp_get_info(sk, &info);
2229
2230		len = min_t(unsigned int, len, sizeof(info));
2231		if (put_user(len, optlen))
2232			return -EFAULT;
2233		if (copy_to_user(optval, &info, len))
2234			return -EFAULT;
2235		return 0;
2236	}
2237	case TCP_QUICKACK:
2238		val = !tp->ack.pingpong;
2239		break;
2240	default:
2241		return -ENOPROTOOPT;
2242	}
2243
2244	if (put_user(len, optlen))
2245		return -EFAULT;
2246	if (copy_to_user(optval, &val, len))
2247		return -EFAULT;
2248	return 0;
2249}
2250
2251
2252extern void __skb_cb_too_small_for_tcp(int, int);
2253extern void tcpdiag_init(void);
2254
2255static __initdata unsigned long thash_entries;
2256static int __init set_thash_entries(char *str)
2257{
2258	if (!str)
2259		return 0;
2260	thash_entries = simple_strtoul(str, &str, 0);
2261	return 1;
2262}
2263__setup("thash_entries=", set_thash_entries);
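/*
 *	Note (illustrative, not in the original source): __setup() registers a
 *	kernel command-line parameter, so booting with e.g.
 *
 *		thash_entries=16384
 *
 *	overrides the memory-based sizing of the established hash table that
 *	tcp_init() below would otherwise compute; the value is passed as the
 *	entry-count hint to alloc_large_system_hash().
 */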
2264
2265void __init tcp_init(void)
2266{
2267	struct sk_buff *skb = NULL;
2268	int order, i;
2269
2270	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2271		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272					   sizeof(skb->cb));
2273
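	/*
	 *	Note (illustrative, not in the original source): the call above
	 *	is a link-time assertion.  __skb_cb_too_small_for_tcp() is
	 *	declared but never defined; when struct tcp_skb_cb fits into
	 *	skb->cb the sizeof comparison is a compile-time constant, the
	 *	branch is optimised away and the undefined reference vanishes.
	 *	If the structure ever outgrows skb->cb, the call survives and
	 *	the link fails.  On trees that have it, this is roughly
	 *	equivalent to
	 *		BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
	 */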
2274	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275					       sizeof(struct open_request),
2276					       0, SLAB_HWCACHE_ALIGN,
2277					       NULL, NULL);
2278	if (!tcp_openreq_cachep)
2279		panic("tcp_init: Cannot alloc open_request cache.");
2280
2281	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282					      sizeof(struct tcp_bind_bucket),
2283					      0, SLAB_HWCACHE_ALIGN,
2284					      NULL, NULL);
2285	if (!tcp_bucket_cachep)
2286		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287
2288	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289						sizeof(struct tcp_tw_bucket),
2290						0, SLAB_HWCACHE_ALIGN,
2291						NULL, NULL);
2292	if (!tcp_timewait_cachep)
2293		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295	/* Size and allocate the main established and bind bucket
2296	 * hash tables.
2297	 *
2298	 * The methodology is similar to that of the buffer cache.
2299	 */
2300	tcp_ehash = (struct tcp_ehash_bucket *)
2301		alloc_large_system_hash("TCP established",
2302					sizeof(struct tcp_ehash_bucket),
2303					thash_entries,
2304					(num_physpages >= 128 * 1024) ?
2305						(25 - PAGE_SHIFT) :
2306						(27 - PAGE_SHIFT),
2307					HASH_HIGHMEM,
2308					&tcp_ehash_size,
2309					NULL,
2310					0);
2311	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313		rwlock_init(&tcp_ehash[i].lock);
2314		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315	}
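	/*
	 *	Note (illustrative, not in the original source):
	 *	alloc_large_system_hash() returned log2 of the entry count
	 *	through &tcp_ehash_size, so the assignment just before the loop
	 *	converts it to an entry count and halves it; as I read this
	 *	tree, the first half of the table holds established sockets and
	 *	the second half holds TIME_WAIT sockets, which is why the init
	 *	loop runs over tcp_ehash_size << 1 buckets.
	 */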
2316
2317	tcp_bhash = (struct tcp_bind_hashbucket *)
2318		alloc_large_system_hash("TCP bind",
2319					sizeof(struct tcp_bind_hashbucket),
2320					tcp_ehash_size,
2321					(num_physpages >= 128 * 1024) ?
2322						(25 - PAGE_SHIFT) :
2323						(27 - PAGE_SHIFT),
2324					HASH_HIGHMEM,
2325					&tcp_bhash_size,
2326					NULL,
2327					64 * 1024);
2328	tcp_bhash_size = 1 << tcp_bhash_size;
2329	for (i = 0; i < tcp_bhash_size; i++) {
2330		spin_lock_init(&tcp_bhash[i].lock);
2331		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332	}
2333
2334	/* Try to be a bit smarter and adjust defaults depending
2335	 * on available memory.
2336	 */
2337	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339			order++)
2340		;
2341	if (order >= 4) {
2342		sysctl_local_port_range[0] = 32768;
2343		sysctl_local_port_range[1] = 61000;
2344		sysctl_tcp_max_tw_buckets = 180000;
2345		sysctl_tcp_max_orphans = 4096 << (order - 4);
2346		sysctl_max_syn_backlog = 1024;
2347	} else if (order < 3) {
2348		sysctl_local_port_range[0] = 1024 * (3 - order);
2349		sysctl_tcp_max_tw_buckets >>= (3 - order);
2350		sysctl_tcp_max_orphans >>= (3 - order);
2351		sysctl_max_syn_backlog = 128;
2352	}
2353	tcp_port_rover = sysctl_local_port_range[0] - 1;
2354
2355	sysctl_tcp_mem[0] =  768 << order;
2356	sysctl_tcp_mem[1] = 1024 << order;
2357	sysctl_tcp_mem[2] = 1536 << order;
2358
2359	if (order < 3) {
2360		sysctl_tcp_wmem[2] = 64 * 1024;
2361		sysctl_tcp_rmem[0] = PAGE_SIZE;
2362		sysctl_tcp_rmem[1] = 43689;
2363		sysctl_tcp_rmem[2] = 2 * 43689;
2364	}
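	/*
	 *	Worked example (not in the original source), assuming 4 KiB
	 *	pages and a 16-byte tcp_bind_hashbucket: a bind hash of 16384
	 *	buckets occupies 256 KiB = 64 pages, so the sizing loop above
	 *	ends with order = 6.  That selects the order >= 4 branch:
	 *	local ports 32768..61000, tcp_max_orphans = 4096 << 2 = 16384,
	 *	and tcp_mem = {768, 1024, 1536} << 6 = {49152, 65536, 98304}
	 *	pages, i.e. roughly 192/256/384 MiB.
	 */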
2365
2366	printk(KERN_INFO "TCP: Hash tables configured "
2367	       "(established %d bind %d)\n",
2368	       tcp_ehash_size << 1, tcp_bhash_size);
2369}
2370
2371EXPORT_SYMBOL(tcp_accept);
2372EXPORT_SYMBOL(tcp_close);
2373EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg);
2381EXPORT_SYMBOL(tcp_sendmsg);
2382EXPORT_SYMBOL(tcp_sendpage);
2383EXPORT_SYMBOL(tcp_setsockopt);
2384EXPORT_SYMBOL(tcp_shutdown);
2385EXPORT_SYMBOL(tcp_statistics);
2386EXPORT_SYMBOL(tcp_timewait_cachep);
2387