tcp.c revision 0e87506fcc734647c7b2497eee4eb81e785c857a
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed where wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up of retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK	our side has shutdown after the remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
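
/*
 * For illustration, derived from the state descriptions above and RFC 793:
 * the side that closes first typically walks
 *	ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 * while the side receiving that first FIN walks
 *	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 * A simultaneous close takes FIN_WAIT1 through CLOSING and then TIME_WAIT
 * instead of FIN_WAIT2.
 */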
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non-atomically.
296 * All of sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305	if (!tcp_memory_pressure) {
306		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307		tcp_memory_pressure = 1;
308	}
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317					       poll_table *wait)
318{
319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 *	Wait for a TCP event.
324 *
325 *	Note that we don't need to lock the socket, as the upper poll layers
326 *	take care of normal races (between the test and the event) and we don't
327 *	go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331	unsigned int mask;
332	struct sock *sk = sock->sk;
333	struct tcp_sock *tp = tcp_sk(sk);
334
335	poll_wait(file, sk->sk_sleep, wait);
336	if (sk->sk_state == TCP_LISTEN)
337		return tcp_listen_poll(sk, wait);
338
339	/* Socket is not locked. We are protected from async events
340	   by the poll logic, and correct handling of state changes
341	   made by other threads is impossible in any case.
342	 */
343
344	mask = 0;
345	if (sk->sk_err)
346		mask = POLLERR;
347
348	/*
349	 * POLLHUP is certainly not done right. But poll() doesn't
350	 * have a notion of HUP in just one direction, and for a
351	 * socket the read side is more interesting.
352	 *
353	 * Some poll() documentation says that POLLHUP is incompatible
354	 * with the POLLOUT/POLLWR flags, so somebody should check all
355	 * of this. But be careful: it tends to be safer to return too many
356	 * bits than too few, and you can easily break real applications
357	 * if you don't tell them that something has hung up!
358	 *
359	 * Check-me.
360	 *
361	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362	 * our fs/select.c). It means that after we receive EOF,
363	 * poll always returns immediately, making it impossible to poll() for
364	 * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365	 * if and only if shutdown has been made in both directions.
366	 * Actually, it is interesting to look at how Solaris and DUX
367	 * solve this dilemma. I would prefer it if POLLHUP were maskable,
368	 * then we could set it on SND_SHUTDOWN. BTW the examples given
369	 * in Stevens' books assume exactly this behaviour, which explains
370	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371	 *
372	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373	 * blocking on fresh not-connected or disconnected socket. --ANK
374	 */
375	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376		mask |= POLLHUP;
377	if (sk->sk_shutdown & RCV_SHUTDOWN)
378		mask |= POLLIN | POLLRDNORM;
379
380	/* Connected? */
381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382		/* Potential race condition. If the read of tp below
383		 * escapes above the read of sk->sk_state, we can be
384		 * illegally awakened in SYN_* states. */
385		if ((tp->rcv_nxt != tp->copied_seq) &&
386		    (tp->urg_seq != tp->copied_seq ||
387		     tp->rcv_nxt != tp->copied_seq + 1 ||
388		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389			mask |= POLLIN | POLLRDNORM;
390
391		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393				mask |= POLLOUT | POLLWRNORM;
394			} else {  /* send SIGIO later */
395				set_bit(SOCK_ASYNC_NOSPACE,
396					&sk->sk_socket->flags);
397				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399				/* Race breaker. If space is freed after
400				 * wspace test but before the flags are set,
401				 * IO signal will be lost.
402				 */
403				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404					mask |= POLLOUT | POLLWRNORM;
405			}
406		}
407
408		if (tp->urg_data & TCP_URG_VALID)
409			mask |= POLLPRI;
410	}
411	return mask;
412}
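
/*
 * For illustration only: userspace reaches tcp_poll() via poll(2).  A
 * minimal sketch, assuming fd is a connected TCP socket:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		if (pfd.revents & POLLIN)
 *			handle_read(fd);	// data or a FIN is queued
 *		if (pfd.revents & POLLOUT)
 *			handle_write(fd);	// enough write space available
 *		if (pfd.revents & POLLHUP)
 *			handle_close(fd);	// both directions shut down
 *	}
 *
 * handle_read/handle_write/handle_close are placeholders for application
 * code.  On a listening socket the same call reports POLLIN | POLLRDNORM
 * when a connection is ready to accept (see tcp_listen_poll() above).
 */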
413
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416	struct tcp_sock *tp = tcp_sk(sk);
417	int answ;
418
419	switch (cmd) {
420	case SIOCINQ:
421		if (sk->sk_state == TCP_LISTEN)
422			return -EINVAL;
423
424		lock_sock(sk);
425		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426			answ = 0;
427		else if (sock_flag(sk, SOCK_URGINLINE) ||
428			 !tp->urg_data ||
429			 before(tp->urg_seq, tp->copied_seq) ||
430			 !before(tp->urg_seq, tp->rcv_nxt)) {
431			answ = tp->rcv_nxt - tp->copied_seq;
432
433			/* Subtract 1, if FIN is in queue. */
434			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435				answ -=
436		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437		} else
438			answ = tp->urg_seq - tp->copied_seq;
439		release_sock(sk);
440		break;
441	case SIOCATMARK:
442		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443		break;
444	case SIOCOUTQ:
445		if (sk->sk_state == TCP_LISTEN)
446			return -EINVAL;
447
448		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449			answ = 0;
450		else
451			answ = tp->write_seq - tp->snd_una;
452		break;
453	default:
454		return -ENOIOCTLCMD;
455	};
456
457	return put_user(answ, (int __user *)arg);
458}
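
/*
 * For illustration only: the ioctls above are issued from userspace as,
 * e.g. (a sketch, assuming fd is a connected TCP socket):
 *
 *	int unread, unsent, at_mark;
 *
 *	ioctl(fd, SIOCINQ, &unread);	// bytes waiting to be read
 *	ioctl(fd, SIOCOUTQ, &unsent);	// write_seq - snd_una, i.e. not yet acked
 *	ioctl(fd, SIOCATMARK, &at_mark);	// non-zero at the urgent mark
 *
 * SIOCINQ is defined to the same value as FIONREAD.
 */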
459
460
461int tcp_listen_start(struct sock *sk)
462{
463	struct inet_sock *inet = inet_sk(sk);
464	struct tcp_sock *tp = tcp_sk(sk);
465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467	if (rc != 0)
468		return rc;
469
470	sk->sk_max_ack_backlog = 0;
471	sk->sk_ack_backlog = 0;
472	tcp_delack_init(tp);
473
474	/* There is a race window here: we announce ourselves listening,
475	 * but this transition is still not validated by get_port().
476	 * It is OK, because this socket enters the hash table only
477	 * after validation is complete.
478	 */
479	sk->sk_state = TCP_LISTEN;
480	if (!sk->sk_prot->get_port(sk, inet->num)) {
481		inet->sport = htons(inet->num);
482
483		sk_dst_reset(sk);
484		sk->sk_prot->hash(sk);
485
486		return 0;
487	}
488
489	sk->sk_state = TCP_CLOSE;
490	reqsk_queue_destroy(&tp->accept_queue);
491	return -EADDRINUSE;
492}
493
494/*
495 *	This routine closes sockets which have been at least partially
496 *	opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501	struct tcp_sock *tp = tcp_sk(sk);
502	struct tcp_listen_opt *lopt;
503	struct request_sock *acc_req;
504	struct request_sock *req;
505	int i;
506
507	tcp_delete_keepalive_timer(sk);
508
509	/* make all the listen_opt local to us */
510	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513	if (lopt->qlen) {
514		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515			while ((req = lopt->syn_table[i]) != NULL) {
516				lopt->syn_table[i] = req->dl_next;
517				lopt->qlen--;
518				reqsk_free(req);
519
520		/* Following the specs, it would be better either to send a FIN
521		 * (and enter FIN-WAIT-1, i.e. a normal close)
522		 * or to send an active reset (abort).
523		 * Certainly, it is pretty dangerous during a synflood, but that is
524		 * a bad justification for our negligence 8)
525		 * To be honest, we are not able to implement either
526		 * of the variants now.			--ANK
527		 */
528			}
529		}
530	}
531	BUG_TRAP(!lopt->qlen);
532
533	kfree(lopt);
534
535	while ((req = acc_req) != NULL) {
536		struct sock *child = req->sk;
537
538		acc_req = req->dl_next;
539
540		local_bh_disable();
541		bh_lock_sock(child);
542		BUG_TRAP(!sock_owned_by_user(child));
543		sock_hold(child);
544
545		tcp_disconnect(child, O_NONBLOCK);
546
547		sock_orphan(child);
548
549		atomic_inc(&tcp_orphan_count);
550
551		tcp_destroy_sock(child);
552
553		bh_unlock_sock(child);
554		local_bh_enable();
555		sock_put(child);
556
557		sk_acceptq_removed(sk);
558		__reqsk_free(req);
559	}
560	BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{
565	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566	tp->pushed_seq = tp->write_seq;
567}
568
569static inline int forced_push(struct tcp_sock *tp)
570{
571	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572}
573
574static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575			      struct sk_buff *skb)
576{
577	skb->csum = 0;
578	TCP_SKB_CB(skb)->seq = tp->write_seq;
579	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581	TCP_SKB_CB(skb)->sacked = 0;
582	skb_header_release(skb);
583	__skb_queue_tail(&sk->sk_write_queue, skb);
584	sk_charge_skb(sk, skb);
585	if (!sk->sk_send_head)
586		sk->sk_send_head = skb;
587	else if (tp->nonagle&TCP_NAGLE_PUSH)
588		tp->nonagle &= ~TCP_NAGLE_PUSH;
589}
590
591static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592				struct sk_buff *skb)
593{
594	if (flags & MSG_OOB) {
595		tp->urg_mode = 1;
596		tp->snd_up = tp->write_seq;
597		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598	}
599}
600
601static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602			    int mss_now, int nonagle)
603{
604	if (sk->sk_send_head) {
605		struct sk_buff *skb = sk->sk_write_queue.prev;
606		if (!(flags & MSG_MORE) || forced_push(tp))
607			tcp_mark_push(tp, skb);
608		tcp_mark_urg(tp, flags, skb);
609		__tcp_push_pending_frames(sk, tp, mss_now,
610					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611	}
612}
613
614static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615			 size_t psize, int flags)
616{
617	struct tcp_sock *tp = tcp_sk(sk);
618	int mss_now;
619	int err;
620	ssize_t copied;
621	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623	/* Wait for a connection to finish. */
624	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626			goto out_err;
627
628	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631	copied = 0;
632
633	err = -EPIPE;
634	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635		goto do_error;
636
637	while (psize > 0) {
638		struct sk_buff *skb = sk->sk_write_queue.prev;
639		struct page *page = pages[poffset / PAGE_SIZE];
640		int copy, i, can_coalesce;
641		int offset = poffset % PAGE_SIZE;
642		int size = min_t(size_t, psize, PAGE_SIZE - offset);
643
644		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645new_segment:
646			if (!sk_stream_memory_free(sk))
647				goto wait_for_sndbuf;
648
649			skb = sk_stream_alloc_pskb(sk, 0, 0,
650						   sk->sk_allocation);
651			if (!skb)
652				goto wait_for_memory;
653
654			skb_entail(sk, tp, skb);
655			copy = mss_now;
656		}
657
658		if (copy > size)
659			copy = size;
660
661		i = skb_shinfo(skb)->nr_frags;
662		can_coalesce = skb_can_coalesce(skb, i, page, offset);
663		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664			tcp_mark_push(tp, skb);
665			goto new_segment;
666		}
667		if (sk->sk_forward_alloc < copy &&
668		    !sk_stream_mem_schedule(sk, copy, 0))
669			goto wait_for_memory;
670
671		if (can_coalesce) {
672			skb_shinfo(skb)->frags[i - 1].size += copy;
673		} else {
674			get_page(page);
675			skb_fill_page_desc(skb, i, page, offset, copy);
676		}
677
678		skb->len += copy;
679		skb->data_len += copy;
680		skb->truesize += copy;
681		sk->sk_wmem_queued += copy;
682		sk->sk_forward_alloc -= copy;
683		skb->ip_summed = CHECKSUM_HW;
684		tp->write_seq += copy;
685		TCP_SKB_CB(skb)->end_seq += copy;
686		skb_shinfo(skb)->tso_segs = 0;
687
688		if (!copied)
689			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690
691		copied += copy;
692		poffset += copy;
693		if (!(psize -= copy))
694			goto out;
695
696		if (skb->len != mss_now || (flags & MSG_OOB))
697			continue;
698
699		if (forced_push(tp)) {
700			tcp_mark_push(tp, skb);
701			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702		} else if (skb == sk->sk_send_head)
703			tcp_push_one(sk, mss_now);
704		continue;
705
706wait_for_sndbuf:
707		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708wait_for_memory:
709		if (copied)
710			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711
712		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713			goto do_error;
714
715		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716	}
717
718out:
719	if (copied)
720		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721	return copied;
722
723do_error:
724	if (copied)
725		goto out;
726out_err:
727	return sk_stream_error(sk, flags, err);
728}
729
730ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731		     size_t size, int flags)
732{
733	ssize_t res;
734	struct sock *sk = sock->sk;
735
736#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737
738	if (!(sk->sk_route_caps & NETIF_F_SG) ||
739	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740		return sock_no_sendpage(sock, page, offset, size, flags);
741
742#undef TCP_ZC_CSUM_FLAGS
743
744	lock_sock(sk);
745	TCP_CHECK_TIMER(sk);
746	res = do_tcp_sendpages(sk, &page, offset, size, flags);
747	TCP_CHECK_TIMER(sk);
748	release_sock(sk);
749	return res;
750}
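
/*
 * For illustration only: userspace typically ends up in tcp_sendpage() via
 * sendfile(2).  A sketch, error handling omitted, assuming sock_fd is a
 * connected TCP socket and file_fd an open regular file:
 *
 *	off_t off = 0;
 *	sendfile(sock_fd, file_fd, &off, file_size);	// file_size: bytes to send
 *
 * When the route lacks scatter-gather or checksum offload, the check above
 * falls back to sock_no_sendpage(), which copies the data instead of
 * referencing the pages.
 */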
751
752#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
753#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
754
755static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{
757	int tmp = tp->mss_cache_std;
758
759	if (sk->sk_route_caps & NETIF_F_SG) {
760		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761
762		if (tmp >= pgbreak &&
763		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764			tmp = pgbreak;
765	}
766	return tmp;
767}
768
769int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
770		size_t size)
771{
772	struct iovec *iov;
773	struct tcp_sock *tp = tcp_sk(sk);
774	struct sk_buff *skb;
775	int iovlen, flags;
776	int mss_now;
777	int err, copied;
778	long timeo;
779
780	lock_sock(sk);
781	TCP_CHECK_TIMER(sk);
782
783	flags = msg->msg_flags;
784	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
785
786	/* Wait for a connection to finish. */
787	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
788		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
789			goto out_err;
790
791	/* This should be in poll */
792	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793
794	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
795
796	/* Ok commence sending. */
797	iovlen = msg->msg_iovlen;
798	iov = msg->msg_iov;
799	copied = 0;
800
801	err = -EPIPE;
802	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
803		goto do_error;
804
805	while (--iovlen >= 0) {
806		int seglen = iov->iov_len;
807		unsigned char __user *from = iov->iov_base;
808
809		iov++;
810
811		while (seglen > 0) {
812			int copy;
813
814			skb = sk->sk_write_queue.prev;
815
816			if (!sk->sk_send_head ||
817			    (copy = mss_now - skb->len) <= 0) {
818
819new_segment:
820				/* Allocate new segment. If the interface is SG,
821				 * allocate an skb that fits in a single page.
822				 */
823				if (!sk_stream_memory_free(sk))
824					goto wait_for_sndbuf;
825
826				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
827							   0, sk->sk_allocation);
828				if (!skb)
829					goto wait_for_memory;
830
831				/*
832				 * Check whether we can use HW checksum.
833				 */
834				if (sk->sk_route_caps &
835				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
836				     NETIF_F_HW_CSUM))
837					skb->ip_summed = CHECKSUM_HW;
838
839				skb_entail(sk, tp, skb);
840				copy = mss_now;
841			}
842
843			/* Try to append data to the end of skb. */
844			if (copy > seglen)
845				copy = seglen;
846
847			/* Where to copy to? */
848			if (skb_tailroom(skb) > 0) {
849				/* We have some space in skb head. Superb! */
850				if (copy > skb_tailroom(skb))
851					copy = skb_tailroom(skb);
852				if ((err = skb_add_data(skb, from, copy)) != 0)
853					goto do_fault;
854			} else {
855				int merge = 0;
856				int i = skb_shinfo(skb)->nr_frags;
857				struct page *page = TCP_PAGE(sk);
858				int off = TCP_OFF(sk);
859
860				if (skb_can_coalesce(skb, i, page, off) &&
861				    off != PAGE_SIZE) {
862					/* We can extend the last page
863					 * fragment. */
864					merge = 1;
865				} else if (i == MAX_SKB_FRAGS ||
866					   (!i &&
867					   !(sk->sk_route_caps & NETIF_F_SG))) {
868					/* Need to add a new fragment and cannot
869					 * do this because the interface is non-SG,
870					 * or because all the page slots are
871					 * busy. */
872					tcp_mark_push(tp, skb);
873					goto new_segment;
874				} else if (page) {
875					/* If page is cached, align
876					 * offset to L1 cache boundary
877					 */
878					off = (off + L1_CACHE_BYTES - 1) &
879					      ~(L1_CACHE_BYTES - 1);
880					if (off == PAGE_SIZE) {
881						put_page(page);
882						TCP_PAGE(sk) = page = NULL;
883					}
884				}
885
886				if (!page) {
887					/* Allocate new cache page. */
888					if (!(page = sk_stream_alloc_page(sk)))
889						goto wait_for_memory;
890					off = 0;
891				}
892
893				if (copy > PAGE_SIZE - off)
894					copy = PAGE_SIZE - off;
895
896				/* Time to copy data. We are close to
897				 * the end! */
898				err = skb_copy_to_page(sk, from, skb, page,
899						       off, copy);
900				if (err) {
901					/* If this page was new, give it to the
902					 * socket so it does not get leaked.
903					 */
904					if (!TCP_PAGE(sk)) {
905						TCP_PAGE(sk) = page;
906						TCP_OFF(sk) = 0;
907					}
908					goto do_error;
909				}
910
911				/* Update the skb. */
912				if (merge) {
913					skb_shinfo(skb)->frags[i - 1].size +=
914									copy;
915				} else {
916					skb_fill_page_desc(skb, i, page, off, copy);
917					if (TCP_PAGE(sk)) {
918						get_page(page);
919					} else if (off + copy < PAGE_SIZE) {
920						get_page(page);
921						TCP_PAGE(sk) = page;
922					}
923				}
924
925				TCP_OFF(sk) = off + copy;
926			}
927
928			if (!copied)
929				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
930
931			tp->write_seq += copy;
932			TCP_SKB_CB(skb)->end_seq += copy;
933			skb_shinfo(skb)->tso_segs = 0;
934
935			from += copy;
936			copied += copy;
937			if ((seglen -= copy) == 0 && iovlen == 0)
938				goto out;
939
940			if (skb->len != mss_now || (flags & MSG_OOB))
941				continue;
942
943			if (forced_push(tp)) {
944				tcp_mark_push(tp, skb);
945				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
946			} else if (skb == sk->sk_send_head)
947				tcp_push_one(sk, mss_now);
948			continue;
949
950wait_for_sndbuf:
951			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
952wait_for_memory:
953			if (copied)
954				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
955
956			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
957				goto do_error;
958
959			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
960		}
961	}
962
963out:
964	if (copied)
965		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
966	TCP_CHECK_TIMER(sk);
967	release_sock(sk);
968	return copied;
969
970do_fault:
971	if (!skb->len) {
972		if (sk->sk_send_head == skb)
973			sk->sk_send_head = NULL;
974		__skb_unlink(skb, skb->list);
975		sk_stream_free_skb(sk, skb);
976	}
977
978do_error:
979	if (copied)
980		goto out;
981out_err:
982	err = sk_stream_error(sk, flags, err);
983	TCP_CHECK_TIMER(sk);
984	release_sock(sk);
985	return err;
986}
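
/*
 * For illustration only: a userspace view of the MSG_MORE handling in
 * tcp_push()/tcp_sendmsg() above.  With MSG_MORE the queued data is corked
 * (TCP_NAGLE_CORK) and only pushed out by a later send without the flag:
 *
 *	send(fd, hdr, hdr_len, MSG_MORE);	// queued, not pushed yet
 *	send(fd, body, body_len, 0);		// pushes hdr + body together
 *
 * The TCP_CORK socket option gives the same effect across many calls.
 */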
987
988/*
989 *	Handle reading urgent data. BSD has very simple semantics for
990 *	this, no blocking and very strange errors 8)
991 */
992
993static int tcp_recv_urg(struct sock *sk, long timeo,
994			struct msghdr *msg, int len, int flags,
995			int *addr_len)
996{
997	struct tcp_sock *tp = tcp_sk(sk);
998
999	/* No URG data to read. */
1000	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1001	    tp->urg_data == TCP_URG_READ)
1002		return -EINVAL;	/* Yes this is right ! */
1003
1004	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1005		return -ENOTCONN;
1006
1007	if (tp->urg_data & TCP_URG_VALID) {
1008		int err = 0;
1009		char c = tp->urg_data;
1010
1011		if (!(flags & MSG_PEEK))
1012			tp->urg_data = TCP_URG_READ;
1013
1014		/* Read urgent data. */
1015		msg->msg_flags |= MSG_OOB;
1016
1017		if (len > 0) {
1018			if (!(flags & MSG_TRUNC))
1019				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1020			len = 1;
1021		} else
1022			msg->msg_flags |= MSG_TRUNC;
1023
1024		return err ? -EFAULT : len;
1025	}
1026
1027	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1028		return 0;
1029
1030	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1031	 * the available implementations agree in this case:
1032	 * this call should never block, independent of the
1033	 * blocking state of the socket.
1034	 * Mike <pall@rz.uni-karlsruhe.de>
1035	 */
1036	return -EAGAIN;
1037}
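
/*
 * For illustration only: a userspace sketch of the semantics implemented
 * above.  Reading the urgent byte never blocks:
 *
 *	char oob;
 *	int n = recv(fd, &oob, 1, MSG_OOB);
 *
 * n == 1 delivers the urgent byte; an EINVAL error means there is no urgent
 * data, it was already read, or SO_OOBINLINE is set; EAGAIN means the urgent
 * pointer was seen but the byte itself has not arrived yet.  With
 * SO_OOBINLINE the byte is delivered in the normal data stream instead,
 * and SIOCATMARK tells the reader when it has been reached.
 */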
1038
1039/* Clean up the receive buffer for full frames taken by the user,
1040 * then send an ACK if necessary.  COPIED is the number of bytes
1041 * tcp_recvmsg has given to the user so far, it speeds up the
1042 * calculation of whether or not we must ACK for the sake of
1043 * a window update.
1044 */
1045static void cleanup_rbuf(struct sock *sk, int copied)
1046{
1047	struct tcp_sock *tp = tcp_sk(sk);
1048	int time_to_ack = 0;
1049
1050#if TCP_DEBUG
1051	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1052
1053	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1054#endif
1055
1056	if (tcp_ack_scheduled(tp)) {
1057		   /* Delayed ACKs frequently hit locked sockets during bulk
1058		    * receive. */
1059		if (tp->ack.blocked ||
1060		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1061		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1062		    /*
1063		     * If this read emptied the read buffer, we send an ACK if
1064		     * the connection is not bidirectional, the user drained the
1065		     * receive buffer, and there was a small segment
1066		     * in the queue.
1067		     */
1068		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1069		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1070			time_to_ack = 1;
1071	}
1072
1073	/* We send an ACK if we can now advertise a non-zero window
1074	 * which has been raised "significantly".
1075	 *
1076	 * Even if the window is raised up to infinity, do not send a window-open
1077	 * ACK in states where we will not receive any more data. It is useless.
1078	 */
1079	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1080		__u32 rcv_window_now = tcp_receive_window(tp);
1081
1082		/* Optimize, __tcp_select_window() is not cheap. */
1083		if (2*rcv_window_now <= tp->window_clamp) {
1084			__u32 new_window = __tcp_select_window(sk);
1085
1086			/* Send an ACK now if this read freed lots of space
1087			 * in our buffer. We can advertise the new window now
1088			 * if it is not less than the current one.
1089			 * "Lots" means "at least twice" here.
1090			 */
1091			if (new_window && new_window >= 2 * rcv_window_now)
1092				time_to_ack = 1;
1093		}
1094	}
1095	if (time_to_ack)
1096		tcp_send_ack(sk);
1097}
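
/*
 * Worked example of the "at least twice" rule above (numbers purely
 * illustrative): with window_clamp at 64K and rcv_window_now at 8K the
 * cheap test 2 * 8K <= 64K passes, so __tcp_select_window() is consulted;
 * if the read freed enough space that new_window >= 16K, a window update
 * ACK is sent immediately.
 */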
1098
1099static void tcp_prequeue_process(struct sock *sk)
1100{
1101	struct sk_buff *skb;
1102	struct tcp_sock *tp = tcp_sk(sk);
1103
1104	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1105
1106	/* The RX process wants to run with BHs disabled, though it is not
1107	 * strictly necessary */
1108	local_bh_disable();
1109	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1110		sk->sk_backlog_rcv(sk, skb);
1111	local_bh_enable();
1112
1113	/* Clear memory counter. */
1114	tp->ucopy.memory = 0;
1115}
1116
1117static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1118{
1119	struct sk_buff *skb;
1120	u32 offset;
1121
1122	skb_queue_walk(&sk->sk_receive_queue, skb) {
1123		offset = seq - TCP_SKB_CB(skb)->seq;
1124		if (skb->h.th->syn)
1125			offset--;
1126		if (offset < skb->len || skb->h.th->fin) {
1127			*off = offset;
1128			return skb;
1129		}
1130	}
1131	return NULL;
1132}
1133
1134/*
1135 * This routine provides an alternative to tcp_recvmsg() for routines
1136 * that would like to handle copying from skbuffs directly in 'sendfile'
1137 * fashion.
1138 * Note:
1139 *	- It is assumed that the socket was locked by the caller.
1140 *	- The routine does not block.
1141 *	- At present, there is no support for reading OOB data
1142 *	  or for 'peeking' the socket using this routine
1143 *	  (although both would be easy to implement).
1144 */
1145int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1146		  sk_read_actor_t recv_actor)
1147{
1148	struct sk_buff *skb;
1149	struct tcp_sock *tp = tcp_sk(sk);
1150	u32 seq = tp->copied_seq;
1151	u32 offset;
1152	int copied = 0;
1153
1154	if (sk->sk_state == TCP_LISTEN)
1155		return -ENOTCONN;
1156	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1157		if (offset < skb->len) {
1158			size_t used, len;
1159
1160			len = skb->len - offset;
1161			/* Stop reading if we hit a patch of urgent data */
1162			if (tp->urg_data) {
1163				u32 urg_offset = tp->urg_seq - seq;
1164				if (urg_offset < len)
1165					len = urg_offset;
1166				if (!len)
1167					break;
1168			}
1169			used = recv_actor(desc, skb, offset, len);
1170			if (used <= len) {
1171				seq += used;
1172				copied += used;
1173				offset += used;
1174			}
1175			if (offset != skb->len)
1176				break;
1177		}
1178		if (skb->h.th->fin) {
1179			sk_eat_skb(sk, skb);
1180			++seq;
1181			break;
1182		}
1183		sk_eat_skb(sk, skb);
1184		if (!desc->count)
1185			break;
1186	}
1187	tp->copied_seq = seq;
1188
1189	tcp_rcv_space_adjust(sk);
1190
1191	/* Clean up data we have read: This will do ACK frames. */
1192	if (copied)
1193		cleanup_rbuf(sk, copied);
1194	return copied;
1195}
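
/*
 * For illustration only: a minimal sketch of a recv_actor for
 * tcp_read_sock() above (hypothetical helper, not part of this file).
 * It assumes desc->arg.data points to a kernel buffer and desc->count
 * holds the remaining space; returning less than len is treated as a
 * request to stop, as is clearing desc->count:
 *
 *	static int example_recv_actor(read_descriptor_t *desc,
 *				      struct sk_buff *skb,
 *				      unsigned int offset, size_t len)
 *	{
 *		char *to = desc->arg.data;
 *		size_t want = min(len, desc->count);
 *
 *		if (skb_copy_bits(skb, offset, to, want))
 *			return 0;
 *		desc->arg.data = to + want;
 *		desc->count -= want;
 *		return want;
 *	}
 */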
1196
1197/*
1198 *	This routine copies from a sock struct into the user buffer.
1199 *
1200 *	Technical note: in 2.3 we work on a _locked_ socket, so that
1201 *	tricks with *seq access order and skb->users are not required.
1202 *	The code can probably be improved even more.
1203 */
1204
1205int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1206		size_t len, int nonblock, int flags, int *addr_len)
1207{
1208	struct tcp_sock *tp = tcp_sk(sk);
1209	int copied = 0;
1210	u32 peek_seq;
1211	u32 *seq;
1212	unsigned long used;
1213	int err;
1214	int target;		/* Read at least this many bytes */
1215	long timeo;
1216	struct task_struct *user_recv = NULL;
1217
1218	lock_sock(sk);
1219
1220	TCP_CHECK_TIMER(sk);
1221
1222	err = -ENOTCONN;
1223	if (sk->sk_state == TCP_LISTEN)
1224		goto out;
1225
1226	timeo = sock_rcvtimeo(sk, nonblock);
1227
1228	/* Urgent data needs to be handled specially. */
1229	if (flags & MSG_OOB)
1230		goto recv_urg;
1231
1232	seq = &tp->copied_seq;
1233	if (flags & MSG_PEEK) {
1234		peek_seq = tp->copied_seq;
1235		seq = &peek_seq;
1236	}
1237
1238	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1239
1240	do {
1241		struct sk_buff *skb;
1242		u32 offset;
1243
1244		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1245		if (tp->urg_data && tp->urg_seq == *seq) {
1246			if (copied)
1247				break;
1248			if (signal_pending(current)) {
1249				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1250				break;
1251			}
1252		}
1253
1254		/* Next get a buffer. */
1255
1256		skb = skb_peek(&sk->sk_receive_queue);
1257		do {
1258			if (!skb)
1259				break;
1260
1261			/* Now that we have two receive queues this
1262			 * shouldn't happen.
1263			 */
1264			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1265				printk(KERN_INFO "recvmsg bug: copied %X "
1266				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1267				break;
1268			}
1269			offset = *seq - TCP_SKB_CB(skb)->seq;
1270			if (skb->h.th->syn)
1271				offset--;
1272			if (offset < skb->len)
1273				goto found_ok_skb;
1274			if (skb->h.th->fin)
1275				goto found_fin_ok;
1276			BUG_TRAP(flags & MSG_PEEK);
1277			skb = skb->next;
1278		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1279
1280		/* Well, if we have backlog, try to process it now. */
1281
1282		if (copied >= target && !sk->sk_backlog.tail)
1283			break;
1284
1285		if (copied) {
1286			if (sk->sk_err ||
1287			    sk->sk_state == TCP_CLOSE ||
1288			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1289			    !timeo ||
1290			    signal_pending(current) ||
1291			    (flags & MSG_PEEK))
1292				break;
1293		} else {
1294			if (sock_flag(sk, SOCK_DONE))
1295				break;
1296
1297			if (sk->sk_err) {
1298				copied = sock_error(sk);
1299				break;
1300			}
1301
1302			if (sk->sk_shutdown & RCV_SHUTDOWN)
1303				break;
1304
1305			if (sk->sk_state == TCP_CLOSE) {
1306				if (!sock_flag(sk, SOCK_DONE)) {
1307					/* This occurs when user tries to read
1308					 * from a never-connected socket.
1309					 */
1310					copied = -ENOTCONN;
1311					break;
1312				}
1313				break;
1314			}
1315
1316			if (!timeo) {
1317				copied = -EAGAIN;
1318				break;
1319			}
1320
1321			if (signal_pending(current)) {
1322				copied = sock_intr_errno(timeo);
1323				break;
1324			}
1325		}
1326
1327		cleanup_rbuf(sk, copied);
1328
1329		if (tp->ucopy.task == user_recv) {
1330			/* Install new reader */
1331			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1332				user_recv = current;
1333				tp->ucopy.task = user_recv;
1334				tp->ucopy.iov = msg->msg_iov;
1335			}
1336
1337			tp->ucopy.len = len;
1338
1339			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1340				 (flags & (MSG_PEEK | MSG_TRUNC)));
1341
1342			/* Ugly... If prequeue is not empty, we have to
1343			 * process it before releasing the socket, otherwise
1344			 * the order will be broken at the second iteration.
1345			 * A more elegant solution is required!!!
1346			 *
1347			 * Look: we have the following (pseudo)queues:
1348			 *
1349			 * 1. packets in flight
1350			 * 2. backlog
1351			 * 3. prequeue
1352			 * 4. receive_queue
1353			 *
1354			 * Each queue can be processed only if the next ones
1355			 * are empty. At this point we have an empty receive_queue.
1356			 * But the prequeue _can_ be non-empty after the 2nd iteration,
1357			 * when we jumped to the start of the loop because backlog
1358			 * processing added something to receive_queue.
1359			 * We cannot release_sock(), because the backlog contains
1360			 * packets that arrived _after_ the prequeued ones.
1361			 *
1362			 * In short, the algorithm is clear --- process all
1363			 * the queues in order. We could do it more directly,
1364			 * requeueing packets from the backlog to the prequeue if it
1365			 * is not empty. It is more elegant, but eats cycles,
1366			 * unfortunately.
1367			 */
1368			if (skb_queue_len(&tp->ucopy.prequeue))
1369				goto do_prequeue;
1370
1371			/* __ Set realtime policy in scheduler __ */
1372		}
1373
1374		if (copied >= target) {
1375			/* Do not sleep, just process backlog. */
1376			release_sock(sk);
1377			lock_sock(sk);
1378		} else
1379			sk_wait_data(sk, &timeo);
1380
1381		if (user_recv) {
1382			int chunk;
1383
1384			/* __ Restore normal policy in scheduler __ */
1385
1386			if ((chunk = len - tp->ucopy.len) != 0) {
1387				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1388				len -= chunk;
1389				copied += chunk;
1390			}
1391
1392			if (tp->rcv_nxt == tp->copied_seq &&
1393			    skb_queue_len(&tp->ucopy.prequeue)) {
1394do_prequeue:
1395				tcp_prequeue_process(sk);
1396
1397				if ((chunk = len - tp->ucopy.len) != 0) {
1398					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1399					len -= chunk;
1400					copied += chunk;
1401				}
1402			}
1403		}
1404		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1405			if (net_ratelimit())
1406				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1407				       current->comm, current->pid);
1408			peek_seq = tp->copied_seq;
1409		}
1410		continue;
1411
1412	found_ok_skb:
1413		/* Ok so how much can we use? */
1414		used = skb->len - offset;
1415		if (len < used)
1416			used = len;
1417
1418		/* Do we have urgent data here? */
1419		if (tp->urg_data) {
1420			u32 urg_offset = tp->urg_seq - *seq;
1421			if (urg_offset < used) {
1422				if (!urg_offset) {
1423					if (!sock_flag(sk, SOCK_URGINLINE)) {
1424						++*seq;
1425						offset++;
1426						used--;
1427						if (!used)
1428							goto skip_copy;
1429					}
1430				} else
1431					used = urg_offset;
1432			}
1433		}
1434
1435		if (!(flags & MSG_TRUNC)) {
1436			err = skb_copy_datagram_iovec(skb, offset,
1437						      msg->msg_iov, used);
1438			if (err) {
1439				/* Exception. Bailout! */
1440				if (!copied)
1441					copied = -EFAULT;
1442				break;
1443			}
1444		}
1445
1446		*seq += used;
1447		copied += used;
1448		len -= used;
1449
1450		tcp_rcv_space_adjust(sk);
1451
1452skip_copy:
1453		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1454			tp->urg_data = 0;
1455			tcp_fast_path_check(sk, tp);
1456		}
1457		if (used + offset < skb->len)
1458			continue;
1459
1460		if (skb->h.th->fin)
1461			goto found_fin_ok;
1462		if (!(flags & MSG_PEEK))
1463			sk_eat_skb(sk, skb);
1464		continue;
1465
1466	found_fin_ok:
1467		/* Process the FIN. */
1468		++*seq;
1469		if (!(flags & MSG_PEEK))
1470			sk_eat_skb(sk, skb);
1471		break;
1472	} while (len > 0);
1473
1474	if (user_recv) {
1475		if (skb_queue_len(&tp->ucopy.prequeue)) {
1476			int chunk;
1477
1478			tp->ucopy.len = copied > 0 ? len : 0;
1479
1480			tcp_prequeue_process(sk);
1481
1482			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1483				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1484				len -= chunk;
1485				copied += chunk;
1486			}
1487		}
1488
1489		tp->ucopy.task = NULL;
1490		tp->ucopy.len = 0;
1491	}
1492
1493	/* According to UNIX98, msg_name/msg_namelen are ignored
1494	 * on a connected socket. I was just happy when I found this 8) --ANK
1495	 */
1496
1497	/* Clean up data we have read: This will do ACK frames. */
1498	cleanup_rbuf(sk, copied);
1499
1500	TCP_CHECK_TIMER(sk);
1501	release_sock(sk);
1502	return copied;
1503
1504out:
1505	TCP_CHECK_TIMER(sk);
1506	release_sock(sk);
1507	return err;
1508
1509recv_urg:
1510	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1511	goto out;
1512}
1513
1514/*
1515 *	State processing on a close. This implements the state shift for
1516 *	sending our FIN frame. Note that we only send a FIN for some
1517 *	states. A shutdown() may have already sent the FIN, or we may be
1518 *	closed.
1519 */
1520
1521static unsigned char new_state[16] = {
1522  /* current state:        new state:      action:	*/
1523  /* (Invalid)		*/ TCP_CLOSE,
1524  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1525  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1526  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1527  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1528  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1529  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1530  /* TCP_CLOSE		*/ TCP_CLOSE,
1531  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1532  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1533  /* TCP_LISTEN		*/ TCP_CLOSE,
1534  /* TCP_CLOSING	*/ TCP_CLOSING,
1535};
1536
1537static int tcp_close_state(struct sock *sk)
1538{
1539	int next = (int)new_state[sk->sk_state];
1540	int ns = next & TCP_STATE_MASK;
1541
1542	tcp_set_state(sk, ns);
1543
1544	return next & TCP_ACTION_FIN;
1545}
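
/*
 * For illustration: closing an ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller (tcp_close()/tcp_shutdown()) to send a FIN.  Closing
 * from SYN_SENT maps straight to TCP_CLOSE with no action bit: the
 * connection never got established, so there is nothing to FIN.
 */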
1546
1547/*
1548 *	Shutdown the sending side of a connection. Much like close except
1549 *	that we don't receive shutdown or set the SOCK_DEAD flag on the socket.
1550 */
1551
1552void tcp_shutdown(struct sock *sk, int how)
1553{
1554	/*	We need to grab some memory, and put together a FIN,
1555	 *	and then put it into the queue to be sent.
1556	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1557	 */
1558	if (!(how & SEND_SHUTDOWN))
1559		return;
1560
1561	/* If we've already sent a FIN, or it's a closed state, skip this. */
1562	if ((1 << sk->sk_state) &
1563	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1564	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1565		/* Clear out any half completed packets.  FIN if needed. */
1566		if (tcp_close_state(sk))
1567			tcp_send_fin(sk);
1568	}
1569}
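
/*
 * For illustration only: the classic userspace half-close that lands here
 * (a sketch; process_reply() is a placeholder for application code):
 *
 *	write(fd, request, request_len);
 *	shutdown(fd, SHUT_WR);		// our FIN goes out, reads still work
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		process_reply(buf, n);	// until the peer's FIN (read() == 0)
 *
 * SHUT_WR arrives here as SEND_SHUTDOWN, which is what the check at the
 * top of tcp_shutdown() tests.
 */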
1570
1571/*
1572 * At this point, there should be no process reference to this
1573 * socket, and thus no user references at all.  Therefore we
1574 * can assume the socket waitqueue is inactive and nobody will
1575 * try to jump onto it.
1576 */
1577void tcp_destroy_sock(struct sock *sk)
1578{
1579	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1580	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1581
1582	/* It cannot be in hash table! */
1583	BUG_TRAP(sk_unhashed(sk));
1584
1585	/* If it has a non-zero inet_sk(sk)->num, it must be bound. */
1586	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1587
1588	sk->sk_prot->destroy(sk);
1589
1590	sk_stream_kill_queues(sk);
1591
1592	xfrm_sk_free_policy(sk);
1593
1594#ifdef INET_REFCNT_DEBUG
1595	if (atomic_read(&sk->sk_refcnt) != 1) {
1596		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1597		       sk, atomic_read(&sk->sk_refcnt));
1598	}
1599#endif
1600
1601	atomic_dec(&tcp_orphan_count);
1602	sock_put(sk);
1603}
1604
1605void tcp_close(struct sock *sk, long timeout)
1606{
1607	struct sk_buff *skb;
1608	int data_was_unread = 0;
1609
1610	lock_sock(sk);
1611	sk->sk_shutdown = SHUTDOWN_MASK;
1612
1613	if (sk->sk_state == TCP_LISTEN) {
1614		tcp_set_state(sk, TCP_CLOSE);
1615
1616		/* Special case. */
1617		tcp_listen_stop(sk);
1618
1619		goto adjudge_to_death;
1620	}
1621
1622	/*  We need to flush the recv. buffs.  We do this only on the
1623	 *  descriptor close, not protocol-sourced closes, because the
1624	 *  reader process may not have drained the data yet!
1625	 */
1626	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1627		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1628			  skb->h.th->fin;
1629		data_was_unread += len;
1630		__kfree_skb(skb);
1631	}
1632
1633	sk_stream_mem_reclaim(sk);
1634
1635	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1636	 * 3.10, we send a RST here because data was lost.  To
1637	 * witness the awful effects of the old behavior of always
1638	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1639	 * a bulk GET in an FTP client, suspend the process, wait
1640	 * for the client to advertise a zero window, then kill -9
1641	 * the FTP client, wheee...  Note: timeout is always zero
1642	 * in such a case.
1643	 */
1644	if (data_was_unread) {
1645		/* Unread data was tossed, zap the connection. */
1646		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1647		tcp_set_state(sk, TCP_CLOSE);
1648		tcp_send_active_reset(sk, GFP_KERNEL);
1649	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1650		/* Check zero linger _after_ checking for unread data. */
1651		sk->sk_prot->disconnect(sk, 0);
1652		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1653	} else if (tcp_close_state(sk)) {
1654		/* We FIN if the application ate all the data before
1655		 * zapping the connection.
1656		 */
1657
1658		/* RED-PEN. Formally speaking, we have broken TCP state
1659		 * machine. State transitions:
1660		 *
1661		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1662		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1663		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1664		 *
1665		 * are legal only when FIN has been sent (i.e. in window),
1666		 * rather than queued out of window. Purists blame.
1667		 *
1668		 * F.e. "RFC state" is ESTABLISHED,
1669		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1670		 *
1671		 * The visible deviations are that sometimes
1672		 * we enter the time-wait state when it is not really required
1673		 * (harmless), and do not send active resets when they are
1674		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1675		 * they look like CLOSING or LAST_ACK to Linux).
1676		 * Probably, I missed some more holelets.
1676		 * Probably, I missed some more holelets.
1677		 * 						--ANK
1678		 */
1679		tcp_send_fin(sk);
1680	}
1681
1682	sk_stream_wait_close(sk, timeout);
1683
1684adjudge_to_death:
1685	/* It is the last release_sock in its life. It will remove backlog. */
1686	release_sock(sk);
1687
1688
1689	/* Now socket is owned by kernel and we acquire BH lock
1690	   to finish close. No need to check for user refs.
1691	 */
1692	local_bh_disable();
1693	bh_lock_sock(sk);
1694	BUG_TRAP(!sock_owned_by_user(sk));
1695
1696	sock_hold(sk);
1697	sock_orphan(sk);
1698
1699	/*	This is a (useful) BSD violation of the RFC. There is a
1700	 *	problem with TCP as specified in that the other end could
1701	 *	keep a socket open forever with no application left at this end.
1702	 *	We use a 3 minute timeout (about the same as BSD) then kill
1703	 *	our end. If they send after that then tough - BUT: long enough
1704	 *	that we won't make the old 4*rto = almost no time - whoops
1705	 *	reset mistake.
1706	 *
1707	 *	Nope, it was not a mistake. It is really the desired behaviour,
1708	 *	e.g. on http servers, where such sockets are useless but
1709	 *	consume significant resources. Let's do it with the special
1710	 *	linger2	option.					--ANK
1711	 */
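
	/*
	 * For illustration only: the "linger2 option" mentioned above is the
	 * TCP_LINGER2 socket option, e.g. (a sketch):
	 *
	 *	int secs = 10;
	 *	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &secs, sizeof(secs));
	 *
	 * A negative value makes an orphaned FIN_WAIT2 socket be reset right
	 * away, which is the tp->linger2 < 0 branch below.
	 */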
1712
1713	if (sk->sk_state == TCP_FIN_WAIT2) {
1714		struct tcp_sock *tp = tcp_sk(sk);
1715		if (tp->linger2 < 0) {
1716			tcp_set_state(sk, TCP_CLOSE);
1717			tcp_send_active_reset(sk, GFP_ATOMIC);
1718			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1719		} else {
1720			int tmo = tcp_fin_time(tp);
1721
1722			if (tmo > TCP_TIMEWAIT_LEN) {
1723				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1724			} else {
1725				atomic_inc(&tcp_orphan_count);
1726				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1727				goto out;
1728			}
1729		}
1730	}
1731	if (sk->sk_state != TCP_CLOSE) {
1732		sk_stream_mem_reclaim(sk);
1733		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1734		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1735		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1736			if (net_ratelimit())
1737				printk(KERN_INFO "TCP: too many orphaned "
1738				       "sockets\n");
1739			tcp_set_state(sk, TCP_CLOSE);
1740			tcp_send_active_reset(sk, GFP_ATOMIC);
1741			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1742		}
1743	}
1744	atomic_inc(&tcp_orphan_count);
1745
1746	if (sk->sk_state == TCP_CLOSE)
1747		tcp_destroy_sock(sk);
1748	/* Otherwise, socket is reprieved until protocol close. */
1749
1750out:
1751	bh_unlock_sock(sk);
1752	local_bh_enable();
1753	sock_put(sk);
1754}
1755
1756/* These states need RST on ABORT according to RFC793 */
1757
1758static inline int tcp_need_reset(int state)
1759{
1760	return (1 << state) &
1761	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1762		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1763}
1764
1765int tcp_disconnect(struct sock *sk, int flags)
1766{
1767	struct inet_sock *inet = inet_sk(sk);
1768	struct tcp_sock *tp = tcp_sk(sk);
1769	int err = 0;
1770	int old_state = sk->sk_state;
1771
1772	if (old_state != TCP_CLOSE)
1773		tcp_set_state(sk, TCP_CLOSE);
1774
1775	/* ABORT function of RFC793 */
1776	if (old_state == TCP_LISTEN) {
1777		tcp_listen_stop(sk);
1778	} else if (tcp_need_reset(old_state) ||
1779		   (tp->snd_nxt != tp->write_seq &&
1780		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1781		/* The last check adjusts for the discrepancy between Linux and
1782		 * the RFC states.
1783		 */
1784		tcp_send_active_reset(sk, gfp_any());
1785		sk->sk_err = ECONNRESET;
1786	} else if (old_state == TCP_SYN_SENT)
1787		sk->sk_err = ECONNRESET;
1788
1789	tcp_clear_xmit_timers(sk);
1790	__skb_queue_purge(&sk->sk_receive_queue);
1791	sk_stream_writequeue_purge(sk);
1792	__skb_queue_purge(&tp->out_of_order_queue);
1793
1794	inet->dport = 0;
1795
1796	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1797		inet_reset_saddr(sk);
1798
1799	sk->sk_shutdown = 0;
1800	sock_reset_flag(sk, SOCK_DONE);
1801	tp->srtt = 0;
1802	if ((tp->write_seq += tp->max_window + 2) == 0)
1803		tp->write_seq = 1;
1804	tp->backoff = 0;
1805	tp->snd_cwnd = 2;
1806	tp->probes_out = 0;
1807	tp->packets_out = 0;
1808	tp->snd_ssthresh = 0x7fffffff;
1809	tp->snd_cwnd_cnt = 0;
1810	tcp_set_ca_state(tp, TCP_CA_Open);
1811	tcp_clear_retrans(tp);
1812	tcp_delack_init(tp);
1813	sk->sk_send_head = NULL;
1814	tp->rx_opt.saw_tstamp = 0;
1815	tcp_sack_reset(&tp->rx_opt);
1816	__sk_dst_reset(sk);
1817
1818	BUG_TRAP(!inet->num || tp->bind_hash);
1819
1820	sk->sk_error_report(sk);
1821	return err;
1822}
1823
1824/*
1825 *	Wait for an incoming connection, avoid race
1826 *	conditions. This must be called with the socket locked.
1827 */
1828static int wait_for_connect(struct sock *sk, long timeo)
1829{
1830	struct tcp_sock *tp = tcp_sk(sk);
1831	DEFINE_WAIT(wait);
1832	int err;
1833
1834	/*
1835	 * True wake-one mechanism for incoming connections: only
1836	 * one process gets woken up, not the 'whole herd'.
1837	 * Since we do not 'race & poll' for established sockets
1838	 * anymore, the common case will execute the loop only once.
1839	 *
1840	 * Subtle issue: the waiter added by "add_wait_queue_exclusive()"
1841	 * goes after any current non-exclusive waiters, and we know that
1842	 * it will always _stay_ after any new non-exclusive waiters
1843	 * because all non-exclusive waiters are added at the
1844	 * beginning of the wait-queue. As such, it's ok to "drop"
1845	 * our exclusiveness temporarily when we get woken up without
1846	 * having to remove and re-insert us on the wait queue.
1847	 */
1848	for (;;) {
1849		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1850					  TASK_INTERRUPTIBLE);
1851		release_sock(sk);
1852		if (reqsk_queue_empty(&tp->accept_queue))
1853			timeo = schedule_timeout(timeo);
1854		lock_sock(sk);
1855		err = 0;
1856		if (!reqsk_queue_empty(&tp->accept_queue))
1857			break;
1858		err = -EINVAL;
1859		if (sk->sk_state != TCP_LISTEN)
1860			break;
1861		err = sock_intr_errno(timeo);
1862		if (signal_pending(current))
1863			break;
1864		err = -EAGAIN;
1865		if (!timeo)
1866			break;
1867	}
1868	finish_wait(sk->sk_sleep, &wait);
1869	return err;
1870}
1871
1872/*
1873 *	This will accept the next outstanding connection.
1874 */
1875
1876struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1877{
1878	struct tcp_sock *tp = tcp_sk(sk);
1879	struct sock *newsk;
1880	int error;
1881
1882	lock_sock(sk);
1883
1884	/* We need to make sure that this socket is listening,
1885	 * and that it has something pending.
1886	 */
1887	error = -EINVAL;
1888	if (sk->sk_state != TCP_LISTEN)
1889		goto out_err;
1890
1891	/* Find already established connection */
1892	if (reqsk_queue_empty(&tp->accept_queue)) {
1893		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1894
1895		/* If this is a non-blocking socket, don't sleep */
1896		error = -EAGAIN;
1897		if (!timeo)
1898			goto out_err;
1899
1900		error = wait_for_connect(sk, timeo);
1901		if (error)
1902			goto out_err;
1903	}
1904
1905	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1906	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1907out:
1908	release_sock(sk);
1909	return newsk;
1910out_err:
1911	newsk = NULL;
1912	*err = error;
1913	goto out;
1914}
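
/* From userspace, wait_for_connect() and tcp_accept() are driven through
 * accept(2).  A sketch of the two modes discussed above, illustrative only
 * and excluded from the build (the helper names are made up): a blocking
 * listener parks the caller in wait_for_connect(), while a non-blocking
 * one sees EAGAIN as soon as the accept queue is empty.
 */
#if 0	/* illustrative userspace sketch, excluded from the kernel build */
#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>

/* Drain every connection currently queued on a non-blocking listener;
 * returns the number accepted, or -1 on a real error.
 */
static int drain_accept_queue(int listen_fd, void (*handle)(int))
{
	int n = 0;

	fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
	for (;;) {
		int fd = accept(listen_fd, NULL, NULL);

		if (fd < 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;		/* accept queue is empty */
			if (errno == EINTR)
				continue;	/* signal, retry */
			return -1;
		}
		handle(fd);
		n++;
	}
	return n;
}
#endif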
1915
1916/*
1917 *	Socket option code for TCP.
1918 */
1919int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1920		   int optlen)
1921{
1922	struct tcp_sock *tp = tcp_sk(sk);
1923	int val;
1924	int err = 0;
1925
1926	if (level != SOL_TCP)
1927		return tp->af_specific->setsockopt(sk, level, optname,
1928						   optval, optlen);
1929
1930	if (optlen < sizeof(int))
1931		return -EINVAL;
1932
1933	if (get_user(val, (int __user *)optval))
1934		return -EFAULT;
1935
1936	lock_sock(sk);
1937
1938	switch (optname) {
1939	case TCP_MAXSEG:
1940		/* Values greater than the interface MTU won't take effect.
1941		 * However, at the point when this call is made we typically
1942		 * do not yet know which interface will be used. */
1943		if (val < 8 || val > MAX_TCP_WINDOW) {
1944			err = -EINVAL;
1945			break;
1946		}
1947		tp->rx_opt.user_mss = val;
1948		break;
1949
1950	case TCP_NODELAY:
1951		if (val) {
1952			/* TCP_NODELAY is weaker than TCP_CORK, so that
1953			 * this option on a corked socket is remembered, but
1954			 * it is not activated until the cork is cleared.
1955			 *
1956			 * However, when TCP_NODELAY is set we make
1957			 * an explicit push, which overrides even TCP_CORK
1958			 * for currently queued segments.
1959			 */
1960			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1961			tcp_push_pending_frames(sk, tp);
1962		} else {
1963			tp->nonagle &= ~TCP_NAGLE_OFF;
1964		}
1965		break;
1966
1967	case TCP_CORK:
1968		/* When set, non-full frames are always queued.  Later the
1969		 * user clears this option and we transmit any pending
1970		 * partial frames in the queue.  This is meant to be used
1971		 * alongside sendfile() to get properly filled frames when
1972		 * the user (for example) must write out headers with a
1973		 * write() call first and then use sendfile() to send out
1974		 * the data parts; see the usage sketch after this function.
1975		 *
1976		 * TCP_CORK can be set together with TCP_NODELAY and it is
1977		 * stronger than TCP_NODELAY.
1978		 */
1979		if (val) {
1980			tp->nonagle |= TCP_NAGLE_CORK;
1981		} else {
1982			tp->nonagle &= ~TCP_NAGLE_CORK;
1983			if (tp->nonagle&TCP_NAGLE_OFF)
1984				tp->nonagle |= TCP_NAGLE_PUSH;
1985			tcp_push_pending_frames(sk, tp);
1986		}
1987		break;
1988
1989	case TCP_KEEPIDLE:
1990		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1991			err = -EINVAL;
1992		else {
1993			tp->keepalive_time = val * HZ;
1994			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1995			    !((1 << sk->sk_state) &
1996			      (TCPF_CLOSE | TCPF_LISTEN))) {
1997				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1998				if (tp->keepalive_time > elapsed)
1999					elapsed = tp->keepalive_time - elapsed;
2000				else
2001					elapsed = 0;
2002				tcp_reset_keepalive_timer(sk, elapsed);
2003			}
2004		}
2005		break;
2006	case TCP_KEEPINTVL:
2007		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2008			err = -EINVAL;
2009		else
2010			tp->keepalive_intvl = val * HZ;
2011		break;
2012	case TCP_KEEPCNT:
2013		if (val < 1 || val > MAX_TCP_KEEPCNT)
2014			err = -EINVAL;
2015		else
2016			tp->keepalive_probes = val;
2017		break;
2018	case TCP_SYNCNT:
2019		if (val < 1 || val > MAX_TCP_SYNCNT)
2020			err = -EINVAL;
2021		else
2022			tp->syn_retries = val;
2023		break;
2024
2025	case TCP_LINGER2:
2026		if (val < 0)
2027			tp->linger2 = -1;
2028		else if (val > sysctl_tcp_fin_timeout / HZ)
2029			tp->linger2 = 0;
2030		else
2031			tp->linger2 = val * HZ;
2032		break;
2033
2034	case TCP_DEFER_ACCEPT:
2035		tp->defer_accept = 0;
2036		if (val > 0) {
2037			/* Translate the value in seconds into a number of
2038			 * retransmits; see the worked example below. */
2039			while (tp->defer_accept < 32 &&
2040			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2041				       tp->defer_accept))
2042				tp->defer_accept++;
2043			tp->defer_accept++;
2044		}
2045		break;
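
	/* Worked example of the conversion above (assuming the usual
	 * TCP_TIMEOUT_INIT of 3 * HZ): val = 10 seconds passes the 3s and
	 * 6s thresholds but not the 12s one, so the loop stops with
	 * defer_accept == 2 and the final increment makes it 3.
	 * getsockopt(TCP_DEFER_ACCEPT) then reports 3 << (3 - 1) = 12
	 * seconds, i.e. the request is rounded up to the retransmission
	 * schedule.
	 */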
2046
2047	case TCP_WINDOW_CLAMP:
2048		if (!val) {
2049			if (sk->sk_state != TCP_CLOSE) {
2050				err = -EINVAL;
2051				break;
2052			}
2053			tp->window_clamp = 0;
2054		} else
2055			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2056						SOCK_MIN_RCVBUF / 2 : val;
2057		break;
2058
2059	case TCP_QUICKACK:
2060		if (!val) {
2061			tp->ack.pingpong = 1;
2062		} else {
2063			tp->ack.pingpong = 0;
2064			if ((1 << sk->sk_state) &
2065			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2066			    tcp_ack_scheduled(tp)) {
2067				tp->ack.pending |= TCP_ACK_PUSHED;
2068				cleanup_rbuf(sk, 1);
2069				if (!(val & 1))
2070					tp->ack.pingpong = 1;
2071			}
2072		}
2073		break;
2074
2075	default:
2076		err = -ENOPROTOOPT;
2077		break;
2078	}
2079	release_sock(sk);
2080	return err;
2081}
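
/* The TCP_CORK / TCP_NODELAY interplay described above is easiest to see
 * from the caller's side.  A sketch of the header-plus-sendfile() pattern
 * the TCP_CORK comment refers to, illustrative only and excluded from the
 * build (names, sizes and the missing error handling are for brevity):
 */
#if 0	/* illustrative userspace sketch, excluded from the kernel build */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

/* Cork the socket, queue an application header, append the file body and
 * then uncork so the final partial frame is pushed out.  TCP_NODELAY, by
 * contrast, would disable Nagle and push queued segments immediately, but
 * it remains weaker than a cork that is still set.
 */
static void send_response(int sock, const void *hdr, size_t hdr_len,
			  int file_fd, off_t file_len)
{
	int on = 1, off = 0;

	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(sock, hdr, hdr_len);
	sendfile(sock, file_fd, NULL, file_len);
	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}
#endif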
2082
2083/* Return information about the state of the TCP endpoint in API format. */
2084void tcp_get_info(struct sock *sk, struct tcp_info *info)
2085{
2086	struct tcp_sock *tp = tcp_sk(sk);
2087	u32 now = tcp_time_stamp;
2088
2089	memset(info, 0, sizeof(*info));
2090
2091	info->tcpi_state = sk->sk_state;
2092	info->tcpi_ca_state = tp->ca_state;
2093	info->tcpi_retransmits = tp->retransmits;
2094	info->tcpi_probes = tp->probes_out;
2095	info->tcpi_backoff = tp->backoff;
2096
2097	if (tp->rx_opt.tstamp_ok)
2098		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2099	if (tp->rx_opt.sack_ok)
2100		info->tcpi_options |= TCPI_OPT_SACK;
2101	if (tp->rx_opt.wscale_ok) {
2102		info->tcpi_options |= TCPI_OPT_WSCALE;
2103		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2104		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2105	}
2106
2107	if (tp->ecn_flags&TCP_ECN_OK)
2108		info->tcpi_options |= TCPI_OPT_ECN;
2109
2110	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112	info->tcpi_snd_mss = tp->mss_cache_std;
2113	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114
2115	info->tcpi_unacked = tp->packets_out;
2116	info->tcpi_sacked = tp->sacked_out;
2117	info->tcpi_lost = tp->lost_out;
2118	info->tcpi_retrans = tp->retrans_out;
2119	info->tcpi_fackets = tp->fackets_out;
2120
2121	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2122	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2123	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2124
2125	info->tcpi_pmtu = tp->pmtu_cookie;
2126	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2127	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2128	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2129	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2130	info->tcpi_snd_cwnd = tp->snd_cwnd;
2131	info->tcpi_advmss = tp->advmss;
2132	info->tcpi_reordering = tp->reordering;
2133
2134	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2135	info->tcpi_rcv_space = tp->rcvq_space.space;
2136
2137	info->tcpi_total_retrans = tp->total_retrans;
2138}
2139
2140EXPORT_SYMBOL_GPL(tcp_get_info);
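
/* Userspace reads the snapshot produced by tcp_get_info() through
 * getsockopt(TCP_INFO), handled below.  A sketch that prints a few of the
 * tcpi_* fields filled in above (illustrative only, excluded from the
 * build):
 */
#if 0	/* illustrative userspace sketch, excluded from the kernel build */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_tcp_info(int sock)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return;

	/* rto/rtt/rttvar are in microseconds, cwnd in segments. */
	printf("state %u rto %u rtt %u rttvar %u cwnd %u retrans %u\n",
	       info.tcpi_state, info.tcpi_rto, info.tcpi_rtt,
	       info.tcpi_rttvar, info.tcpi_snd_cwnd,
	       info.tcpi_total_retrans);
}
#endif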
2141
2142int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2143		   int __user *optlen)
2144{
2145	struct tcp_sock *tp = tcp_sk(sk);
2146	int val, len;
2147
2148	if (level != SOL_TCP)
2149		return tp->af_specific->getsockopt(sk, level, optname,
2150						   optval, optlen);
2151
2152	if (get_user(len, optlen))
2153		return -EFAULT;
2154
2155	len = min_t(unsigned int, len, sizeof(int));
2156
2157	if (len < 0)
2158		return -EINVAL;
2159
2160	switch (optname) {
2161	case TCP_MAXSEG:
2162		val = tp->mss_cache_std;
2163		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164			val = tp->rx_opt.user_mss;
2165		break;
2166	case TCP_NODELAY:
2167		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2168		break;
2169	case TCP_CORK:
2170		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2171		break;
2172	case TCP_KEEPIDLE:
2173		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2174		break;
2175	case TCP_KEEPINTVL:
2176		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2177		break;
2178	case TCP_KEEPCNT:
2179		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2180		break;
2181	case TCP_SYNCNT:
2182		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2183		break;
2184	case TCP_LINGER2:
2185		val = tp->linger2;
2186		if (val >= 0)
2187			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2188		break;
2189	case TCP_DEFER_ACCEPT:
2190		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2191					       (tp->defer_accept - 1));
2192		break;
2193	case TCP_WINDOW_CLAMP:
2194		val = tp->window_clamp;
2195		break;
2196	case TCP_INFO: {
2197		struct tcp_info info;
2198
2199		if (get_user(len, optlen))
2200			return -EFAULT;
2201
2202		tcp_get_info(sk, &info);
2203
2204		len = min_t(unsigned int, len, sizeof(info));
2205		if (put_user(len, optlen))
2206			return -EFAULT;
2207		if (copy_to_user(optval, &info, len))
2208			return -EFAULT;
2209		return 0;
2210	}
2211	case TCP_QUICKACK:
2212		val = !tp->ack.pingpong;
2213		break;
2214	default:
2215		return -ENOPROTOOPT;
2216	}
2217
2218	if (put_user(len, optlen))
2219		return -EFAULT;
2220	if (copy_to_user(optval, &val, len))
2221		return -EFAULT;
2222	return 0;
2223}
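
/* The keepalive options handled in tcp_setsockopt() only have an effect
 * once SO_KEEPALIVE is enabled on the socket; values left at zero fall
 * back to the tcp_keepalive_* sysctls, as the getsockopt cases above show.
 * A sketch of tuning them from userspace (illustrative values, excluded
 * from the build):
 */
#if 0	/* illustrative userspace sketch, excluded from the kernel build */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Probe an idle connection after 60s, every 10s, giving up after 5
 * unanswered probes.
 */
static void enable_keepalive(int sock)
{
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}
#endif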
2224
2225
2226extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void);
2228
2229static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str)
2231{
2232	if (!str)
2233		return 0;
2234	thash_entries = simple_strtoul(str, &str, 0);
2235	return 1;
2236}
2237__setup("thash_entries=", set_thash_entries);
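
/* Example (illustrative value): booting with "thash_entries=131072" on the
 * kernel command line asks alloc_large_system_hash() below for roughly
 * 128k established-hash entries instead of a size derived from available
 * memory; the value is only a hint and is still adjusted by the allocator.
 */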
2238
2239void __init tcp_init(void)
2240{
2241	struct sk_buff *skb = NULL;
2242	int order, i;
2243
2244	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2245		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2246					   sizeof(skb->cb));
2247
2248	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2249					      sizeof(struct tcp_bind_bucket),
2250					      0, SLAB_HWCACHE_ALIGN,
2251					      NULL, NULL);
2252	if (!tcp_bucket_cachep)
2253		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2254
2255	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2256						sizeof(struct tcp_tw_bucket),
2257						0, SLAB_HWCACHE_ALIGN,
2258						NULL, NULL);
2259	if (!tcp_timewait_cachep)
2260		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2261
2262	/* Size and allocate the main established and bind bucket
2263	 * hash tables.
2264	 *
2265	 * The methodology is similar to that of the buffer cache.
2266	 */
2267	tcp_ehash = (struct tcp_ehash_bucket *)
2268		alloc_large_system_hash("TCP established",
2269					sizeof(struct tcp_ehash_bucket),
2270					thash_entries,
2271					(num_physpages >= 128 * 1024) ?
2272						(25 - PAGE_SHIFT) :
2273						(27 - PAGE_SHIFT),
2274					HASH_HIGHMEM,
2275					&tcp_ehash_size,
2276					NULL,
2277					0);
2278	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2279	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2280		rwlock_init(&tcp_ehash[i].lock);
2281		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2282	}
2283
2284	tcp_bhash = (struct tcp_bind_hashbucket *)
2285		alloc_large_system_hash("TCP bind",
2286					sizeof(struct tcp_bind_hashbucket),
2287					tcp_ehash_size,
2288					(num_physpages >= 128 * 1024) ?
2289						(25 - PAGE_SHIFT) :
2290						(27 - PAGE_SHIFT),
2291					HASH_HIGHMEM,
2292					&tcp_bhash_size,
2293					NULL,
2294					64 * 1024);
2295	tcp_bhash_size = 1 << tcp_bhash_size;
2296	for (i = 0; i < tcp_bhash_size; i++) {
2297		spin_lock_init(&tcp_bhash[i].lock);
2298		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2299	}
2300
2301	/* Try to be a bit smarter and adjust defaults depending
2302	 * on available memory.
2303	 */
2304	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2305			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2306			order++)
2307		;
2308	if (order >= 4) {
2309		sysctl_local_port_range[0] = 32768;
2310		sysctl_local_port_range[1] = 61000;
2311		sysctl_tcp_max_tw_buckets = 180000;
2312		sysctl_tcp_max_orphans = 4096 << (order - 4);
2313		sysctl_max_syn_backlog = 1024;
2314	} else if (order < 3) {
2315		sysctl_local_port_range[0] = 1024 * (3 - order);
2316		sysctl_tcp_max_tw_buckets >>= (3 - order);
2317		sysctl_tcp_max_orphans >>= (3 - order);
2318		sysctl_max_syn_backlog = 128;
2319	}
2320	tcp_port_rover = sysctl_local_port_range[0] - 1;
2321
2322	sysctl_tcp_mem[0] =  768 << order;
2323	sysctl_tcp_mem[1] = 1024 << order;
2324	sysctl_tcp_mem[2] = 1536 << order;
2325
2326	if (order < 3) {
2327		sysctl_tcp_wmem[2] = 64 * 1024;
2328		sysctl_tcp_rmem[0] = PAGE_SIZE;
2329		sysctl_tcp_rmem[1] = 43689;
2330		sysctl_tcp_rmem[2] = 2 * 43689;
2331	}
2332
2333	printk(KERN_INFO "TCP: Hash tables configured "
2334	       "(established %d bind %d)\n",
2335	       tcp_ehash_size << 1, tcp_bhash_size);
2336}
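
/* Worked example of the sizing above: if the bind hash needs an order-4
 * allocation (16 pages), the local port range becomes 32768-61000,
 * sysctl_tcp_max_orphans stays at 4096 << (4 - 4) = 4096, and
 * sysctl_tcp_mem ends up as 768 << 4 = 12288, 1024 << 4 = 16384 and
 * 1536 << 4 = 24576 pages for the low, pressure and high thresholds.
 */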
2337
2338EXPORT_SYMBOL(tcp_accept);
2339EXPORT_SYMBOL(tcp_close);
2340EXPORT_SYMBOL(tcp_destroy_sock);
2341EXPORT_SYMBOL(tcp_disconnect);
2342EXPORT_SYMBOL(tcp_getsockopt);
2343EXPORT_SYMBOL(tcp_ioctl);
2344EXPORT_SYMBOL(tcp_poll);
2345EXPORT_SYMBOL(tcp_read_sock);
2346EXPORT_SYMBOL(tcp_recvmsg);
2347EXPORT_SYMBOL(tcp_sendmsg);
2348EXPORT_SYMBOL(tcp_sendpage);
2349EXPORT_SYMBOL(tcp_setsockopt);
2350EXPORT_SYMBOL(tcp_shutdown);
2351EXPORT_SYMBOL(tcp_statistics);
2352EXPORT_SYMBOL(tcp_timewait_cachep);
2353