tcp.c revision c1b4a7e69576d65efc31a8cea0714173c2841244
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed were wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up of retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK		our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non-atomically.
296 * All of sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305	if (!tcp_memory_pressure) {
306		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307		tcp_memory_pressure = 1;
308	}
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317					       poll_table *wait)
318{
319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
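/*
 * Editorial note (illustrative, not part of the original source): what the
 * LISTEN special case above means for an application.  A listening socket
 * only reports POLLIN | POLLRDNORM once a completed connection sits in the
 * accept queue, so accept() right after a successful poll() should not
 * block.  Sketch, assuming `listen_fd` is a listening TCP socket:
 *
 *	struct pollfd pfd = { .fd = listen_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		conn_fd = accept(listen_fd, NULL, NULL);
 */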
321
322/*
323 *	Wait for a TCP event.
324 *
325 *	Note that we don't need to lock the socket, as the upper poll layers
326 *	take care of normal races (between the test and the event) and we don't
327 *	go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331	unsigned int mask;
332	struct sock *sk = sock->sk;
333	struct tcp_sock *tp = tcp_sk(sk);
334
335	poll_wait(file, sk->sk_sleep, wait);
336	if (sk->sk_state == TCP_LISTEN)
337		return tcp_listen_poll(sk, wait);
338
339	/* Socket is not locked. We are protected from async events
340	   by the poll logic, and correct handling of state changes
341	   made by other threads is impossible in any case.
342	 */
343
344	mask = 0;
345	if (sk->sk_err)
346		mask = POLLERR;
347
348	/*
349	 * POLLHUP is certainly not done right. But poll() doesn't
350	 * have a notion of HUP in just one direction, and for a
351	 * socket the read side is more interesting.
352	 *
353	 * Some poll() documentation says that POLLHUP is incompatible
354	 * with the POLLOUT/POLLWRNORM flags, so somebody should check this
355	 * all. But careful, it tends to be safer to return too many
356	 * bits than too few, and you can easily break real applications
357	 * if you don't tell them that something has hung up!
358	 *
359	 * Check-me.
360	 *
361	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362	 * our fs/select.c). It means that after we received EOF,
363	 * poll always returns immediately, making it impossible to poll() for write()
364	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365	 * if and only if shutdown has been made in both directions.
366	 * Actually, it is interesting to look how Solaris and DUX
367	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
368	 * then we could set it on SND_SHUTDOWN. BTW examples given
369	 * in Stevens' books assume exactly this behaviour, it explains
370	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371	 *
372	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373	 * blocking on fresh not-connected or disconnected socket. --ANK
374	 */
375	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376		mask |= POLLHUP;
377	if (sk->sk_shutdown & RCV_SHUTDOWN)
378		mask |= POLLIN | POLLRDNORM;
379
380	/* Connected? */
381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382		/* Potential race condition. If read of tp below will
383		 * escape above sk->sk_state, we can be illegally awakened
384		 * in SYN_* states. */
385		if ((tp->rcv_nxt != tp->copied_seq) &&
386		    (tp->urg_seq != tp->copied_seq ||
387		     tp->rcv_nxt != tp->copied_seq + 1 ||
388		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389			mask |= POLLIN | POLLRDNORM;
390
391		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393				mask |= POLLOUT | POLLWRNORM;
394			} else {  /* send SIGIO later */
395				set_bit(SOCK_ASYNC_NOSPACE,
396					&sk->sk_socket->flags);
397				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399				/* Race breaker. If space is freed after
400				 * wspace test but before the flags are set,
401				 * IO signal will be lost.
402				 */
403				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404					mask |= POLLOUT | POLLWRNORM;
405			}
406		}
407
408		if (tp->urg_data & TCP_URG_VALID)
409			mask |= POLLPRI;
410	}
411	return mask;
412}
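/*
 * Editorial note (illustrative, not part of the original source): the
 * POLLHUP policy debated in the comment above, from the application side.
 * A peer FIN makes the socket readable (recv() then returns 0), while
 * POLLHUP is only reported once both directions are shut down or the
 * socket reaches TCP_CLOSE.  Sketch, assuming `fd` is a connected socket:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLIN)
 *		...			(data, urgent data or peer FIN)
 *	if (pfd.revents & POLLHUP)
 *		...			(both directions are shut down)
 */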
413
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416	struct tcp_sock *tp = tcp_sk(sk);
417	int answ;
418
419	switch (cmd) {
420	case SIOCINQ:
421		if (sk->sk_state == TCP_LISTEN)
422			return -EINVAL;
423
424		lock_sock(sk);
425		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426			answ = 0;
427		else if (sock_flag(sk, SOCK_URGINLINE) ||
428			 !tp->urg_data ||
429			 before(tp->urg_seq, tp->copied_seq) ||
430			 !before(tp->urg_seq, tp->rcv_nxt)) {
431			answ = tp->rcv_nxt - tp->copied_seq;
432
433			/* Subtract 1, if FIN is in queue. */
434			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435				answ -=
436		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437		} else
438			answ = tp->urg_seq - tp->copied_seq;
439		release_sock(sk);
440		break;
441	case SIOCATMARK:
442		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443		break;
444	case SIOCOUTQ:
445		if (sk->sk_state == TCP_LISTEN)
446			return -EINVAL;
447
448		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449			answ = 0;
450		else
451			answ = tp->write_seq - tp->snd_una;
452		break;
453	default:
454		return -ENOIOCTLCMD;
455	};
456
457	return put_user(answ, (int __user *)arg);
458}
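/*
 * Editorial note (illustrative, not part of the original source): the two
 * queue ioctls handled above, as used from user space.  SIOCINQ (FIONREAD)
 * reports bytes readable without blocking, SIOCOUTQ the bytes not yet
 * acknowledged by the peer.  Sketch, assuming a connected TCP socket `fd`
 * and <sys/ioctl.h> plus <linux/sockios.h> included:
 *
 *	int inq, outq;
 *
 *	ioctl(fd, SIOCINQ, &inq);	(roughly rcv_nxt - copied_seq)
 *	ioctl(fd, SIOCOUTQ, &outq);	(write_seq - snd_una)
 */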
459
460
461int tcp_listen_start(struct sock *sk)
462{
463	struct inet_sock *inet = inet_sk(sk);
464	struct tcp_sock *tp = tcp_sk(sk);
465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467	if (rc != 0)
468		return rc;
469
470	sk->sk_max_ack_backlog = 0;
471	sk->sk_ack_backlog = 0;
472	tcp_delack_init(tp);
473
474	/* There is a race window here: we announce ourselves listening,
475	 * but this transition is still not validated by get_port().
476	 * It is OK, because this socket enters the hash table only
477	 * after validation is complete.
478	 */
479	sk->sk_state = TCP_LISTEN;
480	if (!sk->sk_prot->get_port(sk, inet->num)) {
481		inet->sport = htons(inet->num);
482
483		sk_dst_reset(sk);
484		sk->sk_prot->hash(sk);
485
486		return 0;
487	}
488
489	sk->sk_state = TCP_CLOSE;
490	reqsk_queue_destroy(&tp->accept_queue);
491	return -EADDRINUSE;
492}
493
494/*
495 *	This routine closes sockets which have been at least partially
496 *	opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501	struct tcp_sock *tp = tcp_sk(sk);
502	struct listen_sock *lopt;
503	struct request_sock *acc_req;
504	struct request_sock *req;
505	int i;
506
507	tcp_delete_keepalive_timer(sk);
508
509	/* make all the listen_opt local to us */
510	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513	if (lopt->qlen) {
514		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515			while ((req = lopt->syn_table[i]) != NULL) {
516				lopt->syn_table[i] = req->dl_next;
517				lopt->qlen--;
518				reqsk_free(req);
519
520		/* Following specs, it would be better either to send FIN
521		 * (and enter FIN-WAIT-1, it is normal close)
522		 * or to send active reset (abort).
523		 * Certainly, it is pretty dangerous during a synflood, but that is
524		 * a bad justification for our negligence 8)
525		 * To be honest, we are not able to make either
526		 * of the variants now.			--ANK
527		 */
528			}
529		}
530	}
531	BUG_TRAP(!lopt->qlen);
532
533	kfree(lopt);
534
535	while ((req = acc_req) != NULL) {
536		struct sock *child = req->sk;
537
538		acc_req = req->dl_next;
539
540		local_bh_disable();
541		bh_lock_sock(child);
542		BUG_TRAP(!sock_owned_by_user(child));
543		sock_hold(child);
544
545		tcp_disconnect(child, O_NONBLOCK);
546
547		sock_orphan(child);
548
549		atomic_inc(&tcp_orphan_count);
550
551		tcp_destroy_sock(child);
552
553		bh_unlock_sock(child);
554		local_bh_enable();
555		sock_put(child);
556
557		sk_acceptq_removed(sk);
558		__reqsk_free(req);
559	}
560	BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{
565	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566	tp->pushed_seq = tp->write_seq;
567}
568
569static inline int forced_push(struct tcp_sock *tp)
570{
571	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572}
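/*
 * Editorial note (not part of the original source), a worked example of the
 * forced_push() test above: with max_window = 65535 the threshold is
 * 65535 >> 1 = 32767, so a push is forced once write_seq has run more than
 * 32767 bytes ahead of pushed_seq, i.e. once more than half of the largest
 * window the peer ever advertised is sitting unpushed in the write queue.
 */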
573
574static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575			      struct sk_buff *skb)
576{
577	skb->csum = 0;
578	TCP_SKB_CB(skb)->seq = tp->write_seq;
579	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581	TCP_SKB_CB(skb)->sacked = 0;
582	skb_header_release(skb);
583	__skb_queue_tail(&sk->sk_write_queue, skb);
584	sk_charge_skb(sk, skb);
585	if (!sk->sk_send_head)
586		sk->sk_send_head = skb;
587	else if (tp->nonagle&TCP_NAGLE_PUSH)
588		tp->nonagle &= ~TCP_NAGLE_PUSH;
589}
590
591static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592				struct sk_buff *skb)
593{
594	if (flags & MSG_OOB) {
595		tp->urg_mode = 1;
596		tp->snd_up = tp->write_seq;
597		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598	}
599}
600
601static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602			    int mss_now, int nonagle)
603{
604	if (sk->sk_send_head) {
605		struct sk_buff *skb = sk->sk_write_queue.prev;
606		if (!(flags & MSG_MORE) || forced_push(tp))
607			tcp_mark_push(tp, skb);
608		tcp_mark_urg(tp, flags, skb);
609		__tcp_push_pending_frames(sk, tp, mss_now,
610					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611	}
612}
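/*
 * Editorial note (illustrative, not part of the original source): the
 * MSG_MORE handling above maps the user's flag onto TCP_NAGLE_CORK, so a
 * partial write is held back until a later write without MSG_MORE (or a
 * forced push).  Sketch, assuming `fd` is a connected TCP socket and
 * hdr/body are illustrative buffers:
 *
 *	send(fd, hdr, hdr_len, MSG_MORE);	(queued, not pushed yet)
 *	send(fd, body, body_len, 0);		(pushes header and body together)
 */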
613
614static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615			 size_t psize, int flags)
616{
617	struct tcp_sock *tp = tcp_sk(sk);
618	int mss_now, size_goal;
619	int err;
620	ssize_t copied;
621	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623	/* Wait for a connection to finish. */
624	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626			goto out_err;
627
628	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631	size_goal = tp->xmit_size_goal;
632	copied = 0;
633
634	err = -EPIPE;
635	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
636		goto do_error;
637
638	while (psize > 0) {
639		struct sk_buff *skb = sk->sk_write_queue.prev;
640		struct page *page = pages[poffset / PAGE_SIZE];
641		int copy, i, can_coalesce;
642		int offset = poffset % PAGE_SIZE;
643		int size = min_t(size_t, psize, PAGE_SIZE - offset);
644
645		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
646new_segment:
647			if (!sk_stream_memory_free(sk))
648				goto wait_for_sndbuf;
649
650			skb = sk_stream_alloc_pskb(sk, 0, 0,
651						   sk->sk_allocation);
652			if (!skb)
653				goto wait_for_memory;
654
655			skb_entail(sk, tp, skb);
656			copy = size_goal;
657		}
658
659		if (copy > size)
660			copy = size;
661
662		i = skb_shinfo(skb)->nr_frags;
663		can_coalesce = skb_can_coalesce(skb, i, page, offset);
664		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
665			tcp_mark_push(tp, skb);
666			goto new_segment;
667		}
668		if (sk->sk_forward_alloc < copy &&
669		    !sk_stream_mem_schedule(sk, copy, 0))
670			goto wait_for_memory;
671
672		if (can_coalesce) {
673			skb_shinfo(skb)->frags[i - 1].size += copy;
674		} else {
675			get_page(page);
676			skb_fill_page_desc(skb, i, page, offset, copy);
677		}
678
679		skb->len += copy;
680		skb->data_len += copy;
681		skb->truesize += copy;
682		sk->sk_wmem_queued += copy;
683		sk->sk_forward_alloc -= copy;
684		skb->ip_summed = CHECKSUM_HW;
685		tp->write_seq += copy;
686		TCP_SKB_CB(skb)->end_seq += copy;
687		skb_shinfo(skb)->tso_segs = 0;
688
689		if (!copied)
690			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
691
692		copied += copy;
693		poffset += copy;
694		if (!(psize -= copy))
695			goto out;
696
697		if (skb->len < mss_now || (flags & MSG_OOB))
698			continue;
699
700		if (forced_push(tp)) {
701			tcp_mark_push(tp, skb);
702			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
703		} else if (skb == sk->sk_send_head)
704			tcp_push_one(sk, mss_now);
705		continue;
706
707wait_for_sndbuf:
708		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
709wait_for_memory:
710		if (copied)
711			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
712
713		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
714			goto do_error;
715
716		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717		size_goal = tp->xmit_size_goal;
718	}
719
720out:
721	if (copied)
722		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
723	return copied;
724
725do_error:
726	if (copied)
727		goto out;
728out_err:
729	return sk_stream_error(sk, flags, err);
730}
731
732ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
733		     size_t size, int flags)
734{
735	ssize_t res;
736	struct sock *sk = sock->sk;
737
738#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
739
740	if (!(sk->sk_route_caps & NETIF_F_SG) ||
741	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
742		return sock_no_sendpage(sock, page, offset, size, flags);
743
744#undef TCP_ZC_CSUM_FLAGS
745
746	lock_sock(sk);
747	TCP_CHECK_TIMER(sk);
748	res = do_tcp_sendpages(sk, &page, offset, size, flags);
749	TCP_CHECK_TIMER(sk);
750	release_sock(sk);
751	return res;
752}
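/*
 * Editorial note (illustrative, not part of the original source):
 * tcp_sendpage() above is the page-based path reached by sendfile() on a
 * TCP socket; when the route's device lacks SG or checksum offload it
 * falls back to sock_no_sendpage().  User-space sketch, assuming `sock_fd`
 * is a connected TCP socket, `file_fd` an open file and <sys/sendfile.h>
 * included:
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, count);
 */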
753
754#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
755#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
756
757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
758{
759	int tmp = tp->mss_cache;
760
761	if (sk->sk_route_caps & NETIF_F_SG) {
762		if (sk->sk_route_caps & NETIF_F_TSO)
763			tmp = 0;
764		else {
765			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
766
767			if (tmp >= pgbreak &&
768			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
769				tmp = pgbreak;
770		}
771	}
772
773	return tmp;
774}
775
776int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
777		size_t size)
778{
779	struct iovec *iov;
780	struct tcp_sock *tp = tcp_sk(sk);
781	struct sk_buff *skb;
782	int iovlen, flags;
783	int mss_now, size_goal;
784	int err, copied;
785	long timeo;
786
787	lock_sock(sk);
788	TCP_CHECK_TIMER(sk);
789
790	flags = msg->msg_flags;
791	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
792
793	/* Wait for a connection to finish. */
794	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
795		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
796			goto out_err;
797
798	/* This should be in poll */
799	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
800
801	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802	size_goal = tp->xmit_size_goal;
803
804	/* Ok commence sending. */
805	iovlen = msg->msg_iovlen;
806	iov = msg->msg_iov;
807	copied = 0;
808
809	err = -EPIPE;
810	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
811		goto do_error;
812
813	while (--iovlen >= 0) {
814		int seglen = iov->iov_len;
815		unsigned char __user *from = iov->iov_base;
816
817		iov++;
818
819		while (seglen > 0) {
820			int copy;
821
822			skb = sk->sk_write_queue.prev;
823
824			if (!sk->sk_send_head ||
825			    (copy = size_goal - skb->len) <= 0) {
826
827new_segment:
828				/* Allocate new segment. If the interface is SG,
829				 * allocate skb fitting to single page.
830				 */
831				if (!sk_stream_memory_free(sk))
832					goto wait_for_sndbuf;
833
834				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
835							   0, sk->sk_allocation);
836				if (!skb)
837					goto wait_for_memory;
838
839				/*
840				 * Check whether we can use HW checksum.
841				 */
842				if (sk->sk_route_caps &
843				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
844				     NETIF_F_HW_CSUM))
845					skb->ip_summed = CHECKSUM_HW;
846
847				skb_entail(sk, tp, skb);
848				copy = size_goal;
849			}
850
851			/* Try to append data to the end of skb. */
852			if (copy > seglen)
853				copy = seglen;
854
855			/* Where to copy to? */
856			if (skb_tailroom(skb) > 0) {
857				/* We have some space in skb head. Superb! */
858				if (copy > skb_tailroom(skb))
859					copy = skb_tailroom(skb);
860				if ((err = skb_add_data(skb, from, copy)) != 0)
861					goto do_fault;
862			} else {
863				int merge = 0;
864				int i = skb_shinfo(skb)->nr_frags;
865				struct page *page = TCP_PAGE(sk);
866				int off = TCP_OFF(sk);
867
868				if (skb_can_coalesce(skb, i, page, off) &&
869				    off != PAGE_SIZE) {
870					/* We can extend the last page
871					 * fragment. */
872					merge = 1;
873				} else if (i == MAX_SKB_FRAGS ||
874					   (!i &&
875					   !(sk->sk_route_caps & NETIF_F_SG))) {
876					/* Need to add new fragment and cannot
877					 * do this because interface is non-SG,
878					 * or because all the page slots are
879					 * busy. */
880					tcp_mark_push(tp, skb);
881					goto new_segment;
882				} else if (page) {
883					if (off == PAGE_SIZE) {
884						put_page(page);
885						TCP_PAGE(sk) = page = NULL;
886					}
887				}
888
889				if (!page) {
890					/* Allocate new cache page. */
891					if (!(page = sk_stream_alloc_page(sk)))
892						goto wait_for_memory;
893					off = 0;
894				}
895
896				if (copy > PAGE_SIZE - off)
897					copy = PAGE_SIZE - off;
898
899				/* Time to copy data. We are close to
900				 * the end! */
901				err = skb_copy_to_page(sk, from, skb, page,
902						       off, copy);
903				if (err) {
904					/* If this page was new, give it to the
905					 * socket so it does not get leaked.
906					 */
907					if (!TCP_PAGE(sk)) {
908						TCP_PAGE(sk) = page;
909						TCP_OFF(sk) = 0;
910					}
911					goto do_error;
912				}
913
914				/* Update the skb. */
915				if (merge) {
916					skb_shinfo(skb)->frags[i - 1].size +=
917									copy;
918				} else {
919					skb_fill_page_desc(skb, i, page, off, copy);
920					if (TCP_PAGE(sk)) {
921						get_page(page);
922					} else if (off + copy < PAGE_SIZE) {
923						get_page(page);
924						TCP_PAGE(sk) = page;
925					}
926				}
927
928				TCP_OFF(sk) = off + copy;
929			}
930
931			if (!copied)
932				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
933
934			tp->write_seq += copy;
935			TCP_SKB_CB(skb)->end_seq += copy;
936			skb_shinfo(skb)->tso_segs = 0;
937
938			from += copy;
939			copied += copy;
940			if ((seglen -= copy) == 0 && iovlen == 0)
941				goto out;
942
943			if (skb->len < mss_now || (flags & MSG_OOB))
944				continue;
945
946			if (forced_push(tp)) {
947				tcp_mark_push(tp, skb);
948				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
949			} else if (skb == sk->sk_send_head)
950				tcp_push_one(sk, mss_now);
951			continue;
952
953wait_for_sndbuf:
954			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
955wait_for_memory:
956			if (copied)
957				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
958
959			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
960				goto do_error;
961
962			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963			size_goal = tp->xmit_size_goal;
964		}
965	}
966
967out:
968	if (copied)
969		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
970	TCP_CHECK_TIMER(sk);
971	release_sock(sk);
972	return copied;
973
974do_fault:
975	if (!skb->len) {
976		if (sk->sk_send_head == skb)
977			sk->sk_send_head = NULL;
978		__skb_unlink(skb, skb->list);
979		sk_stream_free_skb(sk, skb);
980	}
981
982do_error:
983	if (copied)
984		goto out;
985out_err:
986	err = sk_stream_error(sk, flags, err);
987	TCP_CHECK_TIMER(sk);
988	release_sock(sk);
989	return err;
990}
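/*
 * Editorial note (illustrative, not part of the original source):
 * tcp_sendmsg() above consumes the iovec handed in by sendmsg()/writev(),
 * copying it into linear skb space or page fragments as described in the
 * code.  User-space sketch with a two-element gather list (all names
 * illustrative):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdr_len  },
 *		{ .iov_base = body, .iov_len = body_len },
 *	};
 *	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *	sendmsg(fd, &mh, 0);
 */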
991
992/*
993 *	Handle reading urgent data. BSD has very simple semantics for
994 *	this, no blocking and very strange errors 8)
995 */
996
997static int tcp_recv_urg(struct sock *sk, long timeo,
998			struct msghdr *msg, int len, int flags,
999			int *addr_len)
1000{
1001	struct tcp_sock *tp = tcp_sk(sk);
1002
1003	/* No URG data to read. */
1004	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1005	    tp->urg_data == TCP_URG_READ)
1006		return -EINVAL;	/* Yes this is right ! */
1007
1008	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1009		return -ENOTCONN;
1010
1011	if (tp->urg_data & TCP_URG_VALID) {
1012		int err = 0;
1013		char c = tp->urg_data;
1014
1015		if (!(flags & MSG_PEEK))
1016			tp->urg_data = TCP_URG_READ;
1017
1018		/* Read urgent data. */
1019		msg->msg_flags |= MSG_OOB;
1020
1021		if (len > 0) {
1022			if (!(flags & MSG_TRUNC))
1023				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1024			len = 1;
1025		} else
1026			msg->msg_flags |= MSG_TRUNC;
1027
1028		return err ? -EFAULT : len;
1029	}
1030
1031	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1032		return 0;
1033
1034	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1035	 * the available implementations agree in this case:
1036	 * this call should never block, independent of the
1037	 * blocking state of the socket.
1038	 * Mike <pall@rz.uni-karlsruhe.de>
1039	 */
1040	return -EAGAIN;
1041}
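/*
 * Editorial note (illustrative, not part of the original source): the
 * urgent-data rules implemented above, from user space.  recv(..., MSG_OOB)
 * never blocks: it returns the single urgent byte, fails with EINVAL if
 * SO_OOBINLINE is set or the byte was already consumed, returns 0 after
 * shutdown, and fails with EAGAIN if nothing urgent has arrived.  Sketch,
 * assuming `fd` is a connected TCP socket:
 *
 *	char oob;
 *
 *	if (recv(fd, &oob, 1, MSG_OOB) == 1)
 *		handle_urgent(oob);	(handle_urgent() is a placeholder)
 */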
1042
1043/* Clean up the receive buffer for full frames taken by the user,
1044 * then send an ACK if necessary.  COPIED is the number of bytes
1045 * tcp_recvmsg has given to the user so far; it speeds up the
1046 * calculation of whether or not we must ACK for the sake of
1047 * a window update.
1048 */
1049static void cleanup_rbuf(struct sock *sk, int copied)
1050{
1051	struct tcp_sock *tp = tcp_sk(sk);
1052	int time_to_ack = 0;
1053
1054#if TCP_DEBUG
1055	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1056
1057	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1058#endif
1059
1060	if (tcp_ack_scheduled(tp)) {
1061		   /* Delayed ACKs frequently hit locked sockets during bulk
1062		    * receive. */
1063		if (tp->ack.blocked ||
1064		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1065		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1066		    /*
1067		     * If this read emptied read buffer, we send ACK, if
1068		     * connection is not bidirectional, user drained
1069		     * receive buffer and there was a small segment
1070		     * in queue.
1071		     */
1072		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1073		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1074			time_to_ack = 1;
1075	}
1076
1077	/* We send an ACK if we can now advertise a non-zero window
1078	 * which has been raised "significantly".
1079	 *
1080	 * Even if window raised up to infinity, do not send window open ACK
1081	 * in states, where we will not receive more. It is useless.
1082	 */
1083	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1084		__u32 rcv_window_now = tcp_receive_window(tp);
1085
1086		/* Optimize, __tcp_select_window() is not cheap. */
1087		if (2*rcv_window_now <= tp->window_clamp) {
1088			__u32 new_window = __tcp_select_window(sk);
1089
1090			/* Send ACK now, if this read freed lots of space
1091			 * in our buffer. Certainly, new_window is new window.
1092			 * We can advertise it now, if it is not less than current one.
1093			 * "Lots" means "at least twice" here.
1094			 */
1095			if (new_window && new_window >= 2 * rcv_window_now)
1096				time_to_ack = 1;
1097		}
1098	}
1099	if (time_to_ack)
1100		tcp_send_ack(sk);
1101}
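/*
 * Editorial note (not part of the original source), a worked example of the
 * "significant raise" test above: with window_clamp = 65535 and a currently
 * advertised window of 16384 bytes, the 2 * 16384 <= 65535 check passes, so
 * a read that lets __tcp_select_window() offer at least 2 * 16384 = 32768
 * bytes triggers an immediate window-update ACK.
 */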
1102
1103static void tcp_prequeue_process(struct sock *sk)
1104{
1105	struct sk_buff *skb;
1106	struct tcp_sock *tp = tcp_sk(sk);
1107
1108	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1109
1110	/* RX process wants to run with disabled BHs, though it is not
1111	 * necessary */
1112	local_bh_disable();
1113	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1114		sk->sk_backlog_rcv(sk, skb);
1115	local_bh_enable();
1116
1117	/* Clear memory counter. */
1118	tp->ucopy.memory = 0;
1119}
1120
1121static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1122{
1123	struct sk_buff *skb;
1124	u32 offset;
1125
1126	skb_queue_walk(&sk->sk_receive_queue, skb) {
1127		offset = seq - TCP_SKB_CB(skb)->seq;
1128		if (skb->h.th->syn)
1129			offset--;
1130		if (offset < skb->len || skb->h.th->fin) {
1131			*off = offset;
1132			return skb;
1133		}
1134	}
1135	return NULL;
1136}
1137
1138/*
1139 * This routine provides an alternative to tcp_recvmsg() for routines
1140 * that would like to handle copying from skbuffs directly in 'sendfile'
1141 * fashion.
1142 * Note:
1143 *	- It is assumed that the socket was locked by the caller.
1144 *	- The routine does not block.
1145 *	- At present, there is no support for reading OOB data
1146 *	  or for 'peeking' the socket using this routine
1147 *	  (although both would be easy to implement).
1148 */
1149int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1150		  sk_read_actor_t recv_actor)
1151{
1152	struct sk_buff *skb;
1153	struct tcp_sock *tp = tcp_sk(sk);
1154	u32 seq = tp->copied_seq;
1155	u32 offset;
1156	int copied = 0;
1157
1158	if (sk->sk_state == TCP_LISTEN)
1159		return -ENOTCONN;
1160	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1161		if (offset < skb->len) {
1162			size_t used, len;
1163
1164			len = skb->len - offset;
1165			/* Stop reading if we hit a patch of urgent data */
1166			if (tp->urg_data) {
1167				u32 urg_offset = tp->urg_seq - seq;
1168				if (urg_offset < len)
1169					len = urg_offset;
1170				if (!len)
1171					break;
1172			}
1173			used = recv_actor(desc, skb, offset, len);
1174			if (used <= len) {
1175				seq += used;
1176				copied += used;
1177				offset += used;
1178			}
1179			if (offset != skb->len)
1180				break;
1181		}
1182		if (skb->h.th->fin) {
1183			sk_eat_skb(sk, skb);
1184			++seq;
1185			break;
1186		}
1187		sk_eat_skb(sk, skb);
1188		if (!desc->count)
1189			break;
1190	}
1191	tp->copied_seq = seq;
1192
1193	tcp_rcv_space_adjust(sk);
1194
1195	/* Clean up data we have read: This will do ACK frames. */
1196	if (copied)
1197		cleanup_rbuf(sk, copied);
1198	return copied;
1199}
1200
1201/*
1202 *	This routine copies from a sock struct into the user buffer.
1203 *
1204 *	Technical note: in 2.3 we work on _locked_ socket, so that
1205 *	tricks with *seq access order and skb->users are not required.
1206 *	Probably, code can be easily improved even more.
1207 */
1208
1209int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1210		size_t len, int nonblock, int flags, int *addr_len)
1211{
1212	struct tcp_sock *tp = tcp_sk(sk);
1213	int copied = 0;
1214	u32 peek_seq;
1215	u32 *seq;
1216	unsigned long used;
1217	int err;
1218	int target;		/* Read at least this many bytes */
1219	long timeo;
1220	struct task_struct *user_recv = NULL;
1221
1222	lock_sock(sk);
1223
1224	TCP_CHECK_TIMER(sk);
1225
1226	err = -ENOTCONN;
1227	if (sk->sk_state == TCP_LISTEN)
1228		goto out;
1229
1230	timeo = sock_rcvtimeo(sk, nonblock);
1231
1232	/* Urgent data needs to be handled specially. */
1233	if (flags & MSG_OOB)
1234		goto recv_urg;
1235
1236	seq = &tp->copied_seq;
1237	if (flags & MSG_PEEK) {
1238		peek_seq = tp->copied_seq;
1239		seq = &peek_seq;
1240	}
1241
1242	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1243
1244	do {
1245		struct sk_buff *skb;
1246		u32 offset;
1247
1248		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1249		if (tp->urg_data && tp->urg_seq == *seq) {
1250			if (copied)
1251				break;
1252			if (signal_pending(current)) {
1253				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1254				break;
1255			}
1256		}
1257
1258		/* Next get a buffer. */
1259
1260		skb = skb_peek(&sk->sk_receive_queue);
1261		do {
1262			if (!skb)
1263				break;
1264
1265			/* Now that we have two receive queues this
1266			 * shouldn't happen.
1267			 */
1268			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1269				printk(KERN_INFO "recvmsg bug: copied %X "
1270				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1271				break;
1272			}
1273			offset = *seq - TCP_SKB_CB(skb)->seq;
1274			if (skb->h.th->syn)
1275				offset--;
1276			if (offset < skb->len)
1277				goto found_ok_skb;
1278			if (skb->h.th->fin)
1279				goto found_fin_ok;
1280			BUG_TRAP(flags & MSG_PEEK);
1281			skb = skb->next;
1282		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1283
1284		/* Well, if we have backlog, try to process it now. */
1285
1286		if (copied >= target && !sk->sk_backlog.tail)
1287			break;
1288
1289		if (copied) {
1290			if (sk->sk_err ||
1291			    sk->sk_state == TCP_CLOSE ||
1292			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1293			    !timeo ||
1294			    signal_pending(current) ||
1295			    (flags & MSG_PEEK))
1296				break;
1297		} else {
1298			if (sock_flag(sk, SOCK_DONE))
1299				break;
1300
1301			if (sk->sk_err) {
1302				copied = sock_error(sk);
1303				break;
1304			}
1305
1306			if (sk->sk_shutdown & RCV_SHUTDOWN)
1307				break;
1308
1309			if (sk->sk_state == TCP_CLOSE) {
1310				if (!sock_flag(sk, SOCK_DONE)) {
1311					/* This occurs when user tries to read
1312					 * from a never-connected socket.
1313					 */
1314					copied = -ENOTCONN;
1315					break;
1316				}
1317				break;
1318			}
1319
1320			if (!timeo) {
1321				copied = -EAGAIN;
1322				break;
1323			}
1324
1325			if (signal_pending(current)) {
1326				copied = sock_intr_errno(timeo);
1327				break;
1328			}
1329		}
1330
1331		cleanup_rbuf(sk, copied);
1332
1333		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1334			/* Install new reader */
1335			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1336				user_recv = current;
1337				tp->ucopy.task = user_recv;
1338				tp->ucopy.iov = msg->msg_iov;
1339			}
1340
1341			tp->ucopy.len = len;
1342
1343			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1344				 (flags & (MSG_PEEK | MSG_TRUNC)));
1345
1346			/* Ugly... If prequeue is not empty, we have to
1347			 * process it before releasing socket, otherwise
1348			 * order will be broken at second iteration.
1349			 * More elegant solution is required!!!
1350			 *
1351			 * Look: we have the following (pseudo)queues:
1352			 *
1353			 * 1. packets in flight
1354			 * 2. backlog
1355			 * 3. prequeue
1356			 * 4. receive_queue
1357			 *
1358			 * Each queue can be processed only if the next ones
1359			 * are empty. At this point we have empty receive_queue.
1360			 * But prequeue _can_ be not empty after 2nd iteration,
1361			 * when we jumped to start of loop because backlog
1362			 * processing added something to receive_queue.
1363			 * We cannot release_sock(), because backlog contains
1364			 * packets arrived _after_ prequeued ones.
1365			 *
1366			 * In short, the algorithm is clear --- process all
1367			 * the queues in order. We could do it more directly,
1368			 * requeueing packets from backlog to prequeue, if it
1369			 * is not empty. It is more elegant, but eats cycles,
1370			 * unfortunately.
1371			 */
1372			if (skb_queue_len(&tp->ucopy.prequeue))
1373				goto do_prequeue;
1374
1375			/* __ Set realtime policy in scheduler __ */
1376		}
1377
1378		if (copied >= target) {
1379			/* Do not sleep, just process backlog. */
1380			release_sock(sk);
1381			lock_sock(sk);
1382		} else
1383			sk_wait_data(sk, &timeo);
1384
1385		if (user_recv) {
1386			int chunk;
1387
1388			/* __ Restore normal policy in scheduler __ */
1389
1390			if ((chunk = len - tp->ucopy.len) != 0) {
1391				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1392				len -= chunk;
1393				copied += chunk;
1394			}
1395
1396			if (tp->rcv_nxt == tp->copied_seq &&
1397			    skb_queue_len(&tp->ucopy.prequeue)) {
1398do_prequeue:
1399				tcp_prequeue_process(sk);
1400
1401				if ((chunk = len - tp->ucopy.len) != 0) {
1402					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1403					len -= chunk;
1404					copied += chunk;
1405				}
1406			}
1407		}
1408		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1409			if (net_ratelimit())
1410				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1411				       current->comm, current->pid);
1412			peek_seq = tp->copied_seq;
1413		}
1414		continue;
1415
1416	found_ok_skb:
1417		/* Ok so how much can we use? */
1418		used = skb->len - offset;
1419		if (len < used)
1420			used = len;
1421
1422		/* Do we have urgent data here? */
1423		if (tp->urg_data) {
1424			u32 urg_offset = tp->urg_seq - *seq;
1425			if (urg_offset < used) {
1426				if (!urg_offset) {
1427					if (!sock_flag(sk, SOCK_URGINLINE)) {
1428						++*seq;
1429						offset++;
1430						used--;
1431						if (!used)
1432							goto skip_copy;
1433					}
1434				} else
1435					used = urg_offset;
1436			}
1437		}
1438
1439		if (!(flags & MSG_TRUNC)) {
1440			err = skb_copy_datagram_iovec(skb, offset,
1441						      msg->msg_iov, used);
1442			if (err) {
1443				/* Exception. Bailout! */
1444				if (!copied)
1445					copied = -EFAULT;
1446				break;
1447			}
1448		}
1449
1450		*seq += used;
1451		copied += used;
1452		len -= used;
1453
1454		tcp_rcv_space_adjust(sk);
1455
1456skip_copy:
1457		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1458			tp->urg_data = 0;
1459			tcp_fast_path_check(sk, tp);
1460		}
1461		if (used + offset < skb->len)
1462			continue;
1463
1464		if (skb->h.th->fin)
1465			goto found_fin_ok;
1466		if (!(flags & MSG_PEEK))
1467			sk_eat_skb(sk, skb);
1468		continue;
1469
1470	found_fin_ok:
1471		/* Process the FIN. */
1472		++*seq;
1473		if (!(flags & MSG_PEEK))
1474			sk_eat_skb(sk, skb);
1475		break;
1476	} while (len > 0);
1477
1478	if (user_recv) {
1479		if (skb_queue_len(&tp->ucopy.prequeue)) {
1480			int chunk;
1481
1482			tp->ucopy.len = copied > 0 ? len : 0;
1483
1484			tcp_prequeue_process(sk);
1485
1486			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1487				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1488				len -= chunk;
1489				copied += chunk;
1490			}
1491		}
1492
1493		tp->ucopy.task = NULL;
1494		tp->ucopy.len = 0;
1495	}
1496
1497	/* According to UNIX98, msg_name/msg_namelen are ignored
1498	 * on a connected socket. I was just happy when I found this 8) --ANK
1499	 */
1500
1501	/* Clean up data we have read: This will do ACK frames. */
1502	cleanup_rbuf(sk, copied);
1503
1504	TCP_CHECK_TIMER(sk);
1505	release_sock(sk);
1506	return copied;
1507
1508out:
1509	TCP_CHECK_TIMER(sk);
1510	release_sock(sk);
1511	return err;
1512
1513recv_urg:
1514	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1515	goto out;
1516}
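/*
 * Editorial note (illustrative, not part of the original source): the
 * `target` computed above via sock_rcvlowat().  A plain recv() may return
 * as soon as any data is queued, while MSG_WAITALL (or a raised SO_RCVLOWAT)
 * keeps tcp_recvmsg() waiting until at least `target` bytes were copied,
 * barring EOF, errors, signals or urgent data.  Sketch, assuming `fd` is a
 * connected, blocking TCP socket:
 *
 *	char buf[512];
 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *	(n == 512 here unless the wait was cut short)
 */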
1517
1518/*
1519 *	State processing on a close. This implements the state shift for
1520 *	sending our FIN frame. Note that we only send a FIN for some
1521 *	states. A shutdown() may have already sent the FIN, or we may be
1522 *	closed.
1523 */
1524
1525static unsigned char new_state[16] = {
1526  /* current state:        new state:      action:	*/
1527  /* (Invalid)		*/ TCP_CLOSE,
1528  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1529  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1530  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1531  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1532  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1533  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1534  /* TCP_CLOSE		*/ TCP_CLOSE,
1535  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1536  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1537  /* TCP_LISTEN		*/ TCP_CLOSE,
1538  /* TCP_CLOSING	*/ TCP_CLOSING,
1539};
1540
1541static int tcp_close_state(struct sock *sk)
1542{
1543	int next = (int)new_state[sk->sk_state];
1544	int ns = next & TCP_STATE_MASK;
1545
1546	tcp_set_state(sk, ns);
1547
1548	return next & TCP_ACTION_FIN;
1549}
1550
1551/*
1552 *	Shutdown the sending side of a connection. Much like close except
1553 *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1554 */
1555
1556void tcp_shutdown(struct sock *sk, int how)
1557{
1558	/*	We need to grab some memory, and put together a FIN,
1559	 *	and then put it into the queue to be sent.
1560	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1561	 */
1562	if (!(how & SEND_SHUTDOWN))
1563		return;
1564
1565	/* If we've already sent a FIN, or it's a closed state, skip this. */
1566	if ((1 << sk->sk_state) &
1567	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1568	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1569		/* Clear out any half completed packets.  FIN if needed. */
1570		if (tcp_close_state(sk))
1571			tcp_send_fin(sk);
1572	}
1573}
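/*
 * Editorial note (illustrative, not part of the original source): the
 * SEND_SHUTDOWN case handled above is what shutdown(fd, SHUT_WR) triggers;
 * a FIN is queued, but the receive side stays usable, which is the usual
 * way to signal end-of-request while still reading the peer's reply.
 * Sketch, assuming `fd` is a connected TCP socket:
 *
 *	send(fd, req, req_len, 0);
 *	shutdown(fd, SHUT_WR);			(our FIN; peer sees EOF)
 *	while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
 *		...				(replies still flow in)
 */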
1574
1575/*
1576 * At this point, there should be no process reference to this
1577 * socket, and thus no user references at all.  Therefore we
1578 * can assume the socket waitqueue is inactive and nobody will
1579 * try to jump onto it.
1580 */
1581void tcp_destroy_sock(struct sock *sk)
1582{
1583	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1584	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1585
1586	/* It cannot be in hash table! */
1587	BUG_TRAP(sk_unhashed(sk));
1588
1589	/* If it has a non-zero inet_sk(sk)->num, it must be bound */
1590	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1591
1592	sk->sk_prot->destroy(sk);
1593
1594	sk_stream_kill_queues(sk);
1595
1596	xfrm_sk_free_policy(sk);
1597
1598#ifdef INET_REFCNT_DEBUG
1599	if (atomic_read(&sk->sk_refcnt) != 1) {
1600		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1601		       sk, atomic_read(&sk->sk_refcnt));
1602	}
1603#endif
1604
1605	atomic_dec(&tcp_orphan_count);
1606	sock_put(sk);
1607}
1608
1609void tcp_close(struct sock *sk, long timeout)
1610{
1611	struct sk_buff *skb;
1612	int data_was_unread = 0;
1613
1614	lock_sock(sk);
1615	sk->sk_shutdown = SHUTDOWN_MASK;
1616
1617	if (sk->sk_state == TCP_LISTEN) {
1618		tcp_set_state(sk, TCP_CLOSE);
1619
1620		/* Special case. */
1621		tcp_listen_stop(sk);
1622
1623		goto adjudge_to_death;
1624	}
1625
1626	/*  We need to flush the recv. buffs.  We do this only on the
1627	 *  descriptor close, not protocol-sourced closes, because the
1628	 *  reader process may not have drained the data yet!
1629	 */
1630	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1631		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1632			  skb->h.th->fin;
1633		data_was_unread += len;
1634		__kfree_skb(skb);
1635	}
1636
1637	sk_stream_mem_reclaim(sk);
1638
1639	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1640	 * 3.10, we send a RST here because data was lost.  To
1641	 * witness the awful effects of the old behavior of always
1642	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1643	 * a bulk GET in an FTP client, suspend the process, wait
1644	 * for the client to advertise a zero window, then kill -9
1645	 * the FTP client, wheee...  Note: timeout is always zero
1646	 * in such a case.
1647	 */
1648	if (data_was_unread) {
1649		/* Unread data was tossed, zap the connection. */
1650		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1651		tcp_set_state(sk, TCP_CLOSE);
1652		tcp_send_active_reset(sk, GFP_KERNEL);
1653	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1654		/* Check zero linger _after_ checking for unread data. */
1655		sk->sk_prot->disconnect(sk, 0);
1656		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1657	} else if (tcp_close_state(sk)) {
1658		/* We FIN if the application ate all the data before
1659		 * zapping the connection.
1660		 */
1661
1662		/* RED-PEN. Formally speaking, we have broken TCP state
1663		 * machine. State transitions:
1664		 *
1665		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1666		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1667		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1668		 *
1669		 * are legal only when FIN has been sent (i.e. in window),
1670		 * rather than queued out of window. Purists blame.
1671		 *
1672		 * F.e. "RFC state" is ESTABLISHED,
1673		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1674		 *
1675		 * The visible deviations are that sometimes
1676		 * we enter time-wait state, when it is not required really
1677		 * (harmless), do not send active resets, when they are
1678		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1679		 * they look as CLOSING or LAST_ACK for Linux)
1680		 * Probably, I missed some more holelets.
1681		 * 						--ANK
1682		 */
1683		tcp_send_fin(sk);
1684	}
1685
1686	sk_stream_wait_close(sk, timeout);
1687
1688adjudge_to_death:
1689	/* It is the last release_sock in its life. It will remove backlog. */
1690	release_sock(sk);
1691
1692
1693	/* Now socket is owned by kernel and we acquire BH lock
1694	   to finish close. No need to check for user refs.
1695	 */
1696	local_bh_disable();
1697	bh_lock_sock(sk);
1698	BUG_TRAP(!sock_owned_by_user(sk));
1699
1700	sock_hold(sk);
1701	sock_orphan(sk);
1702
1703	/*	This is a (useful) BSD violation of the RFC. There is a
1704	 *	problem with TCP as specified in that the other end could
1705	 *	keep a socket open forever with no application left at this end.
1706	 *	We use a 3 minute timeout (about the same as BSD) then kill
1707	 *	our end. If they send after that then tough - BUT: long enough
1708	 *	that we won't make the old 4*rto = almost no time - whoops
1709	 *	reset mistake.
1710	 *
1711	 *	Nope, it was not a mistake. It is really the desired behaviour
1712	 *	f.e. on http servers, when such sockets are useless, but
1713	 *	consume significant resources. Let's do it with special
1714	 *	linger2	option.					--ANK
1715	 */
1716
1717	if (sk->sk_state == TCP_FIN_WAIT2) {
1718		struct tcp_sock *tp = tcp_sk(sk);
1719		if (tp->linger2 < 0) {
1720			tcp_set_state(sk, TCP_CLOSE);
1721			tcp_send_active_reset(sk, GFP_ATOMIC);
1722			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1723		} else {
1724			int tmo = tcp_fin_time(tp);
1725
1726			if (tmo > TCP_TIMEWAIT_LEN) {
1727				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1728			} else {
1729				atomic_inc(&tcp_orphan_count);
1730				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1731				goto out;
1732			}
1733		}
1734	}
1735	if (sk->sk_state != TCP_CLOSE) {
1736		sk_stream_mem_reclaim(sk);
1737		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1738		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1739		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1740			if (net_ratelimit())
1741				printk(KERN_INFO "TCP: too many orphaned "
1742				       "sockets\n");
1743			tcp_set_state(sk, TCP_CLOSE);
1744			tcp_send_active_reset(sk, GFP_ATOMIC);
1745			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1746		}
1747	}
1748	atomic_inc(&tcp_orphan_count);
1749
1750	if (sk->sk_state == TCP_CLOSE)
1751		tcp_destroy_sock(sk);
1752	/* Otherwise, socket is reprieved until protocol close. */
1753
1754out:
1755	bh_unlock_sock(sk);
1756	local_bh_enable();
1757	sock_put(sk);
1758}
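/*
 * Editorial note (illustrative, not part of the original source): two of
 * the tcp_close() branches above as an application selects them.  Closing
 * with unread receive data, or closing with SO_LINGER enabled and a linger
 * time of 0, aborts the connection with a RST instead of the normal FIN
 * handshake (the RFC 793 ABORT call mentioned in the changelog), and no
 * TIME_WAIT state is left behind.  Sketch, assuming `fd` is a connected
 * TCP socket:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);		(disconnect path: RST, no TIME_WAIT)
 */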
1759
1760/* These states need RST on ABORT according to RFC793 */
1761
1762static inline int tcp_need_reset(int state)
1763{
1764	return (1 << state) &
1765	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1766		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1767}
1768
1769int tcp_disconnect(struct sock *sk, int flags)
1770{
1771	struct inet_sock *inet = inet_sk(sk);
1772	struct tcp_sock *tp = tcp_sk(sk);
1773	int err = 0;
1774	int old_state = sk->sk_state;
1775
1776	if (old_state != TCP_CLOSE)
1777		tcp_set_state(sk, TCP_CLOSE);
1778
1779	/* ABORT function of RFC793 */
1780	if (old_state == TCP_LISTEN) {
1781		tcp_listen_stop(sk);
1782	} else if (tcp_need_reset(old_state) ||
1783		   (tp->snd_nxt != tp->write_seq &&
1784		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1785		/* The last check adjusts for discrepancy of Linux wrt. RFC
1786		 * states
1787		 */
1788		tcp_send_active_reset(sk, gfp_any());
1789		sk->sk_err = ECONNRESET;
1790	} else if (old_state == TCP_SYN_SENT)
1791		sk->sk_err = ECONNRESET;
1792
1793	tcp_clear_xmit_timers(sk);
1794	__skb_queue_purge(&sk->sk_receive_queue);
1795	sk_stream_writequeue_purge(sk);
1796	__skb_queue_purge(&tp->out_of_order_queue);
1797
1798	inet->dport = 0;
1799
1800	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1801		inet_reset_saddr(sk);
1802
1803	sk->sk_shutdown = 0;
1804	sock_reset_flag(sk, SOCK_DONE);
1805	tp->srtt = 0;
1806	if ((tp->write_seq += tp->max_window + 2) == 0)
1807		tp->write_seq = 1;
1808	tp->backoff = 0;
1809	tp->snd_cwnd = 2;
1810	tp->probes_out = 0;
1811	tp->packets_out = 0;
1812	tp->snd_ssthresh = 0x7fffffff;
1813	tp->snd_cwnd_cnt = 0;
1814	tcp_set_ca_state(tp, TCP_CA_Open);
1815	tcp_clear_retrans(tp);
1816	tcp_delack_init(tp);
1817	sk->sk_send_head = NULL;
1818	tp->rx_opt.saw_tstamp = 0;
1819	tcp_sack_reset(&tp->rx_opt);
1820	__sk_dst_reset(sk);
1821
1822	BUG_TRAP(!inet->num || tp->bind_hash);
1823
1824	sk->sk_error_report(sk);
1825	return err;
1826}
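/*
 * Editorial note (illustrative, not part of the original source):
 * tcp_disconnect() above is reachable from user space; on Linux,
 * connect()ing a TCP socket to an address with sa_family set to AF_UNSPEC
 * dissolves the association through the protocol's disconnect op.  Hedged
 * sketch, assuming `fd` is a connected TCP socket:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	(tears the association down)
 */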
1827
1828/*
1829 *	Wait for an incoming connection, avoid race
1830 *	conditions. This must be called with the socket locked.
1831 */
1832static int wait_for_connect(struct sock *sk, long timeo)
1833{
1834	struct tcp_sock *tp = tcp_sk(sk);
1835	DEFINE_WAIT(wait);
1836	int err;
1837
1838	/*
1839	 * True wake-one mechanism for incoming connections: only
1840	 * one process gets woken up, not the 'whole herd'.
1841	 * Since we do not 'race & poll' for established sockets
1842	 * anymore, the common case will execute the loop only once.
1843	 *
1844	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1845	 * after any current non-exclusive waiters, and we know that
1846	 * it will always _stay_ after any new non-exclusive waiters
1847	 * because all non-exclusive waiters are added at the
1848	 * beginning of the wait-queue. As such, it's ok to "drop"
1849	 * our exclusiveness temporarily when we get woken up without
1850	 * having to remove and re-insert us on the wait queue.
1851	 */
1852	for (;;) {
1853		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1854					  TASK_INTERRUPTIBLE);
1855		release_sock(sk);
1856		if (reqsk_queue_empty(&tp->accept_queue))
1857			timeo = schedule_timeout(timeo);
1858		lock_sock(sk);
1859		err = 0;
1860		if (!reqsk_queue_empty(&tp->accept_queue))
1861			break;
1862		err = -EINVAL;
1863		if (sk->sk_state != TCP_LISTEN)
1864			break;
1865		err = sock_intr_errno(timeo);
1866		if (signal_pending(current))
1867			break;
1868		err = -EAGAIN;
1869		if (!timeo)
1870			break;
1871	}
1872	finish_wait(sk->sk_sleep, &wait);
1873	return err;
1874}
1875
1876/*
1877 *	This will accept the next outstanding connection.
1878 */
1879
1880struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1881{
1882	struct tcp_sock *tp = tcp_sk(sk);
1883	struct sock *newsk;
1884	int error;
1885
1886	lock_sock(sk);
1887
1888	/* We need to make sure that this socket is listening,
1889	 * and that it has something pending.
1890	 */
1891	error = -EINVAL;
1892	if (sk->sk_state != TCP_LISTEN)
1893		goto out_err;
1894
1895	/* Find already established connection */
1896	if (reqsk_queue_empty(&tp->accept_queue)) {
1897		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1898
1899		/* If this is a non-blocking socket, don't sleep. */
1900		error = -EAGAIN;
1901		if (!timeo)
1902			goto out_err;
1903
1904		error = wait_for_connect(sk, timeo);
1905		if (error)
1906			goto out_err;
1907	}
1908
1909	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1910	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1911out:
1912	release_sock(sk);
1913	return newsk;
1914out_err:
1915	newsk = NULL;
1916	*err = error;
1917	goto out;
1918}
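/* Illustrative user-space sketch (editorial addition, not kernel code): the
 * EAGAIN return above is what a non-blocking listener observes when the
 * accept queue is empty; a typical caller falls back to poll() and retries.
 * Assumes <sys/socket.h>, <poll.h>, <errno.h> and a listening SOCK_STREAM
 * descriptor "lfd" opened with O_NONBLOCK:
 *
 *	int cfd = accept(lfd, NULL, NULL);
 *	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 *		struct pollfd pfd = { .fd = lfd, .events = POLLIN };
 *		poll(&pfd, 1, -1);
 *	}
 */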
1919
1920/*
1921 *	Socket option code for TCP.
1922 */
1923int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1924		   int optlen)
1925{
1926	struct tcp_sock *tp = tcp_sk(sk);
1927	int val;
1928	int err = 0;
1929
1930	if (level != SOL_TCP)
1931		return tp->af_specific->setsockopt(sk, level, optname,
1932						   optval, optlen);
1933
1934	/* This is a string value; all the other options are ints. */
1935	if (optname == TCP_CONGESTION) {
1936		char name[TCP_CA_NAME_MAX];
1937
1938		if (optlen < 1)
1939			return -EINVAL;
1940
1941		val = strncpy_from_user(name, optval,
1942					min(TCP_CA_NAME_MAX-1, optlen));
1943		if (val < 0)
1944			return -EFAULT;
1945		name[val] = 0;
1946
1947		lock_sock(sk);
1948		err = tcp_set_congestion_control(tp, name);
1949		release_sock(sk);
1950		return err;
1951	}
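	/* Illustrative user-space sketch (editorial addition): selecting a
	 * congestion control algorithm by name from an application.  Assumes
	 * <netinet/in.h>, <string.h> and a <netinet/tcp.h> that defines
	 * TCP_CONGESTION, plus a connected TCP socket "fd"; "reno" is used
	 * here because it is registered unconditionally in tcp_init() below:
	 *
	 *	const char *name = "reno";
	 *	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
	 *		       name, strlen(name)) < 0)
	 *		perror("TCP_CONGESTION");
	 */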
1952
1953	if (optlen < sizeof(int))
1954		return -EINVAL;
1955
1956	if (get_user(val, (int __user *)optval))
1957		return -EFAULT;
1958
1959	lock_sock(sk);
1960
1961	switch (optname) {
1962	case TCP_MAXSEG:
1963		/* Values greater than the interface MTU won't take effect.
1964		 * However, at the point when this call is made we typically
1965		 * don't yet know which interface is going to be used. */
1966		if (val < 8 || val > MAX_TCP_WINDOW) {
1967			err = -EINVAL;
1968			break;
1969		}
1970		tp->rx_opt.user_mss = val;
1971		break;
1972
1973	case TCP_NODELAY:
1974		if (val) {
1975			/* TCP_NODELAY is weaker than TCP_CORK, so that
1976			 * this option on corked socket is remembered, but
1977			 * it is not activated until cork is cleared.
1978			 *
1979			 * However, when TCP_NODELAY is set we make
1980			 * an explicit push, which overrides even TCP_CORK
1981			 * for currently queued segments.
1982			 */
1983			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1984			tcp_push_pending_frames(sk, tp);
1985		} else {
1986			tp->nonagle &= ~TCP_NAGLE_OFF;
1987		}
1988		break;
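	/* Illustrative user-space sketch (editorial addition): disabling Nagle
	 * from an application.  Assumes <netinet/tcp.h>, <netinet/in.h> and a
	 * connected TCP socket "fd":
	 *
	 *	int one = 1;
	 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
	 */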
1989
1990	case TCP_CORK:
1991		/* When set, non-full frames are always queued.
1992		 * Later the user clears this option and we transmit
1993		 * any pending partial frames in the queue.  This is
1994		 * meant to be used alongside sendfile() to get properly
1995		 * filled frames when the user (for example) must write
1996		 * out headers with a write() call first and then use
1997		 * sendfile to send out the data parts.
1998		 *
1999		 * TCP_CORK can be set together with TCP_NODELAY and it is
2000		 * stronger than TCP_NODELAY.
2001		 */
2002		if (val) {
2003			tp->nonagle |= TCP_NAGLE_CORK;
2004		} else {
2005			tp->nonagle &= ~TCP_NAGLE_CORK;
2006			if (tp->nonagle&TCP_NAGLE_OFF)
2007				tp->nonagle |= TCP_NAGLE_PUSH;
2008			tcp_push_pending_frames(sk, tp);
2009		}
2010		break;
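	/* Illustrative user-space sketch (editorial addition): the write()
	 * then sendfile() pattern described in the comment above.  Assumes
	 * <netinet/tcp.h>, <sys/sendfile.h> and <unistd.h>, a connected
	 * socket "sock", a header buffer "hdr"/"hdr_len" and an open input
	 * file "filefd" of length "file_len":
	 *
	 *	int on = 1, off = 0;
	 *	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(sock, hdr, hdr_len);
	 *	sendfile(sock, filefd, NULL, file_len);
	 *	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 */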
2011
2012	case TCP_KEEPIDLE:
2013		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2014			err = -EINVAL;
2015		else {
2016			tp->keepalive_time = val * HZ;
2017			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2018			    !((1 << sk->sk_state) &
2019			      (TCPF_CLOSE | TCPF_LISTEN))) {
2020				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2021				if (tp->keepalive_time > elapsed)
2022					elapsed = tp->keepalive_time - elapsed;
2023				else
2024					elapsed = 0;
2025				tcp_reset_keepalive_timer(sk, elapsed);
2026			}
2027		}
2028		break;
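	/* Illustrative user-space sketch (editorial addition): keepalive
	 * tuning combines SO_KEEPALIVE with the option handled in this case
	 * and the two that follow (seconds for IDLE/INTVL, a probe count for
	 * CNT).  Assumes <sys/socket.h>, <netinet/tcp.h> and a TCP socket
	 * "fd":
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */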
2029	case TCP_KEEPINTVL:
2030		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2031			err = -EINVAL;
2032		else
2033			tp->keepalive_intvl = val * HZ;
2034		break;
2035	case TCP_KEEPCNT:
2036		if (val < 1 || val > MAX_TCP_KEEPCNT)
2037			err = -EINVAL;
2038		else
2039			tp->keepalive_probes = val;
2040		break;
2041	case TCP_SYNCNT:
2042		if (val < 1 || val > MAX_TCP_SYNCNT)
2043			err = -EINVAL;
2044		else
2045			tp->syn_retries = val;
2046		break;
2047
2048	case TCP_LINGER2:
2049		if (val < 0)
2050			tp->linger2 = -1;
2051		else if (val > sysctl_tcp_fin_timeout / HZ)
2052			tp->linger2 = 0;
2053		else
2054			tp->linger2 = val * HZ;
2055		break;
2056
2057	case TCP_DEFER_ACCEPT:
2058		tp->defer_accept = 0;
2059		if (val > 0) {
2060			/* Translate value in seconds to number of
2061			 * retransmits */
2062			while (tp->defer_accept < 32 &&
2063			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2064				       tp->defer_accept))
2065				tp->defer_accept++;
2066			tp->defer_accept++;
2067		}
2068		break;
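	/* Worked example (editorial addition) of the seconds-to-retransmits
	 * mapping above, assuming TCP_TIMEOUT_INIT is 3*HZ as defined in this
	 * kernel's tcp.h: a requested value of 10 seconds passes 10 > 3 and
	 * 10 > 6 but not 10 > 12, leaving defer_accept at 2 after the loop
	 * and 3 after the final increment; getsockopt(TCP_DEFER_ACCEPT)
	 * below then reports (3 << (3 - 1)) = 12 seconds.
	 */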
2069
2070	case TCP_WINDOW_CLAMP:
2071		if (!val) {
2072			if (sk->sk_state != TCP_CLOSE) {
2073				err = -EINVAL;
2074				break;
2075			}
2076			tp->window_clamp = 0;
2077		} else
2078			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2079						SOCK_MIN_RCVBUF / 2 : val;
2080		break;
2081
2082	case TCP_QUICKACK:
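		/* Descriptive note (editorial addition): a zero value turns
		 * delayed-ACK ("pingpong") mode back on.  A non-zero value
		 * turns it off and, if an ACK is already scheduled on an
		 * established connection, pushes that ACK out via
		 * cleanup_rbuf(); in that branch an even value then restores
		 * pingpong mode, making the quick ACK one-shot rather than
		 * permanent.
		 */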
2083		if (!val) {
2084			tp->ack.pingpong = 1;
2085		} else {
2086			tp->ack.pingpong = 0;
2087			if ((1 << sk->sk_state) &
2088			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2089			    tcp_ack_scheduled(tp)) {
2090				tp->ack.pending |= TCP_ACK_PUSHED;
2091				cleanup_rbuf(sk, 1);
2092				if (!(val & 1))
2093					tp->ack.pingpong = 1;
2094			}
2095		}
2096		break;
2097
2098	default:
2099		err = -ENOPROTOOPT;
2100		break;
2101	};
2102	release_sock(sk);
2103	return err;
2104}
2105
2106/* Return information about the state of a TCP endpoint in API format. */
2107void tcp_get_info(struct sock *sk, struct tcp_info *info)
2108{
2109	struct tcp_sock *tp = tcp_sk(sk);
2110	u32 now = tcp_time_stamp;
2111
2112	memset(info, 0, sizeof(*info));
2113
2114	info->tcpi_state = sk->sk_state;
2115	info->tcpi_ca_state = tp->ca_state;
2116	info->tcpi_retransmits = tp->retransmits;
2117	info->tcpi_probes = tp->probes_out;
2118	info->tcpi_backoff = tp->backoff;
2119
2120	if (tp->rx_opt.tstamp_ok)
2121		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2122	if (tp->rx_opt.sack_ok)
2123		info->tcpi_options |= TCPI_OPT_SACK;
2124	if (tp->rx_opt.wscale_ok) {
2125		info->tcpi_options |= TCPI_OPT_WSCALE;
2126		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2127		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2128	}
2129
2130	if (tp->ecn_flags&TCP_ECN_OK)
2131		info->tcpi_options |= TCPI_OPT_ECN;
2132
2133	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2134	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2135	info->tcpi_snd_mss = tp->mss_cache;
2136	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2137
2138	info->tcpi_unacked = tp->packets_out;
2139	info->tcpi_sacked = tp->sacked_out;
2140	info->tcpi_lost = tp->lost_out;
2141	info->tcpi_retrans = tp->retrans_out;
2142	info->tcpi_fackets = tp->fackets_out;
2143
2144	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2145	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2146	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2147
2148	info->tcpi_pmtu = tp->pmtu_cookie;
2149	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
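	/* Descriptive note (editorial addition): srtt is kept internally
	 * scaled by 8 and mdev by 4, hence the >>3 and >>2 below after the
	 * conversion to microseconds.
	 */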
2150	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2151	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2152	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2153	info->tcpi_snd_cwnd = tp->snd_cwnd;
2154	info->tcpi_advmss = tp->advmss;
2155	info->tcpi_reordering = tp->reordering;
2156
2157	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2158	info->tcpi_rcv_space = tp->rcvq_space.space;
2159
2160	info->tcpi_total_retrans = tp->total_retrans;
2161}
2162
2163EXPORT_SYMBOL_GPL(tcp_get_info);
2164
2165int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2166		   int __user *optlen)
2167{
2168	struct tcp_sock *tp = tcp_sk(sk);
2169	int val, len;
2170
2171	if (level != SOL_TCP)
2172		return tp->af_specific->getsockopt(sk, level, optname,
2173						   optval, optlen);
2174
2175	if (get_user(len, optlen))
2176		return -EFAULT;
2177
2178	len = min_t(unsigned int, len, sizeof(int));
2179
2180	if (len < 0)
2181		return -EINVAL;
2182
2183	switch (optname) {
2184	case TCP_MAXSEG:
2185		val = tp->mss_cache;
2186		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2187			val = tp->rx_opt.user_mss;
2188		break;
2189	case TCP_NODELAY:
2190		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2191		break;
2192	case TCP_CORK:
2193		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2194		break;
2195	case TCP_KEEPIDLE:
2196		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2197		break;
2198	case TCP_KEEPINTVL:
2199		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2200		break;
2201	case TCP_KEEPCNT:
2202		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2203		break;
2204	case TCP_SYNCNT:
2205		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2206		break;
2207	case TCP_LINGER2:
2208		val = tp->linger2;
2209		if (val >= 0)
2210			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2211		break;
2212	case TCP_DEFER_ACCEPT:
2213		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2214					       (tp->defer_accept - 1));
2215		break;
2216	case TCP_WINDOW_CLAMP:
2217		val = tp->window_clamp;
2218		break;
2219	case TCP_INFO: {
2220		struct tcp_info info;
2221
2222		if (get_user(len, optlen))
2223			return -EFAULT;
2224
2225		tcp_get_info(sk, &info);
2226
2227		len = min_t(unsigned int, len, sizeof(info));
2228		if (put_user(len, optlen))
2229			return -EFAULT;
2230		if (copy_to_user(optval, &info, len))
2231			return -EFAULT;
2232		return 0;
2233	}
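	/* Illustrative user-space sketch (editorial addition): reading the
	 * tcp_info block filled in by tcp_get_info() above.  Assumes
	 * <stdio.h>, <netinet/in.h> and a <netinet/tcp.h> that defines
	 * struct tcp_info and TCP_INFO, plus a connected TCP socket "fd":
	 *
	 *	struct tcp_info ti;
	 *	socklen_t len = sizeof(ti);
	 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
	 *		printf("rtt %u us, cwnd %u\n",
	 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd);
	 */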
2234	case TCP_QUICKACK:
2235		val = !tp->ack.pingpong;
2236		break;
2237
2238	case TCP_CONGESTION:
2239		if (get_user(len, optlen))
2240			return -EFAULT;
2241		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242		if (put_user(len, optlen))
2243			return -EFAULT;
2244		if (copy_to_user(optval, tp->ca_ops->name, len))
2245			return -EFAULT;
2246		return 0;
2247	default:
2248		return -ENOPROTOOPT;
2249	};
2250
2251	if (put_user(len, optlen))
2252		return -EFAULT;
2253	if (copy_to_user(optval, &val, len))
2254		return -EFAULT;
2255	return 0;
2256}
2257
2258
2259extern void __skb_cb_too_small_for_tcp(int, int);
2260extern struct tcp_congestion_ops tcp_reno;
2261
2262static __initdata unsigned long thash_entries;
2263static int __init set_thash_entries(char *str)
2264{
2265	if (!str)
2266		return 0;
2267	thash_entries = simple_strtoul(str, &str, 0);
2268	return 1;
2269}
2270__setup("thash_entries=", set_thash_entries);
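/* Descriptive note (editorial addition): this allows the established hash
 * table size to be overridden from the kernel command line, e.g. booting
 * with "thash_entries=131072"; the parsed value is passed to
 * alloc_large_system_hash() in tcp_init() below.
 */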
2271
2272void __init tcp_init(void)
2273{
2274	struct sk_buff *skb = NULL;
2275	int order, i;
2276
2277	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2278		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2279					   sizeof(skb->cb));
2280
2281	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282					      sizeof(struct tcp_bind_bucket),
2283					      0, SLAB_HWCACHE_ALIGN,
2284					      NULL, NULL);
2285	if (!tcp_bucket_cachep)
2286		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287
2288	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289						sizeof(struct tcp_tw_bucket),
2290						0, SLAB_HWCACHE_ALIGN,
2291						NULL, NULL);
2292	if (!tcp_timewait_cachep)
2293		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295	/* Size and allocate the main established and bind bucket
2296	 * hash tables.
2297	 *
2298	 * The methodology is similar to that of the buffer cache.
2299	 */
2300	tcp_ehash = (struct tcp_ehash_bucket *)
2301		alloc_large_system_hash("TCP established",
2302					sizeof(struct tcp_ehash_bucket),
2303					thash_entries,
2304					(num_physpages >= 128 * 1024) ?
2305						(25 - PAGE_SHIFT) :
2306						(27 - PAGE_SHIFT),
2307					HASH_HIGHMEM,
2308					&tcp_ehash_size,
2309					NULL,
2310					0);
2311	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313		rwlock_init(&tcp_ehash[i].lock);
2314		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315	}
2316
2317	tcp_bhash = (struct tcp_bind_hashbucket *)
2318		alloc_large_system_hash("TCP bind",
2319					sizeof(struct tcp_bind_hashbucket),
2320					tcp_ehash_size,
2321					(num_physpages >= 128 * 1024) ?
2322						(25 - PAGE_SHIFT) :
2323						(27 - PAGE_SHIFT),
2324					HASH_HIGHMEM,
2325					&tcp_bhash_size,
2326					NULL,
2327					64 * 1024);
2328	tcp_bhash_size = 1 << tcp_bhash_size;
2329	for (i = 0; i < tcp_bhash_size; i++) {
2330		spin_lock_init(&tcp_bhash[i].lock);
2331		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332	}
2333
2334	/* Try to be a bit smarter and adjust defaults depending
2335	 * on available memory.
2336	 */
2337	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339			order++)
2340		;
2341	if (order >= 4) {
2342		sysctl_local_port_range[0] = 32768;
2343		sysctl_local_port_range[1] = 61000;
2344		sysctl_tcp_max_tw_buckets = 180000;
2345		sysctl_tcp_max_orphans = 4096 << (order - 4);
2346		sysctl_max_syn_backlog = 1024;
2347	} else if (order < 3) {
2348		sysctl_local_port_range[0] = 1024 * (3 - order);
2349		sysctl_tcp_max_tw_buckets >>= (3 - order);
2350		sysctl_tcp_max_orphans >>= (3 - order);
2351		sysctl_max_syn_backlog = 128;
2352	}
2353	tcp_port_rover = sysctl_local_port_range[0] - 1;
2354
2355	sysctl_tcp_mem[0] =  768 << order;
2356	sysctl_tcp_mem[1] = 1024 << order;
2357	sysctl_tcp_mem[2] = 1536 << order;
2358
2359	if (order < 3) {
2360		sysctl_tcp_wmem[2] = 64 * 1024;
2361		sysctl_tcp_rmem[0] = PAGE_SIZE;
2362		sysctl_tcp_rmem[1] = 43689;
2363		sysctl_tcp_rmem[2] = 2 * 43689;
2364	}
2365
2366	printk(KERN_INFO "TCP: Hash tables configured "
2367	       "(established %d bind %d)\n",
2368	       tcp_ehash_size << 1, tcp_bhash_size);
2369
2370	tcp_register_congestion_control(&tcp_reno);
2371}
2372
2373EXPORT_SYMBOL(tcp_accept);
2374EXPORT_SYMBOL(tcp_close);
2375EXPORT_SYMBOL(tcp_destroy_sock);
2376EXPORT_SYMBOL(tcp_disconnect);
2377EXPORT_SYMBOL(tcp_getsockopt);
2378EXPORT_SYMBOL(tcp_ioctl);
2379EXPORT_SYMBOL(tcp_poll);
2380EXPORT_SYMBOL(tcp_read_sock);
2381EXPORT_SYMBOL(tcp_recvmsg);
2382EXPORT_SYMBOL(tcp_sendmsg);
2383EXPORT_SYMBOL(tcp_sendpage);
2384EXPORT_SYMBOL(tcp_setsockopt);
2385EXPORT_SYMBOL(tcp_shutdown);
2386EXPORT_SYMBOL(tcp_statistics);
2387EXPORT_SYMBOL(tcp_timewait_cachep);
2388