tcp.c revision 83e3609eba3818f6e18b8bf9442195169ac306b7
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 *		Alan Cox	:	Numerous verify_area() calls
24 *		Alan Cox	:	Set the ACK bit on a reset
25 *		Alan Cox	:	Stopped it crashing if it closed while
26 *					sk->inuse=1 and was trying to connect
27 *					(tcp_err()).
28 *		Alan Cox	:	All icmp error handling was broken
29 *					pointers passed were wrong and the
30 *					socket was looked up backwards. Nobody
31 *					tested any icmp error code obviously.
32 *		Alan Cox	:	tcp_err() now handled properly. It
33 *					wakes people on errors. poll
34 *					behaves and the icmp error race
35 *					has gone by moving it into sock.c
36 *		Alan Cox	:	tcp_send_reset() fixed to work for
37 *					everything not just packets for
38 *					unknown sockets.
39 *		Alan Cox	:	tcp option processing.
40 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41 *					syn rule wrong]
42 *		Herp Rosmanith  :	More reset fixes
43 *		Alan Cox	:	No longer acks invalid rst frames.
44 *					Acking any kind of RST is right out.
45 *		Alan Cox	:	Sets an ignore me flag on an rst
46 *					receive otherwise odd bits of prattle
47 *					escape still
48 *		Alan Cox	:	Fixed another acking RST frame bug.
49 *					Should stop LAN workplace lockups.
50 *		Alan Cox	: 	Some tidyups using the new skb list
51 *					facilities
52 *		Alan Cox	:	sk->keepopen now seems to work
53 *		Alan Cox	:	Pulls options out correctly on accepts
54 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56 *					bit to skb ops.
57 *		Alan Cox	:	Tidied tcp_data to avoid a potential
58 *					nasty.
59 *		Alan Cox	:	Added some better commenting, as the
60 *					tcp is hard to follow
61 *		Alan Cox	:	Removed incorrect check for 20 * psh
62 *	Michael O'Reilly	:	ack < copied bug fix.
63 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64 *		Alan Cox	:	FIN with no memory -> CRASH
65 *		Alan Cox	:	Added socket option proto entries.
66 *					Also added awareness of them to accept.
67 *		Alan Cox	:	Added TCP options (SOL_TCP)
68 *		Alan Cox	:	Switched wakeup calls to callbacks,
69 *					so the kernel can layer network
70 *					sockets.
71 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72 *		Alan Cox	:	Handle FIN (more) properly (we hope).
73 *		Alan Cox	:	RST frames sent on unsynchronised
74 *					state ack error.
75 *		Alan Cox	:	Put in missing check for SYN bit.
76 *		Alan Cox	:	Added tcp_select_window() aka NET2E
77 *					window non shrink trick.
78 *		Alan Cox	:	Added a couple of small NET2E timer
79 *					fixes
80 *		Charles Hedrick :	TCP fixes
81 *		Toomas Tamm	:	TCP window fixes
82 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83 *		Charles Hedrick	:	Rewrote most of it to actually work
84 *		Linus		:	Rewrote tcp_read() and URG handling
85 *					completely
86 *		Gerhard Koerting:	Fixed some missing timer handling
87 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88 *		Gerhard Koerting:	PC/TCP workarounds
89 *		Adam Caldwell	:	Assorted timer/timing errors
90 *		Matthew Dillon	:	Fixed another RST bug
91 *		Alan Cox	:	Move to kernel side addressing changes.
92 *		Alan Cox	:	Beginning work on TCP fastpathing
93 *					(not yet usable)
94 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95 *		Alan Cox	:	TCP fast path debugging
96 *		Alan Cox	:	Window clamping
97 *		Michael Riepe	:	Bug in tcp_check()
98 *		Matt Dillon	:	More TCP improvements and RST bug fixes
99 *		Matt Dillon	:	Yet more small nasties removed from the
100 *					TCP code (Be very nice to this man if
101 *					tcp finally works 100%) 8)
102 *		Alan Cox	:	BSD accept semantics.
103 *		Alan Cox	:	Reset on closedown bug.
104 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105 *		Michael Pall	:	Handle poll() after URG properly in
106 *					all cases.
107 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108 *					(multi URG PUSH broke rlogin).
109 *		Michael Pall	:	Fix the multi URG PUSH problem in
110 *					tcp_readable(), poll() after URG
111 *					works now.
112 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113 *					BSD api.
114 *		Alan Cox	:	Changed the semantics of sk->socket to
115 *					fix a race and a signal problem with
116 *					accept() and async I/O.
117 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120 *					clients/servers which listen in on
121 *					fixed ports.
122 *		Alan Cox	:	Cleaned the above up and shrank it to
123 *					a sensible code size.
124 *		Alan Cox	:	Self connect lockup fix.
125 *		Alan Cox	:	No connect to multicast.
126 *		Ross Biro	:	Close unaccepted children on master
127 *					socket close.
128 *		Alan Cox	:	Reset tracing code.
129 *		Alan Cox	:	Spurious resets on shutdown.
130 *		Alan Cox	:	Giant 15 minute/60 second timer error
131 *		Alan Cox	:	Small whoops in polling before an
132 *					accept.
133 *		Alan Cox	:	Kept the state trace facility since
134 *					it's handy for debugging.
135 *		Alan Cox	:	More reset handler fixes.
136 *		Alan Cox	:	Started rewriting the code based on
137 *					the RFC's for other useful protocol
138 *					references see: Comer, KA9Q NOS, and
139 *					for a reference on the difference
140 *					between specifications and how BSD
141 *					works see the 4.4lite source.
142 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143 *					close.
144 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146 *		Alan Cox	:	Reimplemented timers as per the RFC
147 *					and using multiple timers for sanity.
148 *		Alan Cox	:	Small bug fixes, and a lot of new
149 *					comments.
150 *		Alan Cox	:	Fixed dual reader crash by locking
151 *					the buffers (much like datagram.c)
152 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153 *					now gets fed up of retrying without
154 *					(even a no space) answer.
155 *		Alan Cox	:	Extracted closing code better
156 *		Alan Cox	:	Fixed the closing state machine to
157 *					resemble the RFC.
158 *		Alan Cox	:	More 'per spec' fixes.
159 *		Jorge Cwik	:	Even faster checksumming.
160 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161 *					only frames. At least one pc tcp stack
162 *					generates them.
163 *		Alan Cox	:	Cache last socket.
164 *		Alan Cox	:	Per route irtt.
165 *		Matt Day	:	poll()->select() match BSD precisely on error
166 *		Alan Cox	:	New buffers
167 *		Marc Tamsky	:	Various sk->prot->retransmits and
168 *					sk->retransmits misupdating fixed.
169 *					Fixed tcp_write_timeout: stuck close,
170 *					and TCP syn retries gets used now.
171 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172 *					ack if state is TCP_CLOSED.
173 *		Alan Cox	:	Look up device on a retransmit - routes may
174 *					change. Doesn't yet cope with MSS shrink right
175 *					but it's a start!
176 *		Marc Tamsky	:	Closing in closing fixes.
177 *		Mike Shaver	:	RFC1122 verifications.
178 *		Alan Cox	:	rcv_saddr errors.
179 *		Alan Cox	:	Block double connect().
180 *		Alan Cox	:	Small hooks for enSKIP.
181 *		Alexey Kuznetsov:	Path MTU discovery.
182 *		Alan Cox	:	Support soft errors.
183 *		Alan Cox	:	Fix MTU discovery pathological case
184 *					when the remote claims no mtu!
185 *		Marc Tamsky	:	TCP_CLOSE fix.
186 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187 *					window but wrong (fixes NT lpd problems)
188 *		Pedro Roque	:	Better TCP window handling, delayed ack.
189 *		Joerg Reuter	:	No modification of locked buffers in
190 *					tcp_do_retransmit()
191 *		Eric Schenk	:	Changed receiver side silly window
192 *					avoidance algorithm to BSD style
193 *					algorithm. This doubles throughput
194 *					against machines running Solaris,
195 *					and seems to result in general
196 *					improvement.
197 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198 *	Willy Konynenberg	:	Transparent proxying support.
199 *	Mike McLagan		:	Routing by source
200 *		Keith Owens	:	Do proper merging with partial SKB's in
201 *					tcp_do_sendmsg to avoid burstiness.
202 *		Eric Schenk	:	Fix fast close down bug with
203 *					shutdown() followed by close().
204 *		Andi Kleen 	:	Make poll agree with SIGIO
205 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206 *					lingertime == 0 (RFC 793 ABORT Call)
207 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208 *					csum_and_copy_from_user() if possible.
209 *
210 *		This program is free software; you can redistribute it and/or
211 *		modify it under the terms of the GNU General Public License
212 *		as published by the Free Software Foundation; either version
213 *		2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 *	TCP_SYN_SENT		sent a connection request, waiting for ack
218 *
219 *	TCP_SYN_RECV		received a connection request, sent ack,
220 *				waiting for final ack in three-way handshake.
221 *
222 *	TCP_ESTABLISHED		connection established
223 *
224 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225 *				transmission of remaining buffered data
226 *
227 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228 *				to shutdown
229 *
230 *	TCP_CLOSING		both sides have shutdown but we still have
231 *				data we have to finish sending
232 *
233 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234 *				closed, can only be entered from FIN_WAIT2
235 *				or CLOSING.  Required because the other end
236 *				may not have gotten our last ACK causing it
237 *				to retransmit the data packet (which we ignore)
238 *
239 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240 *				us to finish writing our data and to shutdown
241 *				(we have to close() to move on to LAST_ACK)
242 *
243 *	TCP_LAST_ACK		our side has shutdown after remote has
244 *				shutdown.  There may still be data in our
245 *				buffer that we have to finish sending
246 *
247 *	TCP_CLOSE		socket is finished
248 */
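/*
 * Editorial sketch (not part of the original file): how the states above are
 * traversed in an ordinary application-level close.  Names and ordering are
 * illustrative only; the simultaneous-close path through TCP_CLOSING is
 * omitted.
 *
 *	Active closer (calls close() first)	Passive closer
 *	-----------------------------------	--------------
 *	close(fd);  ESTABLISHED -> FIN_WAIT1
 *						ESTABLISHED -> CLOSE_WAIT,
 *						read() returns 0 (EOF)
 *	            FIN_WAIT1 -> FIN_WAIT2
 *						close(fd);  CLOSE_WAIT -> LAST_ACK
 *	            FIN_WAIT2 -> TIME_WAIT
 *						LAST_ACK -> CLOSE
 *	            TIME_WAIT -> CLOSE (after 2*MSL)
 */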
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non atomically.
296 * All the sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305	if (!tcp_memory_pressure) {
306		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307		tcp_memory_pressure = 1;
308	}
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317					       poll_table *wait)
318{
319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 *	Wait for a TCP event.
324 *
325 *	Note that we don't need to lock the socket, as the upper poll layers
326 *	take care of normal races (between the test and the event) and we don't
327 *	go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331	unsigned int mask;
332	struct sock *sk = sock->sk;
333	struct tcp_sock *tp = tcp_sk(sk);
334
335	poll_wait(file, sk->sk_sleep, wait);
336	if (sk->sk_state == TCP_LISTEN)
337		return tcp_listen_poll(sk, wait);
338
339	/* Socket is not locked. We are protected from async events
340	   by the poll logic, and correct handling of state changes
341	   made by other threads is impossible in any case.
342	 */
343
344	mask = 0;
345	if (sk->sk_err)
346		mask = POLLERR;
347
348	/*
349	 * POLLHUP is certainly not done right. But poll() doesn't
350	 * have a notion of HUP in just one direction, and for a
351	 * socket the read side is more interesting.
352	 *
353	 * Some poll() documentation says that POLLHUP is incompatible
354	 * with the POLLOUT/POLLWR flags, so somebody should check this
355	 * all. But careful, it tends to be safer to return too many
356	 * bits than too few, and you can easily break real applications
357	 * if you don't tell them that something has hung up!
358	 *
359	 * Check-me.
360	 *
361	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362	 * our fs/select.c). It means that after we received EOF,
363	 * poll always returns immediately, making it impossible to poll() for
364	 * write in state CLOSE_WAIT. One solution is evident: set POLLHUP
365	 * if and only if shutdown has been made in both directions.
366	 * Actually, it is interesting to look how Solaris and DUX
367	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
368	 * then we could set it on SND_SHUTDOWN. BTW examples given
369	 * in Stevens' books assume exactly this behaviour, it explains
370	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371	 *
372	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373	 * blocking on fresh not-connected or disconnected socket. --ANK
374	 */
375	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376		mask |= POLLHUP;
377	if (sk->sk_shutdown & RCV_SHUTDOWN)
378		mask |= POLLIN | POLLRDNORM;
379
380	/* Connected? */
381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382		/* Potential race condition. If the read of tp below
383		 * escapes above sk->sk_state, we can be illegally awakened
384		 * in SYN_* states. */
385		if ((tp->rcv_nxt != tp->copied_seq) &&
386		    (tp->urg_seq != tp->copied_seq ||
387		     tp->rcv_nxt != tp->copied_seq + 1 ||
388		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389			mask |= POLLIN | POLLRDNORM;
390
391		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393				mask |= POLLOUT | POLLWRNORM;
394			} else {  /* send SIGIO later */
395				set_bit(SOCK_ASYNC_NOSPACE,
396					&sk->sk_socket->flags);
397				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399				/* Race breaker. If space is freed after
400				 * wspace test but before the flags are set,
401				 * IO signal will be lost.
402				 */
403				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404					mask |= POLLOUT | POLLWRNORM;
405			}
406		}
407
408		if (tp->urg_data & TCP_URG_VALID)
409			mask |= POLLPRI;
410	}
411	return mask;
412}
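/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): how the mask computed by tcp_poll() is typically consumed.  "fd"
 * is assumed to be a connected TCP socket; error handling is elided.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLERR)
 *			handle_error(fd);	sk->sk_err set, fetch SO_ERROR
 *		if (pfd.revents & (POLLIN | POLLRDNORM))
 *			read_data(fd);		data queued, or EOF after RCV_SHUTDOWN
 *		if (pfd.revents & POLLPRI)
 *			read_oob(fd);		tp->urg_data & TCP_URG_VALID
 *		if (pfd.revents & (POLLOUT | POLLWRNORM))
 *			write_data(fd);		sk_stream_wspace() >= min_wspace
 *		if (pfd.revents & POLLHUP)
 *			teardown(fd);		both directions shut down, or CLOSE
 *	}
 *
 * handle_error()/read_data()/read_oob()/write_data()/teardown() are
 * hypothetical application callbacks.  Note that POLLHUP is reported even
 * though it was not requested in .events, matching the "unmaskable"
 * behaviour discussed above.
 */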
413
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416	struct tcp_sock *tp = tcp_sk(sk);
417	int answ;
418
419	switch (cmd) {
420	case SIOCINQ:
421		if (sk->sk_state == TCP_LISTEN)
422			return -EINVAL;
423
424		lock_sock(sk);
425		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426			answ = 0;
427		else if (sock_flag(sk, SOCK_URGINLINE) ||
428			 !tp->urg_data ||
429			 before(tp->urg_seq, tp->copied_seq) ||
430			 !before(tp->urg_seq, tp->rcv_nxt)) {
431			answ = tp->rcv_nxt - tp->copied_seq;
432
433			/* Subtract 1, if FIN is in queue. */
434			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435				answ -=
436		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437		} else
438			answ = tp->urg_seq - tp->copied_seq;
439		release_sock(sk);
440		break;
441	case SIOCATMARK:
442		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443		break;
444	case SIOCOUTQ:
445		if (sk->sk_state == TCP_LISTEN)
446			return -EINVAL;
447
448		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449			answ = 0;
450		else
451			answ = tp->write_seq - tp->snd_una;
452		break;
453	default:
454		return -ENOIOCTLCMD;
455	};
456
457	return put_user(answ, (int __user *)arg);
458}
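/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): the three ioctls handled above, on an established TCP socket "fd".
 * On Linux, SIOCINQ and SIOCOUTQ come from <linux/sockios.h> (SIOCINQ has
 * the same value as FIONREAD).
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq, atmark;
 *
 *	ioctl(fd, SIOCINQ, &inq);	bytes readable now (roughly
 *					rcv_nxt - copied_seq, a queued FIN
 *					excluded, 0 in the SYN states)
 *	ioctl(fd, SIOCOUTQ, &outq);	bytes sent or queued but not yet
 *					acknowledged (write_seq - snd_una)
 *	ioctl(fd, SIOCATMARK, &atmark);	non-zero when the next byte to read
 *					is at the urgent mark
 *
 * SIOCINQ and SIOCOUTQ fail with EINVAL on a listening socket, as the
 * switch above shows.
 */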
459
460
461int tcp_listen_start(struct sock *sk)
462{
463	struct inet_sock *inet = inet_sk(sk);
464	struct tcp_sock *tp = tcp_sk(sk);
465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467	if (rc != 0)
468		return rc;
469
470	sk->sk_max_ack_backlog = 0;
471	sk->sk_ack_backlog = 0;
472	tcp_delack_init(tp);
473
474	/* There is a race window here: we announce ourselves listening,
475	 * but this transition is still not validated by get_port().
476	 * It is OK, because this socket enters the hash table only
477	 * after validation is complete.
478	 */
479	sk->sk_state = TCP_LISTEN;
480	if (!sk->sk_prot->get_port(sk, inet->num)) {
481		inet->sport = htons(inet->num);
482
483		sk_dst_reset(sk);
484		sk->sk_prot->hash(sk);
485
486		return 0;
487	}
488
489	sk->sk_state = TCP_CLOSE;
490	__reqsk_queue_destroy(&tp->accept_queue);
491	return -EADDRINUSE;
492}
493
494/*
495 *	This routine closes sockets which have been at least partially
496 *	opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501	struct tcp_sock *tp = tcp_sk(sk);
502	struct request_sock *acc_req;
503	struct request_sock *req;
504
505	tcp_delete_keepalive_timer(sk);
506
507	/* make all the listen_opt local to us */
508	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
509
510	/* Following the specs, it would be better either to send a FIN
511	 * (and enter FIN-WAIT-1, i.e. a normal close)
512	 * or to send an active reset (abort).
513	 * Certainly, it is pretty dangerous during a synflood, but that is
514	 * a bad justification for our negligence 8)
515	 * To be honest, we are not able to implement either
516	 * of the variants now.			--ANK
517	 */
518	reqsk_queue_destroy(&tp->accept_queue);
519
520	while ((req = acc_req) != NULL) {
521		struct sock *child = req->sk;
522
523		acc_req = req->dl_next;
524
525		local_bh_disable();
526		bh_lock_sock(child);
527		BUG_TRAP(!sock_owned_by_user(child));
528		sock_hold(child);
529
530		tcp_disconnect(child, O_NONBLOCK);
531
532		sock_orphan(child);
533
534		atomic_inc(&tcp_orphan_count);
535
536		tcp_destroy_sock(child);
537
538		bh_unlock_sock(child);
539		local_bh_enable();
540		sock_put(child);
541
542		sk_acceptq_removed(sk);
543		__reqsk_free(req);
544	}
545	BUG_TRAP(!sk->sk_ack_backlog);
546}
547
548static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
549{
550	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
551	tp->pushed_seq = tp->write_seq;
552}
553
554static inline int forced_push(struct tcp_sock *tp)
555{
556	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
557}
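/*
 * Worked example for forced_push() above (numbers are illustrative): with
 * max_window = 64K the threshold is max_window >> 1 = 32K, so once
 * write_seq has advanced more than 32K beyond pushed_seq, forced_push()
 * returns true and the caller sets PSH and pushes, instead of letting more
 * than half of the peer's largest advertised window accumulate unpushed.
 */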
558
559static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
560			      struct sk_buff *skb)
561{
562	skb->csum = 0;
563	TCP_SKB_CB(skb)->seq = tp->write_seq;
564	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
565	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
566	TCP_SKB_CB(skb)->sacked = 0;
567	skb_header_release(skb);
568	__skb_queue_tail(&sk->sk_write_queue, skb);
569	sk_charge_skb(sk, skb);
570	if (!sk->sk_send_head)
571		sk->sk_send_head = skb;
572	if (tp->nonagle & TCP_NAGLE_PUSH)
573		tp->nonagle &= ~TCP_NAGLE_PUSH;
574}
575
576static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
577				struct sk_buff *skb)
578{
579	if (flags & MSG_OOB) {
580		tp->urg_mode = 1;
581		tp->snd_up = tp->write_seq;
582		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
583	}
584}
585
586static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
587			    int mss_now, int nonagle)
588{
589	if (sk->sk_send_head) {
590		struct sk_buff *skb = sk->sk_write_queue.prev;
591		if (!(flags & MSG_MORE) || forced_push(tp))
592			tcp_mark_push(tp, skb);
593		tcp_mark_urg(tp, flags, skb);
594		__tcp_push_pending_frames(sk, tp, mss_now,
595					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
596	}
597}
598
599static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
600			 size_t psize, int flags)
601{
602	struct tcp_sock *tp = tcp_sk(sk);
603	int mss_now, size_goal;
604	int err;
605	ssize_t copied;
606	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
607
608	/* Wait for a connection to finish. */
609	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
610		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
611			goto out_err;
612
613	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
614
615	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
616	size_goal = tp->xmit_size_goal;
617	copied = 0;
618
619	err = -EPIPE;
620	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
621		goto do_error;
622
623	while (psize > 0) {
624		struct sk_buff *skb = sk->sk_write_queue.prev;
625		struct page *page = pages[poffset / PAGE_SIZE];
626		int copy, i, can_coalesce;
627		int offset = poffset % PAGE_SIZE;
628		int size = min_t(size_t, psize, PAGE_SIZE - offset);
629
630		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
631new_segment:
632			if (!sk_stream_memory_free(sk))
633				goto wait_for_sndbuf;
634
635			skb = sk_stream_alloc_pskb(sk, 0, 0,
636						   sk->sk_allocation);
637			if (!skb)
638				goto wait_for_memory;
639
640			skb_entail(sk, tp, skb);
641			copy = size_goal;
642		}
643
644		if (copy > size)
645			copy = size;
646
647		i = skb_shinfo(skb)->nr_frags;
648		can_coalesce = skb_can_coalesce(skb, i, page, offset);
649		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
650			tcp_mark_push(tp, skb);
651			goto new_segment;
652		}
653		if (sk->sk_forward_alloc < copy &&
654		    !sk_stream_mem_schedule(sk, copy, 0))
655			goto wait_for_memory;
656
657		if (can_coalesce) {
658			skb_shinfo(skb)->frags[i - 1].size += copy;
659		} else {
660			get_page(page);
661			skb_fill_page_desc(skb, i, page, offset, copy);
662		}
663
664		skb->len += copy;
665		skb->data_len += copy;
666		skb->truesize += copy;
667		sk->sk_wmem_queued += copy;
668		sk->sk_forward_alloc -= copy;
669		skb->ip_summed = CHECKSUM_HW;
670		tp->write_seq += copy;
671		TCP_SKB_CB(skb)->end_seq += copy;
672		skb_shinfo(skb)->tso_segs = 0;
673
674		if (!copied)
675			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
676
677		copied += copy;
678		poffset += copy;
679		if (!(psize -= copy))
680			goto out;
681
682		if (skb->len < mss_now || (flags & MSG_OOB))
683			continue;
684
685		if (forced_push(tp)) {
686			tcp_mark_push(tp, skb);
687			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
688		} else if (skb == sk->sk_send_head)
689			tcp_push_one(sk, mss_now);
690		continue;
691
692wait_for_sndbuf:
693		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
694wait_for_memory:
695		if (copied)
696			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
697
698		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
699			goto do_error;
700
701		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
702		size_goal = tp->xmit_size_goal;
703	}
704
705out:
706	if (copied)
707		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
708	return copied;
709
710do_error:
711	if (copied)
712		goto out;
713out_err:
714	return sk_stream_error(sk, flags, err);
715}
716
717ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
718		     size_t size, int flags)
719{
720	ssize_t res;
721	struct sock *sk = sock->sk;
722
723#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
724
725	if (!(sk->sk_route_caps & NETIF_F_SG) ||
726	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
727		return sock_no_sendpage(sock, page, offset, size, flags);
728
729#undef TCP_ZC_CSUM_FLAGS
730
731	lock_sock(sk);
732	TCP_CHECK_TIMER(sk);
733	res = do_tcp_sendpages(sk, &page, offset, size, flags);
734	TCP_CHECK_TIMER(sk);
735	release_sock(sk);
736	return res;
737}
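/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): do_tcp_sendpages()/tcp_sendpage() back zero-copy transmission such
 * as sendfile(2) on a TCP socket.  "in_fd" is a regular file, "sock_fd" a
 * connected TCP socket; both names are ours.
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, in_fd, &off, count);
 *
 * If the route's device lacks scatter-gather or hardware checksumming
 * (the NETIF_F_SG / TCP_ZC_CSUM_FLAGS test above), tcp_sendpage() falls
 * back to sock_no_sendpage(), i.e. an ordinary copying send.
 */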
738
739#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
740#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
741
742static inline int select_size(struct sock *sk, struct tcp_sock *tp)
743{
744	int tmp = tp->mss_cache;
745
746	if (sk->sk_route_caps & NETIF_F_SG) {
747		if (sk->sk_route_caps & NETIF_F_TSO)
748			tmp = 0;
749		else {
750			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
751
752			if (tmp >= pgbreak &&
753			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
754				tmp = pgbreak;
755		}
756	}
757
758	return tmp;
759}
760
761int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
762		size_t size)
763{
764	struct iovec *iov;
765	struct tcp_sock *tp = tcp_sk(sk);
766	struct sk_buff *skb;
767	int iovlen, flags;
768	int mss_now, size_goal;
769	int err, copied;
770	long timeo;
771
772	lock_sock(sk);
773	TCP_CHECK_TIMER(sk);
774
775	flags = msg->msg_flags;
776	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
777
778	/* Wait for a connection to finish. */
779	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
780		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
781			goto out_err;
782
783	/* This should be in poll */
784	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
785
786	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
787	size_goal = tp->xmit_size_goal;
788
789	/* Ok commence sending. */
790	iovlen = msg->msg_iovlen;
791	iov = msg->msg_iov;
792	copied = 0;
793
794	err = -EPIPE;
795	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
796		goto do_error;
797
798	while (--iovlen >= 0) {
799		int seglen = iov->iov_len;
800		unsigned char __user *from = iov->iov_base;
801
802		iov++;
803
804		while (seglen > 0) {
805			int copy;
806
807			skb = sk->sk_write_queue.prev;
808
809			if (!sk->sk_send_head ||
810			    (copy = size_goal - skb->len) <= 0) {
811
812new_segment:
813				/* Allocate new segment. If the interface is SG,
814				 * allocate skb fitting to single page.
815				 */
816				if (!sk_stream_memory_free(sk))
817					goto wait_for_sndbuf;
818
819				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
820							   0, sk->sk_allocation);
821				if (!skb)
822					goto wait_for_memory;
823
824				/*
825				 * Check whether we can use HW checksum.
826				 */
827				if (sk->sk_route_caps &
828				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
829				     NETIF_F_HW_CSUM))
830					skb->ip_summed = CHECKSUM_HW;
831
832				skb_entail(sk, tp, skb);
833				copy = size_goal;
834			}
835
836			/* Try to append data to the end of skb. */
837			if (copy > seglen)
838				copy = seglen;
839
840			/* Where to copy to? */
841			if (skb_tailroom(skb) > 0) {
842				/* We have some space in skb head. Superb! */
843				if (copy > skb_tailroom(skb))
844					copy = skb_tailroom(skb);
845				if ((err = skb_add_data(skb, from, copy)) != 0)
846					goto do_fault;
847			} else {
848				int merge = 0;
849				int i = skb_shinfo(skb)->nr_frags;
850				struct page *page = TCP_PAGE(sk);
851				int off = TCP_OFF(sk);
852
853				if (skb_can_coalesce(skb, i, page, off) &&
854				    off != PAGE_SIZE) {
855					/* We can extend the last page
856					 * fragment. */
857					merge = 1;
858				} else if (i == MAX_SKB_FRAGS ||
859					   (!i &&
860					   !(sk->sk_route_caps & NETIF_F_SG))) {
861					/* Need to add new fragment and cannot
862					 * do this because interface is non-SG,
863					 * or because all the page slots are
864					 * busy. */
865					tcp_mark_push(tp, skb);
866					goto new_segment;
867				} else if (page) {
868					if (off == PAGE_SIZE) {
869						put_page(page);
870						TCP_PAGE(sk) = page = NULL;
871					}
872				}
873
874				if (!page) {
875					/* Allocate new cache page. */
876					if (!(page = sk_stream_alloc_page(sk)))
877						goto wait_for_memory;
878					off = 0;
879				}
880
881				if (copy > PAGE_SIZE - off)
882					copy = PAGE_SIZE - off;
883
884				/* Time to copy data. We are close to
885				 * the end! */
886				err = skb_copy_to_page(sk, from, skb, page,
887						       off, copy);
888				if (err) {
889					/* If this page was new, give it to the
890					 * socket so it does not get leaked.
891					 */
892					if (!TCP_PAGE(sk)) {
893						TCP_PAGE(sk) = page;
894						TCP_OFF(sk) = 0;
895					}
896					goto do_error;
897				}
898
899				/* Update the skb. */
900				if (merge) {
901					skb_shinfo(skb)->frags[i - 1].size +=
902									copy;
903				} else {
904					skb_fill_page_desc(skb, i, page, off, copy);
905					if (TCP_PAGE(sk)) {
906						get_page(page);
907					} else if (off + copy < PAGE_SIZE) {
908						get_page(page);
909						TCP_PAGE(sk) = page;
910					}
911				}
912
913				TCP_OFF(sk) = off + copy;
914			}
915
916			if (!copied)
917				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
918
919			tp->write_seq += copy;
920			TCP_SKB_CB(skb)->end_seq += copy;
921			skb_shinfo(skb)->tso_segs = 0;
922
923			from += copy;
924			copied += copy;
925			if ((seglen -= copy) == 0 && iovlen == 0)
926				goto out;
927
928			if (skb->len < mss_now || (flags & MSG_OOB))
929				continue;
930
931			if (forced_push(tp)) {
932				tcp_mark_push(tp, skb);
933				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
934			} else if (skb == sk->sk_send_head)
935				tcp_push_one(sk, mss_now);
936			continue;
937
938wait_for_sndbuf:
939			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
940wait_for_memory:
941			if (copied)
942				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
943
944			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
945				goto do_error;
946
947			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
948			size_goal = tp->xmit_size_goal;
949		}
950	}
951
952out:
953	if (copied)
954		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
955	TCP_CHECK_TIMER(sk);
956	release_sock(sk);
957	return copied;
958
959do_fault:
960	if (!skb->len) {
961		if (sk->sk_send_head == skb)
962			sk->sk_send_head = NULL;
963		__skb_unlink(skb, &sk->sk_write_queue);
964		sk_stream_free_skb(sk, skb);
965	}
966
967do_error:
968	if (copied)
969		goto out;
970out_err:
971	err = sk_stream_error(sk, flags, err);
972	TCP_CHECK_TIMER(sk);
973	release_sock(sk);
974	return err;
975}
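/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): driving the coalescing/push logic in tcp_sendmsg() above.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	int on = 1, off = 0;
 *
 *	Cork the socket so several small writes are merged into full-sized
 *	segments, then uncork to flush:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	send(fd, hdr, hdr_len, 0);
 *	send(fd, body, body_len, 0);
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *
 *	Alternatively, hint per call that more data follows; MSG_MORE makes
 *	tcp_push() use TCP_NAGLE_CORK and suppresses the PSH mark (unless
 *	forced_push() fires) until the final write:
 *
 *	send(fd, hdr, hdr_len, MSG_MORE);
 *	send(fd, body, body_len, 0);
 */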
976
977/*
978 *	Handle reading urgent data. BSD has very simple semantics for
979 *	this, no blocking and very strange errors 8)
980 */
981
982static int tcp_recv_urg(struct sock *sk, long timeo,
983			struct msghdr *msg, int len, int flags,
984			int *addr_len)
985{
986	struct tcp_sock *tp = tcp_sk(sk);
987
988	/* No URG data to read. */
989	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
990	    tp->urg_data == TCP_URG_READ)
991		return -EINVAL;	/* Yes this is right ! */
992
993	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
994		return -ENOTCONN;
995
996	if (tp->urg_data & TCP_URG_VALID) {
997		int err = 0;
998		char c = tp->urg_data;
999
1000		if (!(flags & MSG_PEEK))
1001			tp->urg_data = TCP_URG_READ;
1002
1003		/* Read urgent data. */
1004		msg->msg_flags |= MSG_OOB;
1005
1006		if (len > 0) {
1007			if (!(flags & MSG_TRUNC))
1008				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1009			len = 1;
1010		} else
1011			msg->msg_flags |= MSG_TRUNC;
1012
1013		return err ? -EFAULT : len;
1014	}
1015
1016	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1017		return 0;
1018
1019	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1020	 * the available implementations agree in this case:
1021	 * this call should never block, independent of the
1022	 * blocking state of the socket.
1023	 * Mike <pall@rz.uni-karlsruhe.de>
1024	 */
1025	return -EAGAIN;
1026}
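/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): the BSD-style out-of-band semantics implemented above, on a
 * connected TCP socket "fd".
 *
 *	char c;
 *	int on = 1;
 *
 *	Default (out-of-line) mode: fetch the single urgent byte separately.
 *	This never blocks: it fails with EINVAL if the byte was already read
 *	(TCP_URG_READ) and with EAGAIN if it has not arrived yet.
 *
 *	recv(fd, &c, 1, MSG_OOB);
 *
 *	In-line mode: the urgent byte stays in the normal data stream and
 *	MSG_OOB reads become invalid (the SOCK_URGINLINE test above);
 *	SIOCATMARK (see tcp_ioctl) then tells when the read pointer reaches
 *	the mark.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on));
 */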
1027
1028/* Clean up the receive buffer for full frames taken by the user,
1029 * then send an ACK if necessary.  COPIED is the number of bytes
1030 * tcp_recvmsg has given to the user so far, it speeds up the
1031 * calculation of whether or not we must ACK for the sake of
1032 * a window update.
1033 */
1034static void cleanup_rbuf(struct sock *sk, int copied)
1035{
1036	struct tcp_sock *tp = tcp_sk(sk);
1037	int time_to_ack = 0;
1038
1039#if TCP_DEBUG
1040	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1041
1042	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1043#endif
1044
1045	if (tcp_ack_scheduled(tp)) {
1046		   /* Delayed ACKs frequently hit locked sockets during bulk
1047		    * receive. */
1048		if (tp->ack.blocked ||
1049		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1050		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1051		    /*
1052		     * Send an ACK if this read emptied the read buffer,
1053		     * the connection is not bidirectional, the user drained
1054		     * the receive buffer, and there was a small segment
1055		     * in the queue.
1056		     */
1057		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1058		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1059			time_to_ack = 1;
1060	}
1061
1062	/* We send an ACK if we can now advertise a non-zero window
1063	 * which has been raised "significantly".
1064	 *
1065	 * Even if the window is raised up to infinity, do not send a window
1066	 * open ACK in states where we will not receive more. It is useless.
1067	 */
1068	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1069		__u32 rcv_window_now = tcp_receive_window(tp);
1070
1071		/* Optimize, __tcp_select_window() is not cheap. */
1072		if (2*rcv_window_now <= tp->window_clamp) {
1073			__u32 new_window = __tcp_select_window(sk);
1074
1075			/* Send an ACK now if this read freed lots of space
1076			 * in our buffer. We can advertise the new window now
1077			 * if it is not less than the current one.
1078			 * "Lots" means "at least twice" here.
1079			 */
1080			if (new_window && new_window >= 2 * rcv_window_now)
1081				time_to_ack = 1;
1082		}
1083	}
1084	if (time_to_ack)
1085		tcp_send_ack(sk);
1086}
1087
1088static void tcp_prequeue_process(struct sock *sk)
1089{
1090	struct sk_buff *skb;
1091	struct tcp_sock *tp = tcp_sk(sk);
1092
1093	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1094
1095	/* RX process wants to run with disabled BHs, though it is not
1096	 * necessary */
1097	local_bh_disable();
1098	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1099		sk->sk_backlog_rcv(sk, skb);
1100	local_bh_enable();
1101
1102	/* Clear memory counter. */
1103	tp->ucopy.memory = 0;
1104}
1105
1106static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1107{
1108	struct sk_buff *skb;
1109	u32 offset;
1110
1111	skb_queue_walk(&sk->sk_receive_queue, skb) {
1112		offset = seq - TCP_SKB_CB(skb)->seq;
1113		if (skb->h.th->syn)
1114			offset--;
1115		if (offset < skb->len || skb->h.th->fin) {
1116			*off = offset;
1117			return skb;
1118		}
1119	}
1120	return NULL;
1121}
1122
1123/*
1124 * This routine provides an alternative to tcp_recvmsg() for routines
1125 * that would like to handle copying from skbuffs directly in 'sendfile'
1126 * fashion.
1127 * Note:
1128 *	- It is assumed that the socket was locked by the caller.
1129 *	- The routine does not block.
1130 *	- At present, there is no support for reading OOB data
1131 *	  or for 'peeking' the socket using this routine
1132 *	  (although both would be easy to implement).
1133 */
1134int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1135		  sk_read_actor_t recv_actor)
1136{
1137	struct sk_buff *skb;
1138	struct tcp_sock *tp = tcp_sk(sk);
1139	u32 seq = tp->copied_seq;
1140	u32 offset;
1141	int copied = 0;
1142
1143	if (sk->sk_state == TCP_LISTEN)
1144		return -ENOTCONN;
1145	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1146		if (offset < skb->len) {
1147			size_t used, len;
1148
1149			len = skb->len - offset;
1150			/* Stop reading if we hit a patch of urgent data */
1151			if (tp->urg_data) {
1152				u32 urg_offset = tp->urg_seq - seq;
1153				if (urg_offset < len)
1154					len = urg_offset;
1155				if (!len)
1156					break;
1157			}
1158			used = recv_actor(desc, skb, offset, len);
1159			if (used <= len) {
1160				seq += used;
1161				copied += used;
1162				offset += used;
1163			}
1164			if (offset != skb->len)
1165				break;
1166		}
1167		if (skb->h.th->fin) {
1168			sk_eat_skb(sk, skb);
1169			++seq;
1170			break;
1171		}
1172		sk_eat_skb(sk, skb);
1173		if (!desc->count)
1174			break;
1175	}
1176	tp->copied_seq = seq;
1177
1178	tcp_rcv_space_adjust(sk);
1179
1180	/* Clean up data we have read: This will do ACK frames. */
1181	if (copied)
1182		cleanup_rbuf(sk, copied);
1183	return copied;
1184}
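/*
 * Kernel-side sketch (illustrative only, not part of the original file):
 * the shape of a recv_actor callback for tcp_read_sock().  This
 * hypothetical actor merely counts bytes; real callers (e.g. the sendfile
 * path) copy or hand off the data.  Returning less than "len" stops the
 * walk after the current skb, as the "offset != skb->len" check above
 * shows.
 *
 *	static int example_recv_actor(read_descriptor_t *desc,
 *				      struct sk_buff *skb,
 *				      unsigned int offset, size_t len)
 *	{
 *		size_t want = min(len, desc->count);
 *
 *		desc->written += want;
 *		desc->count -= want;
 *		return want;
 *	}
 *
 * tcp_read_sock() then advances tp->copied_seq by the consumed amount and
 * frees fully consumed skbs; the caller must hold the socket lock.
 */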
1185
1186/*
1187 *	This routine copies from a sock struct into the user buffer.
1188 *
1189 *	Technical note: in 2.3 we work on _locked_ socket, so that
1190 *	tricks with *seq access order and skb->users are not required.
1191 *	Probably, code can be easily improved even more.
1192 */
1193
1194int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1195		size_t len, int nonblock, int flags, int *addr_len)
1196{
1197	struct tcp_sock *tp = tcp_sk(sk);
1198	int copied = 0;
1199	u32 peek_seq;
1200	u32 *seq;
1201	unsigned long used;
1202	int err;
1203	int target;		/* Read at least this many bytes */
1204	long timeo;
1205	struct task_struct *user_recv = NULL;
1206
1207	lock_sock(sk);
1208
1209	TCP_CHECK_TIMER(sk);
1210
1211	err = -ENOTCONN;
1212	if (sk->sk_state == TCP_LISTEN)
1213		goto out;
1214
1215	timeo = sock_rcvtimeo(sk, nonblock);
1216
1217	/* Urgent data needs to be handled specially. */
1218	if (flags & MSG_OOB)
1219		goto recv_urg;
1220
1221	seq = &tp->copied_seq;
1222	if (flags & MSG_PEEK) {
1223		peek_seq = tp->copied_seq;
1224		seq = &peek_seq;
1225	}
1226
1227	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1228
1229	do {
1230		struct sk_buff *skb;
1231		u32 offset;
1232
1233		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1234		if (tp->urg_data && tp->urg_seq == *seq) {
1235			if (copied)
1236				break;
1237			if (signal_pending(current)) {
1238				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1239				break;
1240			}
1241		}
1242
1243		/* Next get a buffer. */
1244
1245		skb = skb_peek(&sk->sk_receive_queue);
1246		do {
1247			if (!skb)
1248				break;
1249
1250			/* Now that we have two receive queues this
1251			 * shouldn't happen.
1252			 */
1253			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1254				printk(KERN_INFO "recvmsg bug: copied %X "
1255				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1256				break;
1257			}
1258			offset = *seq - TCP_SKB_CB(skb)->seq;
1259			if (skb->h.th->syn)
1260				offset--;
1261			if (offset < skb->len)
1262				goto found_ok_skb;
1263			if (skb->h.th->fin)
1264				goto found_fin_ok;
1265			BUG_TRAP(flags & MSG_PEEK);
1266			skb = skb->next;
1267		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1268
1269		/* Well, if we have backlog, try to process it now. */
1270
1271		if (copied >= target && !sk->sk_backlog.tail)
1272			break;
1273
1274		if (copied) {
1275			if (sk->sk_err ||
1276			    sk->sk_state == TCP_CLOSE ||
1277			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1278			    !timeo ||
1279			    signal_pending(current) ||
1280			    (flags & MSG_PEEK))
1281				break;
1282		} else {
1283			if (sock_flag(sk, SOCK_DONE))
1284				break;
1285
1286			if (sk->sk_err) {
1287				copied = sock_error(sk);
1288				break;
1289			}
1290
1291			if (sk->sk_shutdown & RCV_SHUTDOWN)
1292				break;
1293
1294			if (sk->sk_state == TCP_CLOSE) {
1295				if (!sock_flag(sk, SOCK_DONE)) {
1296					/* This occurs when the user tries to
1297					 * read from a never-connected socket.
1298					 */
1299					copied = -ENOTCONN;
1300					break;
1301				}
1302				break;
1303			}
1304
1305			if (!timeo) {
1306				copied = -EAGAIN;
1307				break;
1308			}
1309
1310			if (signal_pending(current)) {
1311				copied = sock_intr_errno(timeo);
1312				break;
1313			}
1314		}
1315
1316		cleanup_rbuf(sk, copied);
1317
1318		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1319			/* Install new reader */
1320			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1321				user_recv = current;
1322				tp->ucopy.task = user_recv;
1323				tp->ucopy.iov = msg->msg_iov;
1324			}
1325
1326			tp->ucopy.len = len;
1327
1328			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1329				 (flags & (MSG_PEEK | MSG_TRUNC)));
1330
1331			/* Ugly... If the prequeue is not empty, we have to
1332			 * process it before releasing the socket, otherwise
1333			 * the order will be broken at the second iteration.
1334			 * A more elegant solution is required!!!
1335			 *
1336			 * Look: we have the following (pseudo)queues:
1337			 *
1338			 * 1. packets in flight
1339			 * 2. backlog
1340			 * 3. prequeue
1341			 * 4. receive_queue
1342			 *
1343			 * Each queue can be processed only if the next ones
1344			 * are empty. At this point we have empty receive_queue.
1345			 * But prequeue _can_ be not empty after 2nd iteration,
1346			 * when we jumped to start of loop because backlog
1347			 * processing added something to receive_queue.
1348			 * We cannot release_sock(), because backlog contains
1349			 * packets arrived _after_ prequeued ones.
1350			 *
1351			 * In short, the algorithm is clear: process all
1352			 * the queues in order. We could do it more directly,
1353			 * requeueing packets from the backlog to the prequeue if
1354			 * it is not empty. That is more elegant, but eats cycles,
1355			 * unfortunately.
1356			 */
1357			if (!skb_queue_empty(&tp->ucopy.prequeue))
1358				goto do_prequeue;
1359
1360			/* __ Set realtime policy in scheduler __ */
1361		}
1362
1363		if (copied >= target) {
1364			/* Do not sleep, just process backlog. */
1365			release_sock(sk);
1366			lock_sock(sk);
1367		} else
1368			sk_wait_data(sk, &timeo);
1369
1370		if (user_recv) {
1371			int chunk;
1372
1373			/* __ Restore normal policy in scheduler __ */
1374
1375			if ((chunk = len - tp->ucopy.len) != 0) {
1376				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1377				len -= chunk;
1378				copied += chunk;
1379			}
1380
1381			if (tp->rcv_nxt == tp->copied_seq &&
1382			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1383do_prequeue:
1384				tcp_prequeue_process(sk);
1385
1386				if ((chunk = len - tp->ucopy.len) != 0) {
1387					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1388					len -= chunk;
1389					copied += chunk;
1390				}
1391			}
1392		}
1393		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1394			if (net_ratelimit())
1395				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1396				       current->comm, current->pid);
1397			peek_seq = tp->copied_seq;
1398		}
1399		continue;
1400
1401	found_ok_skb:
1402		/* Ok so how much can we use? */
1403		used = skb->len - offset;
1404		if (len < used)
1405			used = len;
1406
1407		/* Do we have urgent data here? */
1408		if (tp->urg_data) {
1409			u32 urg_offset = tp->urg_seq - *seq;
1410			if (urg_offset < used) {
1411				if (!urg_offset) {
1412					if (!sock_flag(sk, SOCK_URGINLINE)) {
1413						++*seq;
1414						offset++;
1415						used--;
1416						if (!used)
1417							goto skip_copy;
1418					}
1419				} else
1420					used = urg_offset;
1421			}
1422		}
1423
1424		if (!(flags & MSG_TRUNC)) {
1425			err = skb_copy_datagram_iovec(skb, offset,
1426						      msg->msg_iov, used);
1427			if (err) {
1428				/* Exception. Bailout! */
1429				if (!copied)
1430					copied = -EFAULT;
1431				break;
1432			}
1433		}
1434
1435		*seq += used;
1436		copied += used;
1437		len -= used;
1438
1439		tcp_rcv_space_adjust(sk);
1440
1441skip_copy:
1442		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1443			tp->urg_data = 0;
1444			tcp_fast_path_check(sk, tp);
1445		}
1446		if (used + offset < skb->len)
1447			continue;
1448
1449		if (skb->h.th->fin)
1450			goto found_fin_ok;
1451		if (!(flags & MSG_PEEK))
1452			sk_eat_skb(sk, skb);
1453		continue;
1454
1455	found_fin_ok:
1456		/* Process the FIN. */
1457		++*seq;
1458		if (!(flags & MSG_PEEK))
1459			sk_eat_skb(sk, skb);
1460		break;
1461	} while (len > 0);
1462
1463	if (user_recv) {
1464		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1465			int chunk;
1466
1467			tp->ucopy.len = copied > 0 ? len : 0;
1468
1469			tcp_prequeue_process(sk);
1470
1471			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1472				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1473				len -= chunk;
1474				copied += chunk;
1475			}
1476		}
1477
1478		tp->ucopy.task = NULL;
1479		tp->ucopy.len = 0;
1480	}
1481
1482	/* According to UNIX98, msg_name/msg_namelen are ignored
1483	 * on a connected socket. I was just happy when I found this 8) --ANK
1484	 */
1485
1486	/* Clean up data we have read: This will do ACK frames. */
1487	cleanup_rbuf(sk, copied);
1488
1489	TCP_CHECK_TIMER(sk);
1490	release_sock(sk);
1491	return copied;
1492
1493out:
1494	TCP_CHECK_TIMER(sk);
1495	release_sock(sk);
1496	return err;
1497
1498recv_urg:
1499	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1500	goto out;
1501}
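/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): the receive flags and the "target" low-water mark handled by
 * tcp_recvmsg() above, on a connected TCP socket "fd".
 *
 *	char buf[4096];
 *	ssize_t n;
 *	int lowat = 1024;
 *
 *	Peek without consuming; copied_seq is not advanced:
 *	n = recv(fd, buf, sizeof(buf), MSG_PEEK);
 *
 *	Wait for the full request (or EOF, error, signal); with MSG_WAITALL
 *	sock_rcvlowat() makes "target" equal to the whole length:
 *	n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *
 *	Raise the low-water mark so a plain recv() does not return until at
 *	least 1 KB (or EOF) is available:
 *	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	n = recv(fd, buf, sizeof(buf), 0);
 */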
1502
1503/*
1504 *	State processing on a close. This implements the state shift for
1505 *	sending our FIN frame. Note that we only send a FIN for some
1506 *	states. A shutdown() may have already sent the FIN, or we may be
1507 *	closed.
1508 */
1509
1510static unsigned char new_state[16] = {
1511  /* current state:        new state:      action:	*/
1512  /* (Invalid)		*/ TCP_CLOSE,
1513  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1514  /* TCP_SYN_SENT	*/ TCP_CLOSE,
1515  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1517  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1518  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1519  /* TCP_CLOSE		*/ TCP_CLOSE,
1520  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1521  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1522  /* TCP_LISTEN		*/ TCP_CLOSE,
1523  /* TCP_CLOSING	*/ TCP_CLOSING,
1524};
1525
1526static int tcp_close_state(struct sock *sk)
1527{
1528	int next = (int)new_state[sk->sk_state];
1529	int ns = next & TCP_STATE_MASK;
1530
1531	tcp_set_state(sk, ns);
1532
1533	return next & TCP_ACTION_FIN;
1534}
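/*
 * Worked example for the new_state[] table above: on an ESTABLISHED socket
 * tcp_close_state() reads TCP_FIN_WAIT1 | TCP_ACTION_FIN, so the socket is
 * moved to FIN_WAIT1 and the non-zero return tells the caller
 * (tcp_shutdown()/tcp_close()) to queue a FIN.  From CLOSE_WAIT the entry
 * is TCP_LAST_ACK | TCP_ACTION_FIN, while SYN_SENT or LISTEN simply map to
 * TCP_CLOSE with no FIN to send.
 */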
1535
1536/*
1537 *	Shutdown the sending side of a connection. Much like close except
1538 *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1539 */
1540
1541void tcp_shutdown(struct sock *sk, int how)
1542{
1543	/*	We need to grab some memory, and put together a FIN,
1544	 *	and then put it into the queue to be sent.
1545	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1546	 */
1547	if (!(how & SEND_SHUTDOWN))
1548		return;
1549
1550	/* If we've already sent a FIN, or it's a closed state, skip this. */
1551	if ((1 << sk->sk_state) &
1552	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1553	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1554		/* Clear out any half completed packets.  FIN if needed. */
1555		if (tcp_close_state(sk))
1556			tcp_send_fin(sk);
1557	}
1558}
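/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): the half-close that tcp_shutdown() implements.  Only the send
 * direction (SHUT_WR -> SEND_SHUTDOWN) emits a FIN; the receive side stays
 * open so the peer's reply can still be drained.  consume() is a
 * hypothetical application routine.
 *
 *	shutdown(fd, SHUT_WR);				we are done sending
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);			until the peer's FIN (EOF)
 *	close(fd);
 */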
1559
1560/*
1561 * At this point, there should be no process reference to this
1562 * socket, and thus no user references at all.  Therefore we
1563 * can assume the socket waitqueue is inactive and nobody will
1564 * try to jump onto it.
1565 */
1566void tcp_destroy_sock(struct sock *sk)
1567{
1568	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1569	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1570
1571	/* It cannot be in hash table! */
1572	BUG_TRAP(sk_unhashed(sk));
1573
1574	/* If inet_sk(sk)->num is non-zero, it must be bound */
1575	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1576
1577	sk->sk_prot->destroy(sk);
1578
1579	sk_stream_kill_queues(sk);
1580
1581	xfrm_sk_free_policy(sk);
1582
1583#ifdef INET_REFCNT_DEBUG
1584	if (atomic_read(&sk->sk_refcnt) != 1) {
1585		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1586		       sk, atomic_read(&sk->sk_refcnt));
1587	}
1588#endif
1589
1590	atomic_dec(&tcp_orphan_count);
1591	sock_put(sk);
1592}
1593
1594void tcp_close(struct sock *sk, long timeout)
1595{
1596	struct sk_buff *skb;
1597	int data_was_unread = 0;
1598
1599	lock_sock(sk);
1600	sk->sk_shutdown = SHUTDOWN_MASK;
1601
1602	if (sk->sk_state == TCP_LISTEN) {
1603		tcp_set_state(sk, TCP_CLOSE);
1604
1605		/* Special case. */
1606		tcp_listen_stop(sk);
1607
1608		goto adjudge_to_death;
1609	}
1610
1611	/*  We need to flush the recv. buffs.  We do this only on the
1612	 *  descriptor close, not protocol-sourced closes, because the
1613	 *  reader process may not have drained the data yet!
1614	 */
1615	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1616		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1617			  skb->h.th->fin;
1618		data_was_unread += len;
1619		__kfree_skb(skb);
1620	}
1621
1622	sk_stream_mem_reclaim(sk);
1623
1624	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1625	 * 3.10, we send a RST here because data was lost.  To
1626	 * witness the awful effects of the old behavior of always
1627	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1628	 * a bulk GET in an FTP client, suspend the process, wait
1629	 * for the client to advertise a zero window, then kill -9
1630	 * the FTP client, wheee...  Note: timeout is always zero
1631	 * in such a case.
1632	 */
1633	if (data_was_unread) {
1634		/* Unread data was tossed, zap the connection. */
1635		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1636		tcp_set_state(sk, TCP_CLOSE);
1637		tcp_send_active_reset(sk, GFP_KERNEL);
1638	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1639		/* Check zero linger _after_ checking for unread data. */
1640		sk->sk_prot->disconnect(sk, 0);
1641		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1642	} else if (tcp_close_state(sk)) {
1643		/* We FIN if the application ate all the data before
1644		 * zapping the connection.
1645		 */
1646
1647		/* RED-PEN. Formally speaking, we have broken TCP state
1648		 * machine. State transitions:
1649		 *
1650		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1651		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1652		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1653		 *
1654		 * are legal only when FIN has been sent (i.e. in window),
1655		 * rather than queued out of window. Purists blame.
1656		 *
1657		 * F.e. "RFC state" is ESTABLISHED,
1658		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1659		 *
1660		 * The visible deviations are that sometimes
1661		 * we enter time-wait state, when it is not required really
1662		 * (harmless), do not send active resets, when they are
1663		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1664		 * they look as CLOSING or LAST_ACK for Linux)
1665		 * Probably, I missed some more holelets.
1666		 * 						--ANK
1667		 */
1668		tcp_send_fin(sk);
1669	}
1670
1671	sk_stream_wait_close(sk, timeout);
1672
1673adjudge_to_death:
1674	/* It is the last release_sock in its life. It will remove backlog. */
1675	release_sock(sk);
1676
1677
1678	/* Now socket is owned by kernel and we acquire BH lock
1679	   to finish close. No need to check for user refs.
1680	 */
1681	local_bh_disable();
1682	bh_lock_sock(sk);
1683	BUG_TRAP(!sock_owned_by_user(sk));
1684
1685	sock_hold(sk);
1686	sock_orphan(sk);
1687
1688	/*	This is a (useful) BSD violation of the RFC. There is a
1689	 *	problem with TCP as specified in that the other end could
1690	 *	keep a socket open forever with no application left at this end.
1691	 *	We use a 3 minute timeout (about the same as BSD) then kill
1692	 *	our end. If they send after that then tough - BUT: long enough
1693	 *	that we won't make the old 4*rto = almost no time - whoops
1694	 *	reset mistake.
1695	 *
1696	 *	Nope, it was not a mistake. It is really the desired behaviour
1697	 *	f.e. on http servers, when such sockets are useless, but
1698	 *	consume significant resources. Let's do it with special
1699	 *	linger2	option.					--ANK
1700	 */
1701
1702	if (sk->sk_state == TCP_FIN_WAIT2) {
1703		struct tcp_sock *tp = tcp_sk(sk);
1704		if (tp->linger2 < 0) {
1705			tcp_set_state(sk, TCP_CLOSE);
1706			tcp_send_active_reset(sk, GFP_ATOMIC);
1707			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1708		} else {
1709			int tmo = tcp_fin_time(tp);
1710
1711			if (tmo > TCP_TIMEWAIT_LEN) {
1712				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1713			} else {
1714				atomic_inc(&tcp_orphan_count);
1715				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1716				goto out;
1717			}
1718		}
1719	}
1720	if (sk->sk_state != TCP_CLOSE) {
1721		sk_stream_mem_reclaim(sk);
1722		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1723		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1724		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1725			if (net_ratelimit())
1726				printk(KERN_INFO "TCP: too many of orphaned "
1727				       "sockets\n");
1728			tcp_set_state(sk, TCP_CLOSE);
1729			tcp_send_active_reset(sk, GFP_ATOMIC);
1730			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1731		}
1732	}
1733	atomic_inc(&tcp_orphan_count);
1734
1735	if (sk->sk_state == TCP_CLOSE)
1736		tcp_destroy_sock(sk);
1737	/* Otherwise, socket is reprieved until protocol close. */
1738
1739out:
1740	bh_unlock_sock(sk);
1741	local_bh_enable();
1742	sock_put(sk);
1743}
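/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): the close-time behaviours selected in tcp_close() above.
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *	int fin_timeout = 10;
 *
 *	Abortive close (the zero-linger branch above, RFC 793 ABORT): close()
 *	discards queued data and sends a RST instead of a FIN:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);
 *
 *	Tune the orphaned FIN_WAIT2 handling after adjudge_to_death; the
 *	value is tp->linger2 in seconds, and a negative value means send an
 *	active reset instead of waiting:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_timeout, sizeof(fin_timeout));
 */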
1744
1745/* These states need RST on ABORT according to RFC793 */
1746
1747static inline int tcp_need_reset(int state)
1748{
1749	return (1 << state) &
1750	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1751		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1752}
1753
1754int tcp_disconnect(struct sock *sk, int flags)
1755{
1756	struct inet_sock *inet = inet_sk(sk);
1757	struct tcp_sock *tp = tcp_sk(sk);
1758	int err = 0;
1759	int old_state = sk->sk_state;
1760
1761	if (old_state != TCP_CLOSE)
1762		tcp_set_state(sk, TCP_CLOSE);
1763
1764	/* ABORT function of RFC793 */
1765	if (old_state == TCP_LISTEN) {
1766		tcp_listen_stop(sk);
1767	} else if (tcp_need_reset(old_state) ||
1768		   (tp->snd_nxt != tp->write_seq &&
1769		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1770		/* The last check adjusts for the discrepancy between Linux and
1771		 * the RFC states
1772		 */
1773		tcp_send_active_reset(sk, gfp_any());
1774		sk->sk_err = ECONNRESET;
1775	} else if (old_state == TCP_SYN_SENT)
1776		sk->sk_err = ECONNRESET;
1777
1778	tcp_clear_xmit_timers(sk);
1779	__skb_queue_purge(&sk->sk_receive_queue);
1780	sk_stream_writequeue_purge(sk);
1781	__skb_queue_purge(&tp->out_of_order_queue);
1782
1783	inet->dport = 0;
1784
1785	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1786		inet_reset_saddr(sk);
1787
1788	sk->sk_shutdown = 0;
1789	sock_reset_flag(sk, SOCK_DONE);
1790	tp->srtt = 0;
1791	if ((tp->write_seq += tp->max_window + 2) == 0)
1792		tp->write_seq = 1;
1793	tp->backoff = 0;
1794	tp->snd_cwnd = 2;
1795	tp->probes_out = 0;
1796	tp->packets_out = 0;
1797	tp->snd_ssthresh = 0x7fffffff;
1798	tp->snd_cwnd_cnt = 0;
1799	tcp_set_ca_state(tp, TCP_CA_Open);
1800	tcp_clear_retrans(tp);
1801	tcp_delack_init(tp);
1802	sk->sk_send_head = NULL;
1803	tp->rx_opt.saw_tstamp = 0;
1804	tcp_sack_reset(&tp->rx_opt);
1805	__sk_dst_reset(sk);
1806
1807	BUG_TRAP(!inet->num || tp->bind_hash);
1808
1809	sk->sk_error_report(sk);
1810	return err;
1811}
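/*
 * Usage sketch (userspace, illustrative only, not part of the original
 * file): tcp_disconnect() is reached when an application dissolves an
 * association by connecting to AF_UNSPEC, which resets the socket to a
 * fresh CLOSEd state (sending a RST first if the old state required one,
 * per tcp_need_reset() above).
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 */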
1812
1813/*
1814 *	Wait for an incoming connection, avoid race
1815 *	conditions. This must be called with the socket locked.
1816 */
1817static int wait_for_connect(struct sock *sk, long timeo)
1818{
1819	struct tcp_sock *tp = tcp_sk(sk);
1820	DEFINE_WAIT(wait);
1821	int err;
1822
1823	/*
1824	 * True wake-one mechanism for incoming connections: only
1825	 * one process gets woken up, not the 'whole herd'.
1826	 * Since we do not 'race & poll' for established sockets
1827	 * anymore, the common case will execute the loop only once.
1828	 *
1829	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1830	 * after any current non-exclusive waiters, and we know that
1831	 * it will always _stay_ after any new non-exclusive waiters
1832	 * because all non-exclusive waiters are added at the
1833	 * beginning of the wait-queue. As such, it's ok to "drop"
1834	 * our exclusiveness temporarily when we get woken up without
1835	 * having to remove and re-insert us on the wait queue.
1836	 */
1837	for (;;) {
1838		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1839					  TASK_INTERRUPTIBLE);
1840		release_sock(sk);
1841		if (reqsk_queue_empty(&tp->accept_queue))
1842			timeo = schedule_timeout(timeo);
1843		lock_sock(sk);
1844		err = 0;
1845		if (!reqsk_queue_empty(&tp->accept_queue))
1846			break;
1847		err = -EINVAL;
1848		if (sk->sk_state != TCP_LISTEN)
1849			break;
1850		err = sock_intr_errno(timeo);
1851		if (signal_pending(current))
1852			break;
1853		err = -EAGAIN;
1854		if (!timeo)
1855			break;
1856	}
1857	finish_wait(sk->sk_sleep, &wait);
1858	return err;
1859}
1860
1861/*
1862 *	This will accept the next outstanding connection.
1863 */
1864
1865struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1866{
1867	struct tcp_sock *tp = tcp_sk(sk);
1868	struct sock *newsk;
1869	int error;
1870
1871	lock_sock(sk);
1872
1873	/* We need to make sure that this socket is listening,
1874	 * and that it has something pending.
1875	 */
1876	error = -EINVAL;
1877	if (sk->sk_state != TCP_LISTEN)
1878		goto out_err;
1879
1880	/* Find already established connection */
1881	if (reqsk_queue_empty(&tp->accept_queue)) {
1882		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1883
1884		/* If this is a non-blocking socket, don't sleep */
1885		error = -EAGAIN;
1886		if (!timeo)
1887			goto out_err;
1888
1889		error = wait_for_connect(sk, timeo);
1890		if (error)
1891			goto out_err;
1892	}
1893
1894	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1895	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1896out:
1897	release_sock(sk);
1898	return newsk;
1899out_err:
1900	newsk = NULL;
1901	*err = error;
1902	goto out;
1903}
1904
1905/*
1906 *	Socket option code for TCP.
1907 */
1908int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1909		   int optlen)
1910{
1911	struct tcp_sock *tp = tcp_sk(sk);
1912	int val;
1913	int err = 0;
1914
1915	if (level != SOL_TCP)
1916		return tp->af_specific->setsockopt(sk, level, optname,
1917						   optval, optlen);
1918
1919	/* This is a string value; all the others are ints */
1920	if (optname == TCP_CONGESTION) {
1921		char name[TCP_CA_NAME_MAX];
1922
1923		if (optlen < 1)
1924			return -EINVAL;
1925
1926		val = strncpy_from_user(name, optval,
1927					min(TCP_CA_NAME_MAX-1, optlen));
1928		if (val < 0)
1929			return -EFAULT;
1930		name[val] = 0;
1931
1932		lock_sock(sk);
1933		err = tcp_set_congestion_control(tp, name);
1934		release_sock(sk);
1935		return err;
1936	}
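	/*
	 * Illustrative userspace sketch (not compiled here): the string
	 * option is set with an ordinary setsockopt(); "reno" is assumed
	 * to be available since it is registered in tcp_init() below.
	 * Depending on the libc, TCP_CONGESTION may have to be taken
	 * from <linux/tcp.h> rather than <netinet/tcp.h>.
	 *
	 *	const char *ca = "reno";
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, strlen(ca));
	 */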
1937
1938	if (optlen < sizeof(int))
1939		return -EINVAL;
1940
1941	if (get_user(val, (int __user *)optval))
1942		return -EFAULT;
1943
1944	lock_sock(sk);
1945
1946	switch (optname) {
1947	case TCP_MAXSEG:
1948		/* Values greater than the interface MTU won't take effect.
1949		 * However, at the point when this call is made we typically
1950		 * don't yet know which interface is going to be used. */
1951		if (val < 8 || val > MAX_TCP_WINDOW) {
1952			err = -EINVAL;
1953			break;
1954		}
1955		tp->rx_opt.user_mss = val;
1956		break;
1957
1958	case TCP_NODELAY:
1959		if (val) {
1960			/* TCP_NODELAY is weaker than TCP_CORK, so setting
1961			 * this option on a corked socket is remembered, but
1962			 * it is not activated until the cork is cleared.
1963			 *
1964			 * However, when TCP_NODELAY is set we make
1965			 * an explicit push, which overrides even TCP_CORK
1966			 * for currently queued segments.
1967			 */
1968			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1969			tcp_push_pending_frames(sk, tp);
1970		} else {
1971			tp->nonagle &= ~TCP_NAGLE_OFF;
1972		}
1973		break;
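	/*
	 * Illustrative userspace sketch (not compiled here): disabling
	 * Nagle from an application is simply
	 *
	 *	int one = 1;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
	 *
	 * which ends up in the case above and, per the comment, also
	 * pushes out anything already queued on a corked socket.
	 */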
1974
1975	case TCP_CORK:
1976		/* When set, this indicates that non-full frames should always
1977		 * be queued.  Later the user clears this option and we
1978		 * transmit any pending partial frames in the queue.  This is
1979		 * meant to be used alongside sendfile() to get properly
1980		 * filled frames when the user (for example) must write
1981		 * out headers with a write() call first and then use
1982		 * sendfile() to send out the data parts.
1983		 *
1984		 * TCP_CORK can be set together with TCP_NODELAY and it is
1985		 * stronger than TCP_NODELAY.
1986		 */
1987		if (val) {
1988			tp->nonagle |= TCP_NAGLE_CORK;
1989		} else {
1990			tp->nonagle &= ~TCP_NAGLE_CORK;
1991			if (tp->nonagle&TCP_NAGLE_OFF)
1992				tp->nonagle |= TCP_NAGLE_PUSH;
1993			tcp_push_pending_frames(sk, tp);
1994		}
1995		break;
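	/*
	 * Illustrative userspace sketch (not compiled here) of the
	 * header-plus-sendfile() pattern described above; fd is the
	 * socket, hdr/hdr_len/filefd/filesize are assumed, and error
	 * handling is omitted:
	 *
	 *	int on = 1, off = 0;
	 *	off_t offset = 0;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdr_len);
	 *	sendfile(fd, filefd, &offset, filesize);
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 *
	 * Clearing the cork at the end is what lets the final partial
	 * frame go out.
	 */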
1996
1997	case TCP_KEEPIDLE:
1998		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1999			err = -EINVAL;
2000		else {
2001			tp->keepalive_time = val * HZ;
2002			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2003			    !((1 << sk->sk_state) &
2004			      (TCPF_CLOSE | TCPF_LISTEN))) {
2005				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2006				if (tp->keepalive_time > elapsed)
2007					elapsed = tp->keepalive_time - elapsed;
2008				else
2009					elapsed = 0;
2010				tcp_reset_keepalive_timer(sk, elapsed);
2011			}
2012		}
2013		break;
2014	case TCP_KEEPINTVL:
2015		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2016			err = -EINVAL;
2017		else
2018			tp->keepalive_intvl = val * HZ;
2019		break;
2020	case TCP_KEEPCNT:
2021		if (val < 1 || val > MAX_TCP_KEEPCNT)
2022			err = -EINVAL;
2023		else
2024			tp->keepalive_probes = val;
2025		break;
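	/*
	 * Illustrative userspace sketch (not compiled here): the three
	 * keepalive options above take seconds (or a probe count) from
	 * the application and only matter once SO_KEEPALIVE is enabled
	 * on the socket.
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */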
2026	case TCP_SYNCNT:
2027		if (val < 1 || val > MAX_TCP_SYNCNT)
2028			err = -EINVAL;
2029		else
2030			tp->syn_retries = val;
2031		break;
2032
2033	case TCP_LINGER2:
2034		if (val < 0)
2035			tp->linger2 = -1;
2036		else if (val > sysctl_tcp_fin_timeout / HZ)
2037			tp->linger2 = 0;
2038		else
2039			tp->linger2 = val * HZ;
2040		break;
2041
2042	case TCP_DEFER_ACCEPT:
2043		tp->defer_accept = 0;
2044		if (val > 0) {
2045			/* Translate the value in seconds into a number of
2046			 * retransmits */
2047			while (tp->defer_accept < 32 &&
2048			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2049				       tp->defer_accept))
2050				tp->defer_accept++;
2051			tp->defer_accept++;
2052		}
2053		break;
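	/*
	 * Worked example of the translation above, assuming the customary
	 * TCP_TIMEOUT_INIT of 3*HZ (TCP_TIMEOUT_INIT / HZ == 3): a request
	 * of val = 10 seconds walks the doubling series 3, 6, 12;
	 * defer_accept is incremented while val exceeds the current step
	 * (10 > 3, 10 > 6, but not 10 > 12), giving 2, and the final ++
	 * makes it 3.  TCP_DEFER_ACCEPT in getsockopt() below then reports
	 * 3 << (3 - 1) = 12 seconds, the retransmission window actually
	 * covered.
	 */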
2054
2055	case TCP_WINDOW_CLAMP:
2056		if (!val) {
2057			if (sk->sk_state != TCP_CLOSE) {
2058				err = -EINVAL;
2059				break;
2060			}
2061			tp->window_clamp = 0;
2062		} else
2063			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2064						SOCK_MIN_RCVBUF / 2 : val;
2065		break;
2066
2067	case TCP_QUICKACK:
2068		if (!val) {
2069			tp->ack.pingpong = 1;
2070		} else {
2071			tp->ack.pingpong = 0;
2072			if ((1 << sk->sk_state) &
2073			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2074			    tcp_ack_scheduled(tp)) {
2075				tp->ack.pending |= TCP_ACK_PUSHED;
2076				cleanup_rbuf(sk, 1);
2077				if (!(val & 1))
2078					tp->ack.pingpong = 1;
2079			}
2080		}
2081		break;
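	/*
	 * Worked example of the value handling above: val = 1 clears
	 * pingpong mode and leaves it cleared, so delayed ACKs stay off;
	 * val = 2 (any even non-zero value) also clears it, but if an ACK
	 * is already scheduled on an established socket it is pushed out
	 * via cleanup_rbuf() and pingpong is then restored because
	 * !(val & 1), making that quick ACK effectively one-shot.
	 */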
2082
2083	default:
2084		err = -ENOPROTOOPT;
2085		break;
2086	}
2087	release_sock(sk);
2088	return err;
2089}
2090
2091/* Return information about state of tcp endpoint in API format. */
2092void tcp_get_info(struct sock *sk, struct tcp_info *info)
2093{
2094	struct tcp_sock *tp = tcp_sk(sk);
2095	u32 now = tcp_time_stamp;
2096
2097	memset(info, 0, sizeof(*info));
2098
2099	info->tcpi_state = sk->sk_state;
2100	info->tcpi_ca_state = tp->ca_state;
2101	info->tcpi_retransmits = tp->retransmits;
2102	info->tcpi_probes = tp->probes_out;
2103	info->tcpi_backoff = tp->backoff;
2104
2105	if (tp->rx_opt.tstamp_ok)
2106		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2107	if (tp->rx_opt.sack_ok)
2108		info->tcpi_options |= TCPI_OPT_SACK;
2109	if (tp->rx_opt.wscale_ok) {
2110		info->tcpi_options |= TCPI_OPT_WSCALE;
2111		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2112		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2113	}
2114
2115	if (tp->ecn_flags&TCP_ECN_OK)
2116		info->tcpi_options |= TCPI_OPT_ECN;
2117
2118	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2119	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2120	info->tcpi_snd_mss = tp->mss_cache;
2121	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2122
2123	info->tcpi_unacked = tp->packets_out;
2124	info->tcpi_sacked = tp->sacked_out;
2125	info->tcpi_lost = tp->lost_out;
2126	info->tcpi_retrans = tp->retrans_out;
2127	info->tcpi_fackets = tp->fackets_out;
2128
2129	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2130	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2131	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2132
2133	info->tcpi_pmtu = tp->pmtu_cookie;
2134	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2135	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2136	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2137	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2138	info->tcpi_snd_cwnd = tp->snd_cwnd;
2139	info->tcpi_advmss = tp->advmss;
2140	info->tcpi_reordering = tp->reordering;
2141
2142	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2143	info->tcpi_rcv_space = tp->rcvq_space.space;
2144
2145	info->tcpi_total_retrans = tp->total_retrans;
2146}
2147
2148EXPORT_SYMBOL_GPL(tcp_get_info);
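/*
 * Illustrative userspace sketch (not compiled here): the structure
 * filled in above is what applications read back through the TCP_INFO
 * getsockopt() handled below.  Depending on the libc, struct tcp_info
 * may have to come from <linux/tcp.h> rather than <netinet/tcp.h>.
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u segments\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd);
 */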
2149
2150int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2151		   int __user *optlen)
2152{
2153	struct tcp_sock *tp = tcp_sk(sk);
2154	int val, len;
2155
2156	if (level != SOL_TCP)
2157		return tp->af_specific->getsockopt(sk, level, optname,
2158						   optval, optlen);
2159
2160	if (get_user(len, optlen))
2161		return -EFAULT;
2162
2163	len = min_t(unsigned int, len, sizeof(int));
2164
2165	if (len < 0)
2166		return -EINVAL;
2167
2168	switch (optname) {
2169	case TCP_MAXSEG:
2170		val = tp->mss_cache;
2171		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2172			val = tp->rx_opt.user_mss;
2173		break;
2174	case TCP_NODELAY:
2175		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2176		break;
2177	case TCP_CORK:
2178		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2179		break;
2180	case TCP_KEEPIDLE:
2181		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2182		break;
2183	case TCP_KEEPINTVL:
2184		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2185		break;
2186	case TCP_KEEPCNT:
2187		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2188		break;
2189	case TCP_SYNCNT:
2190		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2191		break;
2192	case TCP_LINGER2:
2193		val = tp->linger2;
2194		if (val >= 0)
2195			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2196		break;
2197	case TCP_DEFER_ACCEPT:
2198		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2199					       (tp->defer_accept - 1));
2200		break;
2201	case TCP_WINDOW_CLAMP:
2202		val = tp->window_clamp;
2203		break;
2204	case TCP_INFO: {
2205		struct tcp_info info;
2206
2207		if (get_user(len, optlen))
2208			return -EFAULT;
2209
2210		tcp_get_info(sk, &info);
2211
2212		len = min_t(unsigned int, len, sizeof(info));
2213		if (put_user(len, optlen))
2214			return -EFAULT;
2215		if (copy_to_user(optval, &info, len))
2216			return -EFAULT;
2217		return 0;
2218	}
2219	case TCP_QUICKACK:
2220		val = !tp->ack.pingpong;
2221		break;
2222
2223	case TCP_CONGESTION:
2224		if (get_user(len, optlen))
2225			return -EFAULT;
2226		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2227		if (put_user(len, optlen))
2228			return -EFAULT;
2229		if (copy_to_user(optval, tp->ca_ops->name, len))
2230			return -EFAULT;
2231		return 0;
2232	default:
2233		return -ENOPROTOOPT;
2234	}
2235
2236	if (put_user(len, optlen))
2237		return -EFAULT;
2238	if (copy_to_user(optval, &val, len))
2239		return -EFAULT;
2240	return 0;
2241}
2242
2243
2244extern void __skb_cb_too_small_for_tcp(int, int);
2245extern struct tcp_congestion_ops tcp_reno;
2246
2247static __initdata unsigned long thash_entries;
2248static int __init set_thash_entries(char *str)
2249{
2250	if (!str)
2251		return 0;
2252	thash_entries = simple_strtoul(str, &str, 0);
2253	return 1;
2254}
2255__setup("thash_entries=", set_thash_entries);
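/*
 * With the __setup() hook above, the size of the established hash can
 * be pinned from the kernel command line, e.g. "thash_entries=131072"
 * (an illustrative value); when the parameter is absent, thash_entries
 * stays 0 and alloc_large_system_hash() below picks a size from the
 * amount of memory in the machine.
 */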
2256
2257void __init tcp_init(void)
2258{
2259	struct sk_buff *skb = NULL;
2260	int order, i;
2261
2262	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2263		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2264					   sizeof(skb->cb));
2265
2266	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2267					      sizeof(struct tcp_bind_bucket),
2268					      0, SLAB_HWCACHE_ALIGN,
2269					      NULL, NULL);
2270	if (!tcp_bucket_cachep)
2271		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2272
2273	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2274						sizeof(struct tcp_tw_bucket),
2275						0, SLAB_HWCACHE_ALIGN,
2276						NULL, NULL);
2277	if (!tcp_timewait_cachep)
2278		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2279
2280	/* Size and allocate the main established and bind bucket
2281	 * hash tables.
2282	 *
2283	 * The methodology is similar to that of the buffer cache.
2284	 */
2285	tcp_ehash = (struct tcp_ehash_bucket *)
2286		alloc_large_system_hash("TCP established",
2287					sizeof(struct tcp_ehash_bucket),
2288					thash_entries,
2289					(num_physpages >= 128 * 1024) ?
2290						(25 - PAGE_SHIFT) :
2291						(27 - PAGE_SHIFT),
2292					HASH_HIGHMEM,
2293					&tcp_ehash_size,
2294					NULL,
2295					0);
2296	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2297	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2298		rwlock_init(&tcp_ehash[i].lock);
2299		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2300	}
2301
2302	tcp_bhash = (struct tcp_bind_hashbucket *)
2303		alloc_large_system_hash("TCP bind",
2304					sizeof(struct tcp_bind_hashbucket),
2305					tcp_ehash_size,
2306					(num_physpages >= 128 * 1024) ?
2307						(25 - PAGE_SHIFT) :
2308						(27 - PAGE_SHIFT),
2309					HASH_HIGHMEM,
2310					&tcp_bhash_size,
2311					NULL,
2312					64 * 1024);
2313	tcp_bhash_size = 1 << tcp_bhash_size;
2314	for (i = 0; i < tcp_bhash_size; i++) {
2315		spin_lock_init(&tcp_bhash[i].lock);
2316		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2317	}
2318
2319	/* Try to be a bit smarter and adjust defaults depending
2320	 * on available memory.
2321	 */
2322	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2323			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2324			order++)
2325		;
2326	if (order >= 4) {
2327		sysctl_local_port_range[0] = 32768;
2328		sysctl_local_port_range[1] = 61000;
2329		sysctl_tcp_max_tw_buckets = 180000;
2330		sysctl_tcp_max_orphans = 4096 << (order - 4);
2331		sysctl_max_syn_backlog = 1024;
2332	} else if (order < 3) {
2333		sysctl_local_port_range[0] = 1024 * (3 - order);
2334		sysctl_tcp_max_tw_buckets >>= (3 - order);
2335		sysctl_tcp_max_orphans >>= (3 - order);
2336		sysctl_max_syn_backlog = 128;
2337	}
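	/*
	 * Worked example of the loop above, assuming 4 KB pages and a
	 * 16-byte struct tcp_bind_hashbucket: with tcp_bhash_size of
	 * 65536 the bind hash occupies 1 MB, the loop stops at order 8
	 * ((1 << 8) pages == 1 MB), and the "order >= 4" branch picks
	 * the larger defaults: local ports 32768-61000, 180000 time-wait
	 * buckets, 4096 << 4 orphans and a SYN backlog of 1024.
	 */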
2338	tcp_port_rover = sysctl_local_port_range[0] - 1;
2339
2340	sysctl_tcp_mem[0] =  768 << order;
2341	sysctl_tcp_mem[1] = 1024 << order;
2342	sysctl_tcp_mem[2] = 1536 << order;
2343
2344	if (order < 3) {
2345		sysctl_tcp_wmem[2] = 64 * 1024;
2346		sysctl_tcp_rmem[0] = PAGE_SIZE;
2347		sysctl_tcp_rmem[1] = 43689;
2348		sysctl_tcp_rmem[2] = 2 * 43689;
2349	}
2350
2351	printk(KERN_INFO "TCP: Hash tables configured "
2352	       "(established %d bind %d)\n",
2353	       tcp_ehash_size << 1, tcp_bhash_size);
2354
2355	tcp_register_congestion_control(&tcp_reno);
2356}
2357
2358EXPORT_SYMBOL(tcp_accept);
2359EXPORT_SYMBOL(tcp_close);
2360EXPORT_SYMBOL(tcp_destroy_sock);
2361EXPORT_SYMBOL(tcp_disconnect);
2362EXPORT_SYMBOL(tcp_getsockopt);
2363EXPORT_SYMBOL(tcp_ioctl);
2364EXPORT_SYMBOL(tcp_poll);
2365EXPORT_SYMBOL(tcp_read_sock);
2366EXPORT_SYMBOL(tcp_recvmsg);
2367EXPORT_SYMBOL(tcp_sendmsg);
2368EXPORT_SYMBOL(tcp_sendpage);
2369EXPORT_SYMBOL(tcp_setsockopt);
2370EXPORT_SYMBOL(tcp_shutdown);
2371EXPORT_SYMBOL(tcp_statistics);
2372EXPORT_SYMBOL(tcp_timewait_cachep);
2373