tcp_output.c revision 4dfc2817025965a2fc78a18c50f540736a6b5c24
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18 *		Matthew Dillon, <dillon@apollo.west.oic.com>
19 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 *		Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23/*
24 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
25 *				:	Fragmentation on mtu decrease
26 *				:	Segment collapse on retransmit
27 *				:	AF independence
28 *
29 *		Linus Torvalds	:	send_delayed_ack
30 *		David S. Miller	:	Charge memory using the right skb
31 *					during syn/ack processing.
32 *		David S. Miller :	Output engine completely rewritten.
33 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
34 *		Cacophonix Gaul :	draft-minshall-nagle-01
35 *		J Hadi Salim	:	ECN support
36 *
37 */
38
39#include <net/tcp.h>
40
41#include <linux/compiler.h>
42#include <linux/module.h>
43
44/* People can turn this off for buggy TCPs found in printers etc. */
45int sysctl_tcp_retrans_collapse __read_mostly = 1;
46
47/* People can turn this on to work with those rare, broken TCPs that
48 * interpret the window field as a signed quantity.
49 */
50int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
51
52/* This limits the percentage of the congestion window which we
53 * will allow a single TSO frame to consume.  Building TSO frames
54 * which are too large can cause TCP streams to be bursty.
55 */
56int sysctl_tcp_tso_win_divisor __read_mostly = 3;
57
58int sysctl_tcp_mtu_probing __read_mostly = 0;
59int sysctl_tcp_base_mss __read_mostly = 512;
60
61/* By default, RFC2861 behavior.  */
62int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
63
64static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
65{
66	struct tcp_sock *tp = tcp_sk(sk);
67	unsigned int prior_packets = tp->packets_out;
68
69	tcp_advance_send_head(sk, skb);
70	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
71
72	/* Don't override Nagle indefinitely with F-RTO */
73	if (tp->frto_counter == 2)
74		tp->frto_counter = 3;
75
76	tp->packets_out += tcp_skb_pcount(skb);
77	if (!prior_packets)
78		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
79					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
80}
81
82/* SND.NXT, if the window was not shrunk.
83 * If the window has been shrunk, what should we do? It is not clear at all.
84 * Using SND.UNA we will fail to open the window, and SND.NXT is out of window. :-(
85 * Anything in between SND.UNA...SND.UNA+SND.WND can already be
86 * invalid. OK, let's do this for now:
87 */
88static inline __u32 tcp_acceptable_seq(struct sock *sk)
89{
90	struct tcp_sock *tp = tcp_sk(sk);
91
92	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
93		return tp->snd_nxt;
94	else
95		return tcp_wnd_end(tp);
96}
97
98/* Calculate mss to advertise in SYN segment.
99 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
100 *
101 * 1. It is independent of path mtu.
102 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
103 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
104 *    attached devices, because some buggy hosts are confused by
105 *    large MSS.
106 * 4. We do not do 3; we advertise the MSS calculated from the first
107 *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
108 *    This may be overridden via information stored in the routing table.
109 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
110 *    probably even Jumbo".
111 */
112static __u16 tcp_advertise_mss(struct sock *sk)
113{
114	struct tcp_sock *tp = tcp_sk(sk);
115	struct dst_entry *dst = __sk_dst_get(sk);
116	int mss = tp->advmss;
117
118	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
119		mss = dst_metric(dst, RTAX_ADVMSS);
120		tp->advmss = mss;
121	}
122
123	return (__u16)mss;
124}
125
126/* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
127 * This is the first part of the cwnd validation mechanism. */
128static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
129{
130	struct tcp_sock *tp = tcp_sk(sk);
131	s32 delta = tcp_time_stamp - tp->lsndtime;
132	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
133	u32 cwnd = tp->snd_cwnd;
134
135	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
136
137	tp->snd_ssthresh = tcp_current_ssthresh(sk);
138	restart_cwnd = min(restart_cwnd, cwnd);
139
140	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
141		cwnd >>= 1;
142	tp->snd_cwnd = max(cwnd, restart_cwnd);
143	tp->snd_cwnd_stamp = tcp_time_stamp;
144	tp->snd_cwnd_used = 0;
145}
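
/* Worked example (illustrative numbers only, not taken from the code above):
 * suppose snd_cwnd is 10, the initial/restart window computed by
 * tcp_init_cwnd() is 3, and the connection has been idle a bit more than
 * three RTOs.  The loop halves cwnd once per elapsed RTO while it stays
 * above restart_cwnd, 10 -> 5 -> 2, and the final max() clamps the result
 * back up, so the connection restarts with snd_cwnd == 3.
 */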
146
147static void tcp_event_data_sent(struct tcp_sock *tp,
148				struct sk_buff *skb, struct sock *sk)
149{
150	struct inet_connection_sock *icsk = inet_csk(sk);
151	const u32 now = tcp_time_stamp;
152
153	if (sysctl_tcp_slow_start_after_idle &&
154	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
155		tcp_cwnd_restart(sk, __sk_dst_get(sk));
156
157	tp->lsndtime = now;
158
159	/* If it is a reply within ato after the last received
160	 * packet, enter pingpong mode.
161	 */
162	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
163		icsk->icsk_ack.pingpong = 1;
164}
165
166static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
167{
168	tcp_dec_quickack_mode(sk, pkts);
169	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
170}
171
172/* Determine a window scaling and initial window to offer.
173 * Based on the assumption that the given amount of space
174 * will be offered. Store the results in the tp structure.
175 * NOTE: for smooth operation initial space offering should
176 * be a multiple of mss if possible. We assume here that mss >= 1.
177 * This MUST be enforced by all callers.
178 */
179void tcp_select_initial_window(int __space, __u32 mss,
180			       __u32 *rcv_wnd, __u32 *window_clamp,
181			       int wscale_ok, __u8 *rcv_wscale)
182{
183	unsigned int space = (__space < 0 ? 0 : __space);
184
185	/* If no clamp is set, set the clamp to the max possible scaled window */
186	if (*window_clamp == 0)
187		(*window_clamp) = (65535 << 14);
188	space = min(*window_clamp, space);
189
190	/* Quantize space offering to a multiple of mss if possible. */
191	if (space > mss)
192		space = (space / mss) * mss;
193
194	/* NOTE: offering an initial window larger than 32767
195	 * will break some buggy TCP stacks. If the admin tells us
196	 * it is likely we could be speaking with such a buggy stack
197	 * we will truncate our initial window offering to 32K-1
198	 * unless the remote has sent us a window scaling option,
199	 * which we interpret as a sign the remote TCP is not
200	 * misinterpreting the window field as a signed quantity.
201	 */
202	if (sysctl_tcp_workaround_signed_windows)
203		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
204	else
205		(*rcv_wnd) = space;
206
207	(*rcv_wscale) = 0;
208	if (wscale_ok) {
209		/* Set window scaling on max possible window
210		 * See RFC1323 for an explanation of the limit to 14
211		 */
212		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
213		space = min_t(u32, space, *window_clamp);
214		while (space > 65535 && (*rcv_wscale) < 14) {
215			space >>= 1;
216			(*rcv_wscale)++;
217		}
218	}
219
220	/* Set the initial window to a value large enough for senders
221	 * following RFC2414. Senders not following this RFC
222	 * will be satisfied with 2.
223	 */
224	if (mss > (1 << *rcv_wscale)) {
225		int init_cwnd = 4;
226		if (mss > 1460 * 3)
227			init_cwnd = 2;
228		else if (mss > 1460)
229			init_cwnd = 3;
230		if (*rcv_wnd > init_cwnd * mss)
231			*rcv_wnd = init_cwnd * mss;
232	}
233
234	/* Set the clamp no higher than max representable value */
235	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
236}
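
/* Worked example of the scaling loop above (illustrative numbers only):
 * if the larger of sysctl_tcp_rmem[2] and sysctl_rmem_max, clamped to
 * *window_clamp, is 262144 bytes (256 KB), the loop shifts
 * 262144 -> 131072 -> 65536 -> 32768 and stops, yielding rcv_wscale == 3,
 * which is enough to represent receive windows up to 65535 << 3 bytes.
 */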
237
238/* Choose a new window to advertise, update state in tcp_sock for the
239 * socket, and return result with RFC1323 scaling applied.  The return
240 * value can be stuffed directly into th->window for an outgoing
241 * frame.
242 */
243static u16 tcp_select_window(struct sock *sk)
244{
245	struct tcp_sock *tp = tcp_sk(sk);
246	u32 cur_win = tcp_receive_window(tp);
247	u32 new_win = __tcp_select_window(sk);
248
249	/* Never shrink the offered window */
250	if (new_win < cur_win) {
251		/* Danger Will Robinson!
252		 * Don't update rcv_wup/rcv_wnd here or else
253		 * we will not be able to advertise a zero
254		 * window in time.  --DaveM
255		 *
256		 * Relax Will Robinson.
257		 */
258		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
259	}
260	tp->rcv_wnd = new_win;
261	tp->rcv_wup = tp->rcv_nxt;
262
263	/* Make sure we do not exceed the maximum possible
264	 * scaled window.
265	 */
266	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
267		new_win = min(new_win, MAX_TCP_WINDOW);
268	else
269		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
270
271	/* RFC1323 scaling applied */
272	new_win >>= tp->rx_opt.rcv_wscale;
273
274	/* If we advertise zero window, disable fast path. */
275	if (new_win == 0)
276		tp->pred_flags = 0;
277
278	return new_win;
279}
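
/* Example of the "never shrink" branch above (illustrative numbers only):
 * if __tcp_select_window() returns 8000 while cur_win is 10001 and
 * rcv_wscale is 2, we advertise ALIGN(10001, 4) == 10004 instead; after the
 * >> 2 scaling the peer sees 2501, i.e. a right edge at least as far out as
 * the one previously offered, so the window never appears to shrink.
 */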
280
281static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
282{
283	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
284	if (!(tp->ecn_flags & TCP_ECN_OK))
285		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
286}
287
288static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
289{
290	struct tcp_sock *tp = tcp_sk(sk);
291
292	tp->ecn_flags = 0;
293	if (sysctl_tcp_ecn) {
294		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
295		tp->ecn_flags = TCP_ECN_OK;
296	}
297}
298
299static __inline__ void
300TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
301{
302	if (inet_rsk(req)->ecn_ok)
303		th->ece = 1;
304}
305
306static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
307				int tcp_header_len)
308{
309	struct tcp_sock *tp = tcp_sk(sk);
310
311	if (tp->ecn_flags & TCP_ECN_OK) {
312		/* Not-retransmitted data segment: set ECT and inject CWR. */
313		if (skb->len != tcp_header_len &&
314		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
315			INET_ECN_xmit(sk);
316			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
317				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
318				tcp_hdr(skb)->cwr = 1;
319				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
320			}
321		} else {
322			/* ACK or retransmitted segment: clear ECT|CE */
323			INET_ECN_dontxmit(sk);
324		}
325		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
326			tcp_hdr(skb)->ece = 1;
327	}
328}
329
330/* Constructs common control bits of non-data skb. If SYN/FIN is present,
331 * auto increment end seqno.
332 */
333static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
334{
335	skb->csum = 0;
336
337	TCP_SKB_CB(skb)->flags = flags;
338	TCP_SKB_CB(skb)->sacked = 0;
339
340	skb_shinfo(skb)->gso_segs = 1;
341	skb_shinfo(skb)->gso_size = 0;
342	skb_shinfo(skb)->gso_type = 0;
343
344	TCP_SKB_CB(skb)->seq = seq;
345	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
346		seq++;
347	TCP_SKB_CB(skb)->end_seq = seq;
348}
349
350static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
351					 __u32 tstamp, __u8 **md5_hash)
352{
353	if (tp->rx_opt.tstamp_ok) {
354		*ptr++ = htonl((TCPOPT_NOP << 24) |
355			       (TCPOPT_NOP << 16) |
356			       (TCPOPT_TIMESTAMP << 8) |
357			       TCPOLEN_TIMESTAMP);
358		*ptr++ = htonl(tstamp);
359		*ptr++ = htonl(tp->rx_opt.ts_recent);
360	}
361	if (tp->rx_opt.eff_sacks) {
362		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
363		int this_sack;
364
365		*ptr++ = htonl((TCPOPT_NOP  << 24) |
366			       (TCPOPT_NOP  << 16) |
367			       (TCPOPT_SACK <<  8) |
368			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
369						     TCPOLEN_SACK_PERBLOCK)));
370
371		for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
372			*ptr++ = htonl(sp[this_sack].start_seq);
373			*ptr++ = htonl(sp[this_sack].end_seq);
374		}
375
376		if (tp->rx_opt.dsack) {
377			tp->rx_opt.dsack = 0;
378			tp->rx_opt.eff_sacks--;
379		}
380	}
381#ifdef CONFIG_TCP_MD5SIG
382	if (md5_hash) {
383		*ptr++ = htonl((TCPOPT_NOP << 24) |
384			       (TCPOPT_NOP << 16) |
385			       (TCPOPT_MD5SIG << 8) |
386			       TCPOLEN_MD5SIG);
387		*md5_hash = (__u8 *)ptr;
388	}
389#endif
390}
391
392/* Construct a tcp options header for a SYN or SYN_ACK packet.
393 * If this is ever changed, make sure to change the definition of
394 * MAX_SYN_SIZE to match the new maximum number of options that you
395 * can generate.
396 *
397 * Note that with the RFC2385 TCP option, we make room for the
398 * 16 byte MD5 hash. This will be filled in later, so the pointer for the
399 * location to be filled is passed back up.
400 */
401static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
402				  int offer_wscale, int wscale, __u32 tstamp,
403				  __u32 ts_recent, __u8 **md5_hash)
404{
405	/* We always get an MSS option.
406	 * The option bytes which will be seen in normal data
407	 * packets (should timestamps be used) must be counted in the
408	 * advertised MSS.  But we subtract them from tp->mss_cache so
409	 * that calculations in tcp_sendmsg are simpler etc.
410	 * So account for this fact here if necessary.  If we
411	 * don't do this correctly, as a receiver we won't
412	 * recognize data packets as being full sized when we
413	 * should, and thus we won't abide by the delayed ACK
414	 * rules correctly.
415	 * SACKs don't matter, we never delay an ACK when we
416	 * have any of those going out.
417	 */
418	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
419	if (ts) {
420		if (sack)
421			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
422				       (TCPOLEN_SACK_PERM << 16) |
423				       (TCPOPT_TIMESTAMP << 8) |
424				       TCPOLEN_TIMESTAMP);
425		else
426			*ptr++ = htonl((TCPOPT_NOP << 24) |
427				       (TCPOPT_NOP << 16) |
428				       (TCPOPT_TIMESTAMP << 8) |
429				       TCPOLEN_TIMESTAMP);
430		*ptr++ = htonl(tstamp);		/* TSVAL */
431		*ptr++ = htonl(ts_recent);	/* TSECR */
432	} else if (sack)
433		*ptr++ = htonl((TCPOPT_NOP << 24) |
434			       (TCPOPT_NOP << 16) |
435			       (TCPOPT_SACK_PERM << 8) |
436			       TCPOLEN_SACK_PERM);
437	if (offer_wscale)
438		*ptr++ = htonl((TCPOPT_NOP << 24) |
439			       (TCPOPT_WINDOW << 16) |
440			       (TCPOLEN_WINDOW << 8) |
441			       (wscale));
442#ifdef CONFIG_TCP_MD5SIG
443	/*
444	 * If MD5 is enabled, then we set the option, and include the size
445	 * (always 18). The actual MD5 hash is added just before the
446	 * packet is sent.
447	 */
448	if (md5_hash) {
449		*ptr++ = htonl((TCPOPT_NOP << 24) |
450			       (TCPOPT_NOP << 16) |
451			       (TCPOPT_MD5SIG << 8) |
452			       TCPOLEN_MD5SIG);
453		*md5_hash = (__u8 *)ptr;
454	}
455#endif
456}
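
/* Sizing sketch for the common case (assuming timestamps, SACK and window
 * scaling are all enabled): the SYN carries MSS (4 bytes), SACK-permitted
 * folded into the leading word of the timestamp option plus TSval/TSecr
 * (12 bytes aligned), and a NOP-padded window scale option (4 bytes) --
 * 20 option bytes, i.e. a 40 byte TCP header.  This matches the
 * TCPOLEN_*_ALIGNED accounting done for SYN frames in tcp_transmit_skb().
 */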
457
458/* This routine actually transmits TCP packets queued up by
459 * tcp_do_sendmsg().  This is used by both the initial
460 * transmission and possible later retransmissions.
461 * All SKB's seen here are completely headerless.  It is our
462 * job to build the TCP header, and pass the packet down to
463 * IP so it can do the same plus pass the packet off to the
464 * device.
465 *
466 * We are working here with either a clone of the original
467 * SKB, or a fresh unique copy made by the retransmit engine.
468 */
469static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
470			    gfp_t gfp_mask)
471{
472	const struct inet_connection_sock *icsk = inet_csk(sk);
473	struct inet_sock *inet;
474	struct tcp_sock *tp;
475	struct tcp_skb_cb *tcb;
476	int tcp_header_size;
477#ifdef CONFIG_TCP_MD5SIG
478	struct tcp_md5sig_key *md5;
479	__u8 *md5_hash_location;
480#endif
481	struct tcphdr *th;
482	int sysctl_flags;
483	int err;
484
485	BUG_ON(!skb || !tcp_skb_pcount(skb));
486
487	/* If congestion control is doing timestamping, we must
488	 * take such a timestamp before we potentially clone/copy.
489	 */
490	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
491		__net_timestamp(skb);
492
493	if (likely(clone_it)) {
494		if (unlikely(skb_cloned(skb)))
495			skb = pskb_copy(skb, gfp_mask);
496		else
497			skb = skb_clone(skb, gfp_mask);
498		if (unlikely(!skb))
499			return -ENOBUFS;
500	}
501
502	inet = inet_sk(sk);
503	tp = tcp_sk(sk);
504	tcb = TCP_SKB_CB(skb);
505	tcp_header_size = tp->tcp_header_len;
506
507#define SYSCTL_FLAG_TSTAMPS	0x1
508#define SYSCTL_FLAG_WSCALE	0x2
509#define SYSCTL_FLAG_SACK	0x4
510
511	sysctl_flags = 0;
512	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
513		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
514		if (sysctl_tcp_timestamps) {
515			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
516			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
517		}
518		if (sysctl_tcp_window_scaling) {
519			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
520			sysctl_flags |= SYSCTL_FLAG_WSCALE;
521		}
522		if (sysctl_tcp_sack) {
523			sysctl_flags |= SYSCTL_FLAG_SACK;
524			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
525				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
526		}
527	} else if (unlikely(tp->rx_opt.eff_sacks)) {
528		/* A SACK is 2 pad bytes, a 2 byte header, plus
529		 * 2 32-bit sequence numbers for each SACK block.
530		 */
531		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
532				    (tp->rx_opt.eff_sacks *
533				     TCPOLEN_SACK_PERBLOCK));
534	}
535
536	if (tcp_packets_in_flight(tp) == 0)
537		tcp_ca_event(sk, CA_EVENT_TX_START);
538
539#ifdef CONFIG_TCP_MD5SIG
540	/*
541	 * Are we doing MD5 on this segment? If so - make
542	 * room for it.
543	 */
544	md5 = tp->af_specific->md5_lookup(sk, sk);
545	if (md5)
546		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
547#endif
548
549	skb_push(skb, tcp_header_size);
550	skb_reset_transport_header(skb);
551	skb_set_owner_w(skb, sk);
552
553	/* Build TCP header and checksum it. */
554	th = tcp_hdr(skb);
555	th->source		= inet->sport;
556	th->dest		= inet->dport;
557	th->seq			= htonl(tcb->seq);
558	th->ack_seq		= htonl(tp->rcv_nxt);
559	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
560					tcb->flags);
561
562	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
563		/* RFC1323: The window in SYN & SYN/ACK segments
564		 * is never scaled.
565		 */
566		th->window	= htons(min(tp->rcv_wnd, 65535U));
567	} else {
568		th->window	= htons(tcp_select_window(sk));
569	}
570	th->check		= 0;
571	th->urg_ptr		= 0;
572
573	if (unlikely(tp->urg_mode &&
574		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
575		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
576		th->urg			= 1;
577	}
578
579	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
580		tcp_syn_build_options((__be32 *)(th + 1),
581				      tcp_advertise_mss(sk),
582				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
583				      (sysctl_flags & SYSCTL_FLAG_SACK),
584				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
585				      tp->rx_opt.rcv_wscale,
586				      tcb->when,
587				      tp->rx_opt.ts_recent,
588
589#ifdef CONFIG_TCP_MD5SIG
590				      md5 ? &md5_hash_location :
591#endif
592				      NULL);
593	} else {
594		tcp_build_and_update_options((__be32 *)(th + 1),
595					     tp, tcb->when,
596#ifdef CONFIG_TCP_MD5SIG
597					     md5 ? &md5_hash_location :
598#endif
599					     NULL);
600		TCP_ECN_send(sk, skb, tcp_header_size);
601	}
602
603#ifdef CONFIG_TCP_MD5SIG
604	/* Calculate the MD5 hash, as we have all we need now */
605	if (md5) {
606		tp->af_specific->calc_md5_hash(md5_hash_location,
607					       md5,
608					       sk, NULL, NULL,
609					       tcp_hdr(skb),
610					       sk->sk_protocol,
611					       skb->len);
612	}
613#endif
614
615	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
616
617	if (likely(tcb->flags & TCPCB_FLAG_ACK))
618		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
619
620	if (skb->len != tcp_header_size)
621		tcp_event_data_sent(tp, skb, sk);
622
623	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
624		TCP_INC_STATS(TCP_MIB_OUTSEGS);
625
626	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
627	if (likely(err <= 0))
628		return err;
629
630	tcp_enter_cwr(sk, 1);
631
632	return net_xmit_eval(err);
633
634#undef SYSCTL_FLAG_TSTAMPS
635#undef SYSCTL_FLAG_WSCALE
636#undef SYSCTL_FLAG_SACK
637}
638
639/* This routine just queues the buffer.
640 *
641 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
642 * otherwise the socket can stall.
643 */
644static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
645{
646	struct tcp_sock *tp = tcp_sk(sk);
647
648	/* Advance write_seq and place onto the write_queue. */
649	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
650	skb_header_release(skb);
651	tcp_add_write_queue_tail(sk, skb);
652	sk->sk_wmem_queued += skb->truesize;
653	sk_mem_charge(sk, skb->truesize);
654}
655
656static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
657				 unsigned int mss_now)
658{
659	if (skb->len <= mss_now || !sk_can_gso(sk)) {
660		/* Avoid the costly divide in the normal
661		 * non-TSO case.
662		 */
663		skb_shinfo(skb)->gso_segs = 1;
664		skb_shinfo(skb)->gso_size = 0;
665		skb_shinfo(skb)->gso_type = 0;
666	} else {
667		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
668		skb_shinfo(skb)->gso_size = mss_now;
669		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
670	}
671}
672
673/* When a modification to fackets_out becomes necessary, we need to check
674 * whether skb is counted in fackets_out or not.
675 */
676static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
677				   int decr)
678{
679	struct tcp_sock *tp = tcp_sk(sk);
680
681	if (!tp->sacked_out || tcp_is_reno(tp))
682		return;
683
684	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
685		tp->fackets_out -= decr;
686}
687
688/* Function to create two new TCP segments.  Shrinks the given segment
689 * to the specified size and appends a new segment with the rest of the
690 * packet to the list.  This won't be called frequently, I hope.
691 * Remember, these are still headerless SKBs at this point.
692 */
693int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
694		 unsigned int mss_now)
695{
696	struct tcp_sock *tp = tcp_sk(sk);
697	struct sk_buff *buff;
698	int nsize, old_factor;
699	int nlen;
700	u16 flags;
701
702	BUG_ON(len > skb->len);
703
704	tcp_clear_retrans_hints_partial(tp);
705	nsize = skb_headlen(skb) - len;
706	if (nsize < 0)
707		nsize = 0;
708
709	if (skb_cloned(skb) &&
710	    skb_is_nonlinear(skb) &&
711	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
712		return -ENOMEM;
713
714	/* Get a new skb... force flag on. */
715	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
716	if (buff == NULL)
717		return -ENOMEM; /* We'll just try again later. */
718
719	sk->sk_wmem_queued += buff->truesize;
720	sk_mem_charge(sk, buff->truesize);
721	nlen = skb->len - len - nsize;
722	buff->truesize += nlen;
723	skb->truesize -= nlen;
724
725	/* Correct the sequence numbers. */
726	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
727	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
728	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
729
730	/* PSH and FIN should only be set in the second packet. */
731	flags = TCP_SKB_CB(skb)->flags;
732	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
733	TCP_SKB_CB(buff)->flags = flags;
734	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
735
736	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
737		/* Copy and checksum data tail into the new buffer. */
738		buff->csum = csum_partial_copy_nocheck(skb->data + len,
739						       skb_put(buff, nsize),
740						       nsize, 0);
741
742		skb_trim(skb, len);
743
744		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
745	} else {
746		skb->ip_summed = CHECKSUM_PARTIAL;
747		skb_split(skb, buff, len);
748	}
749
750	buff->ip_summed = skb->ip_summed;
751
752	/* Looks stupid, but our code really uses the 'when' of
753	 * skbs which it has never sent before. --ANK
754	 */
755	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
756	buff->tstamp = skb->tstamp;
757
758	old_factor = tcp_skb_pcount(skb);
759
760	/* Fix up tso_factor for both original and new SKB.  */
761	tcp_set_skb_tso_segs(sk, skb, mss_now);
762	tcp_set_skb_tso_segs(sk, buff, mss_now);
763
764	/* If this packet has been sent out already, we must
765	 * adjust the various packet counters.
766	 */
767	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
768		int diff = old_factor - tcp_skb_pcount(skb) -
769			tcp_skb_pcount(buff);
770
771		tp->packets_out -= diff;
772
773		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
774			tp->sacked_out -= diff;
775		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
776			tp->retrans_out -= diff;
777
778		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
779			tp->lost_out -= diff;
780
781		/* Adjust Reno SACK estimate. */
782		if (tcp_is_reno(tp) && diff > 0) {
783			tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
784			tcp_verify_left_out(tp);
785		}
786		tcp_adjust_fackets_out(sk, skb, diff);
787	}
788
789	/* Link BUFF into the send queue. */
790	skb_header_release(buff);
791	tcp_insert_write_queue_after(skb, buff, sk);
792
793	return 0;
794}
795
796/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
797 * eventually). The difference is that the pulled data is not copied, but
798 * immediately discarded.
799 */
800static void __pskb_trim_head(struct sk_buff *skb, int len)
801{
802	int i, k, eat;
803
804	eat = len;
805	k = 0;
806	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
807		if (skb_shinfo(skb)->frags[i].size <= eat) {
808			put_page(skb_shinfo(skb)->frags[i].page);
809			eat -= skb_shinfo(skb)->frags[i].size;
810		} else {
811			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
812			if (eat) {
813				skb_shinfo(skb)->frags[k].page_offset += eat;
814				skb_shinfo(skb)->frags[k].size -= eat;
815				eat = 0;
816			}
817			k++;
818		}
819	}
820	skb_shinfo(skb)->nr_frags = k;
821
822	skb_reset_tail_pointer(skb);
823	skb->data_len -= len;
824	skb->len = skb->data_len;
825}
826
827int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
828{
829	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
830		return -ENOMEM;
831
832	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
833	if (unlikely(len < skb_headlen(skb)))
834		__skb_pull(skb, len);
835	else
836		__pskb_trim_head(skb, len - skb_headlen(skb));
837
838	TCP_SKB_CB(skb)->seq += len;
839	skb->ip_summed = CHECKSUM_PARTIAL;
840
841	skb->truesize	     -= len;
842	sk->sk_wmem_queued   -= len;
843	sk_mem_uncharge(sk, len);
844	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
845
846	/* Any change of skb->len requires recalculation of tso
847	 * factor and mss.
848	 */
849	if (tcp_skb_pcount(skb) > 1)
850		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
851
852	return 0;
853}
854
855/* Not accounting for SACKs here. */
856int tcp_mtu_to_mss(struct sock *sk, int pmtu)
857{
858	struct tcp_sock *tp = tcp_sk(sk);
859	struct inet_connection_sock *icsk = inet_csk(sk);
860	int mss_now;
861
862	/* Calculate base mss without TCP options:
863	   It is MMS_S - sizeof(tcphdr) of rfc1122
864	 */
865	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
866
867	/* Clamp it (mss_clamp does not include tcp options) */
868	if (mss_now > tp->rx_opt.mss_clamp)
869		mss_now = tp->rx_opt.mss_clamp;
870
871	/* Now subtract optional transport overhead */
872	mss_now -= icsk->icsk_ext_hdr_len;
873
874	/* Then reserve room for full set of TCP options and 8 bytes of data */
875	if (mss_now < 48)
876		mss_now = 48;
877
878	/* Now subtract TCP options size, not including SACKs */
879	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
880
881	return mss_now;
882}
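
/* Worked example (typical IPv4 values, for illustration only): with a path
 * MTU of 1500 and no extension headers, 1500 - 20 (network header) - 20
 * (struct tcphdr) gives 1460; after clamping and subtracting
 * icsk_ext_hdr_len (0 here), the final step removes the non-SACK option
 * bytes, e.g. 12 when timestamps are in use (tcp_header_len == 32), so
 * mss_now ends up as 1448.
 */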
883
884/* Inverse of above */
885int tcp_mss_to_mtu(struct sock *sk, int mss)
886{
887	struct tcp_sock *tp = tcp_sk(sk);
888	struct inet_connection_sock *icsk = inet_csk(sk);
889	int mtu;
890
891	mtu = mss +
892	      tp->tcp_header_len +
893	      icsk->icsk_ext_hdr_len +
894	      icsk->icsk_af_ops->net_header_len;
895
896	return mtu;
897}
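
/* Note (sketch, not from the original source): absent clamping by
 * rx_opt.mss_clamp or the 48 byte floor, tcp_mss_to_mtu() is the exact
 * inverse of tcp_mtu_to_mss(), since it adds back tcp_header_len,
 * icsk_ext_hdr_len and the network header.  In the IPv4-with-timestamps
 * example above, an mss of 1448 maps back to 1448 + 32 + 0 + 20 == 1500.
 */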
898
899void tcp_mtup_init(struct sock *sk)
900{
901	struct tcp_sock *tp = tcp_sk(sk);
902	struct inet_connection_sock *icsk = inet_csk(sk);
903
904	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
905	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
906			       icsk->icsk_af_ops->net_header_len;
907	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
908	icsk->icsk_mtup.probe_size = 0;
909}
910
911/* Bound MSS / TSO packet size with half of the window */
912static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
913{
914	if (tp->max_window && pktsize > (tp->max_window >> 1))
915		return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
916	else
917		return pktsize;
918}
919
920/* This function synchronizes the send mss with the current pmtu/exthdr set.
921
922   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
923   account for TCP options; it assumes only the bare TCP header.
924
925   tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
926   It is the minimum of user_mss and the mss received with the SYN.
927   It also does not include TCP options.
928
929   inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
930
931   tp->mss_cache is the current effective sending mss, including
932   all tcp options except for SACKs. It is evaluated
933   taking into account the current pmtu, but never exceeds
934   tp->rx_opt.mss_clamp.
935
936   NOTE1. rfc1122 clearly states that advertised MSS
937   DOES NOT include either tcp or ip options.
938
939   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
940   are READ ONLY outside this function.		--ANK (980731)
941 */
942unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
943{
944	struct tcp_sock *tp = tcp_sk(sk);
945	struct inet_connection_sock *icsk = inet_csk(sk);
946	int mss_now;
947
948	if (icsk->icsk_mtup.search_high > pmtu)
949		icsk->icsk_mtup.search_high = pmtu;
950
951	mss_now = tcp_mtu_to_mss(sk, pmtu);
952	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
953
954	/* And store cached results */
955	icsk->icsk_pmtu_cookie = pmtu;
956	if (icsk->icsk_mtup.enabled)
957		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
958	tp->mss_cache = mss_now;
959
960	return mss_now;
961}
962
963/* Compute the current effective MSS, taking SACKs and IP options,
964 * and even PMTU discovery events into account.
965 *
966 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
967 * cannot be large. However, taking into account rare use of URG, this
968 * is not a big flaw.
969 */
970unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
971{
972	struct tcp_sock *tp = tcp_sk(sk);
973	struct dst_entry *dst = __sk_dst_get(sk);
974	u32 mss_now;
975	u16 xmit_size_goal;
976	int doing_tso = 0;
977
978	mss_now = tp->mss_cache;
979
980	if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
981		doing_tso = 1;
982
983	if (dst) {
984		u32 mtu = dst_mtu(dst);
985		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
986			mss_now = tcp_sync_mss(sk, mtu);
987	}
988
989	if (tp->rx_opt.eff_sacks)
990		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
991			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
992
993#ifdef CONFIG_TCP_MD5SIG
994	if (tp->af_specific->md5_lookup(sk, sk))
995		mss_now -= TCPOLEN_MD5SIG_ALIGNED;
996#endif
997
998	xmit_size_goal = mss_now;
999
1000	if (doing_tso) {
1001		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
1002				  inet_csk(sk)->icsk_af_ops->net_header_len -
1003				  inet_csk(sk)->icsk_ext_hdr_len -
1004				  tp->tcp_header_len);
1005
1006		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
1007		xmit_size_goal -= (xmit_size_goal % mss_now);
1008	}
1009	tp->xmit_size_goal = xmit_size_goal;
1010
1011	return mss_now;
1012}
1013
1014/* Congestion window validation. (RFC2861) */
1015static void tcp_cwnd_validate(struct sock *sk)
1016{
1017	struct tcp_sock *tp = tcp_sk(sk);
1018
1019	if (tp->packets_out >= tp->snd_cwnd) {
1020		/* Network is fed fully. */
1021		tp->snd_cwnd_used = 0;
1022		tp->snd_cwnd_stamp = tcp_time_stamp;
1023	} else {
1024		/* Network starves. */
1025		if (tp->packets_out > tp->snd_cwnd_used)
1026			tp->snd_cwnd_used = tp->packets_out;
1027
1028		if (sysctl_tcp_slow_start_after_idle &&
1029		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1030			tcp_cwnd_application_limited(sk);
1031	}
1032}
1033
1034/* Returns the portion of skb which can be sent right away without
1035 * introducing MSS oddities to segment boundaries. In rare cases where
1036 * mss_now != mss_cache, we will request caller to create a small skb
1037 * per input skb which could be mostly avoided here (if desired).
1038 *
1039 * We explicitly want to create a request for splitting write queue tail
1040 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
1041 * thus all the complexity (cwnd_len is always MSS multiple which we
1042 * return whenever allowed by the other factors). Basically we need the
1043 * modulo only when the receiver window alone is the limiting factor or
1044 * when we would be allowed to send the split-due-to-Nagle skb fully.
1045 */
1046static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
1047					unsigned int mss_now, unsigned int cwnd)
1048{
1049	struct tcp_sock *tp = tcp_sk(sk);
1050	u32 needed, window, cwnd_len;
1051
1052	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1053	cwnd_len = mss_now * cwnd;
1054
1055	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
1056		return cwnd_len;
1057
1058	needed = min(skb->len, window);
1059
1060	if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)
1061		return cwnd_len;
1062
1063	return needed - needed % mss_now;
1064}
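
/* Worked example (illustrative numbers only): with mss_now == 1000 and a
 * cwnd quota of 10 segments, cwnd_len is 10000.  If the receive window only
 * permits 2500 bytes past this skb's sequence, the skb itself is at least
 * that long, and it is not the tail of the write queue, the function
 * returns 2500 - (2500 % 1000) == 2000, so the split keeps segment
 * boundaries aligned to mss_now.
 */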
1065
1066/* Can at least one segment of SKB be sent right now, according to the
1067 * congestion window rules?  If so, return how many segments are allowed.
1068 */
1069static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1070					 struct sk_buff *skb)
1071{
1072	u32 in_flight, cwnd;
1073
1074	/* Don't be strict about the congestion window for the final FIN.  */
1075	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1076	    tcp_skb_pcount(skb) == 1)
1077		return 1;
1078
1079	in_flight = tcp_packets_in_flight(tp);
1080	cwnd = tp->snd_cwnd;
1081	if (in_flight < cwnd)
1082		return (cwnd - in_flight);
1083
1084	return 0;
1085}
1086
1087/* This must be invoked the first time we consider transmitting
1088 * SKB onto the wire.
1089 */
1090static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1091			     unsigned int mss_now)
1092{
1093	int tso_segs = tcp_skb_pcount(skb);
1094
1095	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1096		tcp_set_skb_tso_segs(sk, skb, mss_now);
1097		tso_segs = tcp_skb_pcount(skb);
1098	}
1099	return tso_segs;
1100}
1101
1102static inline int tcp_minshall_check(const struct tcp_sock *tp)
1103{
1104	return after(tp->snd_sml,tp->snd_una) &&
1105		!after(tp->snd_sml, tp->snd_nxt);
1106}
1107
1108/* Return 0, if the packet can be sent now without violating Nagle's rules:
1109 * 1. It is full sized.
1110 * 2. Or it contains FIN. (already checked by caller)
1111 * 3. Or TCP_NODELAY was set.
1112 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1113 *    With Minshall's modification: all sent small packets are ACKed.
1114 */
1115static inline int tcp_nagle_check(const struct tcp_sock *tp,
1116				  const struct sk_buff *skb,
1117				  unsigned mss_now, int nonagle)
1118{
1119	return (skb->len < mss_now &&
1120		((nonagle & TCP_NAGLE_CORK) ||
1121		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
1122}
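
/* Example of the Minshall variant above (illustrative scenario): with Nagle
 * enabled (nonagle == 0), a sub-mss write is held back while an earlier
 * sub-mss segment is still unacknowledged (packets_out != 0 and
 * tcp_minshall_check() is true); once every small segment has been ACKed,
 * the same write goes out immediately without waiting for a full mss.
 */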
1123
1124/* Return non-zero if the Nagle test allows this packet to be
1125 * sent now.
1126 */
1127static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
1128				 unsigned int cur_mss, int nonagle)
1129{
1130	/* The Nagle rule does not apply to frames which sit in the middle of the
1131	 * write_queue (they have no chance to get new data).
1132	 *
1133	 * This is implemented in the callers, where they modify the 'nonagle'
1134	 * argument based upon the location of SKB in the send queue.
1135	 */
1136	if (nonagle & TCP_NAGLE_PUSH)
1137		return 1;
1138
1139	/* Don't use the nagle rule for urgent data (or for the final FIN).
1140	 * Nagle can be ignored during F-RTO too (see RFC4138).
1141	 */
1142	if (tp->urg_mode || (tp->frto_counter == 2) ||
1143	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
1144		return 1;
1145
1146	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1147		return 1;
1148
1149	return 0;
1150}
1151
1152/* Does at least the first segment of SKB fit into the send window? */
1153static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
1154				   unsigned int cur_mss)
1155{
1156	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1157
1158	if (skb->len > cur_mss)
1159		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1160
1161	return !after(end_seq, tcp_wnd_end(tp));
1162}
1163
1164/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1165 * should be put on the wire right now.  If so, it returns the number of
1166 * packets allowed by the congestion window.
1167 */
1168static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
1169				 unsigned int cur_mss, int nonagle)
1170{
1171	struct tcp_sock *tp = tcp_sk(sk);
1172	unsigned int cwnd_quota;
1173
1174	tcp_init_tso_segs(sk, skb, cur_mss);
1175
1176	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
1177		return 0;
1178
1179	cwnd_quota = tcp_cwnd_test(tp, skb);
1180	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
1181		cwnd_quota = 0;
1182
1183	return cwnd_quota;
1184}
1185
1186int tcp_may_send_now(struct sock *sk)
1187{
1188	struct tcp_sock *tp = tcp_sk(sk);
1189	struct sk_buff *skb = tcp_send_head(sk);
1190
1191	return (skb &&
1192		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
1193			     (tcp_skb_is_last(sk, skb) ?
1194			      tp->nonagle : TCP_NAGLE_PUSH)));
1195}
1196
1197/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1198 * which is put after SKB on the list.  It is very much like
1199 * tcp_fragment() except that it may make several kinds of assumptions
1200 * in order to speed up the splitting operation.  In particular, we
1201 * know that all the data is in scatter-gather pages, and that the
1202 * packet has never been sent out before (and thus is not cloned).
1203 */
1204static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1205			unsigned int mss_now)
1206{
1207	struct sk_buff *buff;
1208	int nlen = skb->len - len;
1209	u16 flags;
1210
1211	/* All of a TSO frame must be composed of paged data.  */
1212	if (skb->len != skb->data_len)
1213		return tcp_fragment(sk, skb, len, mss_now);
1214
1215	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
1216	if (unlikely(buff == NULL))
1217		return -ENOMEM;
1218
1219	sk->sk_wmem_queued += buff->truesize;
1220	sk_mem_charge(sk, buff->truesize);
1221	buff->truesize += nlen;
1222	skb->truesize -= nlen;
1223
1224	/* Correct the sequence numbers. */
1225	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1226	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1227	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1228
1229	/* PSH and FIN should only be set in the second packet. */
1230	flags = TCP_SKB_CB(skb)->flags;
1231	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
1232	TCP_SKB_CB(buff)->flags = flags;
1233
1234	/* This packet was never sent out yet, so no SACK bits. */
1235	TCP_SKB_CB(buff)->sacked = 0;
1236
1237	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
1238	skb_split(skb, buff, len);
1239
1240	/* Fix up tso_factor for both original and new SKB.  */
1241	tcp_set_skb_tso_segs(sk, skb, mss_now);
1242	tcp_set_skb_tso_segs(sk, buff, mss_now);
1243
1244	/* Link BUFF into the send queue. */
1245	skb_header_release(buff);
1246	tcp_insert_write_queue_after(skb, buff, sk);
1247
1248	return 0;
1249}
1250
1251/* Try to defer sending, if possible, in order to minimize the amount
1252 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
1253 *
1254 * This algorithm is from John Heffner.
1255 */
1256static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1257{
1258	struct tcp_sock *tp = tcp_sk(sk);
1259	const struct inet_connection_sock *icsk = inet_csk(sk);
1260	u32 send_win, cong_win, limit, in_flight;
1261
1262	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
1263		goto send_now;
1264
1265	if (icsk->icsk_ca_state != TCP_CA_Open)
1266		goto send_now;
1267
1268	/* Defer for less than two clock ticks. */
1269	if (tp->tso_deferred &&
1270	    ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1271		goto send_now;
1272
1273	in_flight = tcp_packets_in_flight(tp);
1274
1275	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1276
1277	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1278
1279	/* From in_flight test above, we know that cwnd > in_flight.  */
1280	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1281
1282	limit = min(send_win, cong_win);
1283
1284	/* If a full-sized TSO skb can be sent, do it. */
1285	if (limit >= sk->sk_gso_max_size)
1286		goto send_now;
1287
1288	if (sysctl_tcp_tso_win_divisor) {
1289		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1290
1291		/* If at least some fraction of a window is available,
1292		 * just use it.
1293		 */
1294		chunk /= sysctl_tcp_tso_win_divisor;
1295		if (limit >= chunk)
1296			goto send_now;
1297	} else {
1298		/* Different approach, try not to defer past a single
1299		 * ACK.  Receiver should ACK every other full sized
1300		 * frame, so if we have space for more than 3 frames
1301		 * then send now.
1302		 */
1303		if (limit > tcp_max_burst(tp) * tp->mss_cache)
1304			goto send_now;
1305	}
1306
1307	/* Ok, it looks like it is advisable to defer.  */
1308	tp->tso_deferred = 1 | (jiffies << 1);
1309
1310	return 1;
1311
1312send_now:
1313	tp->tso_deferred = 0;
1314	return 0;
1315}
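
/* Note on the tso_deferred encoding above (restating the code, not adding
 * behaviour): the timestamp is stored as (jiffies << 1) | 1 so the field is
 * non-zero even when jiffies happens to be 0, and the check at the top
 * recovers both values shifted right by one, giving up on deferral once
 * more than one tick has passed since the decision to defer.
 */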
1316
1317/* Create a new MTU probe if we are ready.
1318 * Returns 0 if we should wait to probe (no cwnd available),
1319 *         1 if a probe was sent,
1320 *         -1 otherwise
1321 */
1322static int tcp_mtu_probe(struct sock *sk)
1323{
1324	struct tcp_sock *tp = tcp_sk(sk);
1325	struct inet_connection_sock *icsk = inet_csk(sk);
1326	struct sk_buff *skb, *nskb, *next;
1327	int len;
1328	int probe_size;
1329	int size_needed;
1330	int copy;
1331	int mss_now;
1332
1333	/* Not currently probing/verifying,
1334	 * not in recovery,
1335	 * have enough cwnd, and
1336	 * not SACKing (the variable headers throw things off) */
1337	if (!icsk->icsk_mtup.enabled ||
1338	    icsk->icsk_mtup.probe_size ||
1339	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1340	    tp->snd_cwnd < 11 ||
1341	    tp->rx_opt.eff_sacks)
1342		return -1;
1343
1344	/* Very simple search strategy: just double the MSS. */
1345	mss_now = tcp_current_mss(sk, 0);
1346	probe_size = 2 * tp->mss_cache;
1347	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1348	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1349		/* TODO: set timer for probe_converge_event */
1350		return -1;
1351	}
1352
1353	/* Have enough data in the send queue to probe? */
1354	if (tp->write_seq - tp->snd_nxt < size_needed)
1355		return -1;
1356
1357	if (tp->snd_wnd < size_needed)
1358		return -1;
1359	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1360		return 0;
1361
1362	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
1363	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1364		if (!tcp_packets_in_flight(tp))
1365			return -1;
1366		else
1367			return 0;
1368	}
1369
1370	/* We're allowed to probe.  Build it now. */
1371	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1372		return -1;
1373	sk->sk_wmem_queued += nskb->truesize;
1374	sk_mem_charge(sk, nskb->truesize);
1375
1376	skb = tcp_send_head(sk);
1377
1378	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1379	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1380	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
1381	TCP_SKB_CB(nskb)->sacked = 0;
1382	nskb->csum = 0;
1383	nskb->ip_summed = skb->ip_summed;
1384
1385	tcp_insert_write_queue_before(nskb, skb, sk);
1386
1387	len = 0;
1388	tcp_for_write_queue_from_safe(skb, next, sk) {
1389		copy = min_t(int, skb->len, probe_size - len);
1390		if (nskb->ip_summed)
1391			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1392		else
1393			nskb->csum = skb_copy_and_csum_bits(skb, 0,
1394							    skb_put(nskb, copy),
1395							    copy, nskb->csum);
1396
1397		if (skb->len <= copy) {
1398			/* We've eaten all the data from this skb.
1399			 * Throw it away. */
1400			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1401			tcp_unlink_write_queue(skb, sk);
1402			sk_wmem_free_skb(sk, skb);
1403		} else {
1404			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1405						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
1406			if (!skb_shinfo(skb)->nr_frags) {
1407				skb_pull(skb, copy);
1408				if (skb->ip_summed != CHECKSUM_PARTIAL)
1409					skb->csum = csum_partial(skb->data,
1410								 skb->len, 0);
1411			} else {
1412				__pskb_trim_head(skb, copy);
1413				tcp_set_skb_tso_segs(sk, skb, mss_now);
1414			}
1415			TCP_SKB_CB(skb)->seq += copy;
1416		}
1417
1418		len += copy;
1419
1420		if (len >= probe_size)
1421			break;
1422	}
1423	tcp_init_tso_segs(sk, nskb, nskb->len);
1424
1425	/* We're ready to send.  If this fails, the probe will
1426	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1427	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1428	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1429		/* Decrement cwnd here because we are sending
1430		 * effectively two packets. */
1431		tp->snd_cwnd--;
1432		tcp_event_new_data_sent(sk, nskb);
1433
1434		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1435		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1436		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1437
1438		return 1;
1439	}
1440
1441	return -1;
1442}
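
/* Worked example (illustrative numbers only): with tp->mss_cache == 1400
 * and tp->reordering == 3, the probe tries probe_size == 2800 bytes and
 * requires size_needed == 2800 + 4 * 1400 == 8400 bytes of both queued data
 * and send window before it will build the probe, presumably so that enough
 * segments follow the probe for its loss to be detected quickly.
 */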
1443
1444/* This routine writes packets to the network.  It advances the
1445 * send_head.  This happens as incoming acks open up the remote
1446 * window for us.
1447 *
1448 * Returns 1, if no segments are in flight and we have queued segments, but
1449 * cannot send anything now because of SWS or another problem.
1450 */
1451static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1452{
1453	struct tcp_sock *tp = tcp_sk(sk);
1454	struct sk_buff *skb;
1455	unsigned int tso_segs, sent_pkts;
1456	int cwnd_quota;
1457	int result;
1458
1459	/* If we are closed, the bytes will have to remain here.
1460	 * In time closedown will finish, we empty the write queue and all
1461	 * will be happy.
1462	 */
1463	if (unlikely(sk->sk_state == TCP_CLOSE))
1464		return 0;
1465
1466	sent_pkts = 0;
1467
1468	/* Do MTU probing. */
1469	if ((result = tcp_mtu_probe(sk)) == 0) {
1470		return 0;
1471	} else if (result > 0) {
1472		sent_pkts = 1;
1473	}
1474
1475	while ((skb = tcp_send_head(sk))) {
1476		unsigned int limit;
1477
1478		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1479		BUG_ON(!tso_segs);
1480
1481		cwnd_quota = tcp_cwnd_test(tp, skb);
1482		if (!cwnd_quota)
1483			break;
1484
1485		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1486			break;
1487
1488		if (tso_segs == 1) {
1489			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1490						     (tcp_skb_is_last(sk, skb) ?
1491						      nonagle : TCP_NAGLE_PUSH))))
1492				break;
1493		} else {
1494			if (tcp_tso_should_defer(sk, skb))
1495				break;
1496		}
1497
1498		limit = mss_now;
1499		if (tso_segs > 1)
1500			limit = tcp_mss_split_point(sk, skb, mss_now,
1501						    cwnd_quota);
1502
1503		if (skb->len > limit &&
1504		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
1505			break;
1506
1507		TCP_SKB_CB(skb)->when = tcp_time_stamp;
1508
1509		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
1510			break;
1511
1512		/* Advance the send_head.  This one is sent out.
1513		 * This call will increment packets_out.
1514		 */
1515		tcp_event_new_data_sent(sk, skb);
1516
1517		tcp_minshall_update(tp, mss_now, skb);
1518		sent_pkts++;
1519	}
1520
1521	if (likely(sent_pkts)) {
1522		tcp_cwnd_validate(sk);
1523		return 0;
1524	}
1525	return !tp->packets_out && tcp_send_head(sk);
1526}
1527
1528/* Push out any pending frames which were held back due to
1529 * TCP_CORK or an attempt at coalescing tiny packets.
1530 * The socket must be locked by the caller.
1531 */
1532void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1533			       int nonagle)
1534{
1535	struct sk_buff *skb = tcp_send_head(sk);
1536
1537	if (skb) {
1538		if (tcp_write_xmit(sk, cur_mss, nonagle))
1539			tcp_check_probe_timer(sk);
1540	}
1541}
1542
1543/* Send the _single_ skb sitting at the send head. This function requires
1544 * a true push of pending frames to set up the probe timer etc.
1545 */
1546void tcp_push_one(struct sock *sk, unsigned int mss_now)
1547{
1548	struct sk_buff *skb = tcp_send_head(sk);
1549	unsigned int tso_segs, cwnd_quota;
1550
1551	BUG_ON(!skb || skb->len < mss_now);
1552
1553	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1554	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1555
1556	if (likely(cwnd_quota)) {
1557		unsigned int limit;
1558
1559		BUG_ON(!tso_segs);
1560
1561		limit = mss_now;
1562		if (tso_segs > 1)
1563			limit = tcp_mss_split_point(sk, skb, mss_now,
1564						    cwnd_quota);
1565
1566		if (skb->len > limit &&
1567		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
1568			return;
1569
1570		/* Send it out now. */
1571		TCP_SKB_CB(skb)->when = tcp_time_stamp;
1572
1573		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
1574			tcp_event_new_data_sent(sk, skb);
1575			tcp_cwnd_validate(sk);
1576			return;
1577		}
1578	}
1579}
1580
1581/* This function returns the amount that we can raise the
1582 * usable window based on the following constraints
1583 *
1584 * 1. The window can never be shrunk once it is offered (RFC 793)
1585 * 2. We limit memory per socket
1586 *
1587 * RFC 1122:
1588 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
1589 *  RECV.NEXT + RCV.WIN fixed until:
1590 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
1591 *
1592 * i.e. don't raise the right edge of the window until you can raise
1593 * it at least MSS bytes.
1594 *
1595 * Unfortunately, the recommended algorithm breaks header prediction,
1596 * since header prediction assumes th->window stays fixed.
1597 *
1598 * Strictly speaking, keeping th->window fixed violates the receiver
1599 * side SWS prevention criteria. The problem is that under this rule
1600 * a stream of single byte packets will cause the right side of the
1601 * window to always advance by a single byte.
1602 *
1603 * Of course, if the sender implements sender side SWS prevention
1604 * then this will not be a problem.
1605 *
1606 * BSD seems to make the following compromise:
1607 *
1608 *	If the free space is less than the 1/4 of the maximum
1609 *	space available and the free space is less than 1/2 mss,
1610 *	then set the window to 0.
1611 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
1612 *	Otherwise, just prevent the window from shrinking
1613 *	and from being larger than the largest representable value.
1614 *
1615 * This prevents incremental opening of the window in the regime
1616 * where TCP is limited by the speed of the reader side taking
1617 * data out of the TCP receive queue. It does nothing about
1618 * those cases where the window is constrained on the sender side
1619 * because the pipeline is full.
1620 *
1621 * BSD also seems to "accidentally" limit itself to windows that are a
1622 * multiple of MSS, at least until the free space gets quite small.
1623 * This would appear to be a side effect of the mbuf implementation.
1624 * Combining these two algorithms results in the observed behavior
1625 * of having a fixed window size at almost all times.
1626 *
1627 * Below we obtain similar behavior by forcing the offered window to
1628 * a multiple of the mss when it is feasible to do so.
1629 *
1630 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
1631 * Regular options like TIMESTAMP are taken into account.
1632 */
1633u32 __tcp_select_window(struct sock *sk)
1634{
1635	struct inet_connection_sock *icsk = inet_csk(sk);
1636	struct tcp_sock *tp = tcp_sk(sk);
1637	/* MSS for the peer's data.  Previous versions used mss_clamp
1638	 * here.  I don't know if the value based on our guesses
1639	 * of the peer's MSS is better for performance.  It's more correct
1640	 * but may be worse for performance because of rcv_mss
1641	 * fluctuations.  --SAW  1998/11/1
1642	 */
1643	int mss = icsk->icsk_ack.rcv_mss;
1644	int free_space = tcp_space(sk);
1645	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
1646	int window;
1647
1648	if (mss > full_space)
1649		mss = full_space;
1650
1651	if (free_space < (full_space >> 1)) {
1652		icsk->icsk_ack.quick = 0;
1653
1654		if (tcp_memory_pressure)
1655			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
1656					       4U * tp->advmss);
1657
1658		if (free_space < mss)
1659			return 0;
1660	}
1661
1662	if (free_space > tp->rcv_ssthresh)
1663		free_space = tp->rcv_ssthresh;
1664
1665	/* Don't do rounding if we are using window scaling, since the
1666	 * scaled window will not line up with the MSS boundary anyway.
1667	 */
1668	window = tp->rcv_wnd;
1669	if (tp->rx_opt.rcv_wscale) {
1670		window = free_space;
1671
1672		/* Advertise enough space so that it won't get scaled away.
1673		 * Important case: prevent zero window announcement if
1674		 * 1<<rcv_wscale > mss.
1675		 */
1676		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
1677			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
1678				  << tp->rx_opt.rcv_wscale);
1679	} else {
1680		/* Get the largest window that is a nice multiple of mss.
1681		 * Window clamp already applied above.
1682		 * If our current window offering is within 1 mss of the
1683		 * free space we just keep it. This prevents the divide
1684		 * and multiply from happening most of the time.
1685		 * We also don't do any window rounding when the free space
1686		 * is too small.
1687		 */
1688		if (window <= free_space - mss || window > free_space)
1689			window = (free_space / mss) * mss;
1690		else if (mss == full_space &&
1691			 free_space > window + (full_space >> 1))
1692			window = free_space;
1693	}
1694
1695	return window;
1696}
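
/* Worked example of the no-wscale rounding above (illustrative numbers
 * only): with rcv_wscale == 0, free_space == 10000 and mss == 1460, the
 * current window is kept if it already lies within one mss below
 * free_space; otherwise the offer becomes (10000 / 1460) * 1460 == 8760,
 * i.e. six full segments, matching the BSD-style behaviour described in the
 * comment above.
 */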
1697
1698/* Attempt to collapse two adjacent SKB's during retransmission. */
1699static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1700				     int mss_now)
1701{
1702	struct tcp_sock *tp = tcp_sk(sk);
1703	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1704	int skb_size, next_skb_size;
1705	u16 flags;
1706
1707	/* The first test we must make is that neither of these two
1708	 * SKB's are still referenced by someone else.
1709	 */
1710	if (skb_cloned(skb) || skb_cloned(next_skb))
1711		return;
1712
1713	skb_size = skb->len;
1714	next_skb_size = next_skb->len;
1715	flags = TCP_SKB_CB(skb)->flags;
1716
1717	/* Also punt if next skb has been SACK'd. */
1718	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1719		return;
1720
1721	/* Next skb is out of window. */
1722	if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
1723		return;
1724
1725	/* Punt if not enough space exists in the first SKB for
1726	 * the data in the second, or the total combined payload
1727	 * would exceed the MSS.
1728	 */
1729	if ((next_skb_size > skb_tailroom(skb)) ||
1730	    ((skb_size + next_skb_size) > mss_now))
1731		return;
1732
1733	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1734
1735	tcp_highest_sack_combine(sk, next_skb, skb);
1736
1737	/* Ok.	We will be able to collapse the packet. */
1738	tcp_unlink_write_queue(next_skb, sk);
1739
1740	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
1741				  next_skb_size);
1742
1743	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
1744		skb->ip_summed = CHECKSUM_PARTIAL;
1745
1746	if (skb->ip_summed != CHECKSUM_PARTIAL)
1747		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
1748
1749	/* Update sequence range on original skb. */
1750	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
1751
1752	/* Merge over control information. */
1753	flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
1754	TCP_SKB_CB(skb)->flags = flags;
1755
1756	/* All done, get rid of second SKB and account for it so
1757	 * packet counting does not break.
1758	 */
1759	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
1760	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
1761		tp->retrans_out -= tcp_skb_pcount(next_skb);
1762	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST)
1763		tp->lost_out -= tcp_skb_pcount(next_skb);
1764	/* Reno case is special. Sigh... */
1765	if (tcp_is_reno(tp) && tp->sacked_out)
1766		tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
1767
1768	tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb));
1769	tp->packets_out -= tcp_skb_pcount(next_skb);
1770
1771	/* changed transmit queue under us so clear hints */
1772	tcp_clear_retrans_hints_partial(tp);
1773
1774	sk_wmem_free_skb(sk, next_skb);
1775}
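
/*
 * Illustrative sketch, not from the original source: the size test that
 * gates tcp_retrans_try_collapse() above, expressed on plain integers
 * with hypothetical names.  The second segment must fit into the first
 * one's tailroom and the merged payload must still fit into one MSS.
 */
static int example_can_collapse(int skb_len, int next_skb_len,
				int tailroom, int mss_now)
{
	return next_skb_len <= tailroom &&
	       skb_len + next_skb_len <= mss_now;
}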
1776
1777/* Do a simple retransmit without using the backoff mechanisms in
1778 * tcp_timer. This is used for path mtu discovery.
1779 * The socket is already locked here.
1780 */
1781void tcp_simple_retransmit(struct sock *sk)
1782{
1783	const struct inet_connection_sock *icsk = inet_csk(sk);
1784	struct tcp_sock *tp = tcp_sk(sk);
1785	struct sk_buff *skb;
1786	unsigned int mss = tcp_current_mss(sk, 0);
1787	int lost = 0;
1788
1789	tcp_for_write_queue(skb, sk) {
1790		if (skb == tcp_send_head(sk))
1791			break;
1792		if (skb->len > mss &&
1793		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1794			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1795				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1796				tp->retrans_out -= tcp_skb_pcount(skb);
1797			}
1798			if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
1799				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1800				tp->lost_out += tcp_skb_pcount(skb);
1801				lost = 1;
1802			}
1803		}
1804	}
1805
1806	tcp_clear_all_retrans_hints(tp);
1807
1808	if (!lost)
1809		return;
1810
1811	tcp_verify_left_out(tp);
1812
1813	/* Don't muck with the congestion window here.
1814	 * The reason is that we do not increase the amount of _data_
1815	 * in the network, but the units have changed and the effective
1816	 * cwnd/ssthresh are really reduced now.
1817	 */
1818	if (icsk->icsk_ca_state != TCP_CA_Loss) {
1819		tp->high_seq = tp->snd_nxt;
1820		tp->snd_ssthresh = tcp_current_ssthresh(sk);
1821		tp->prior_ssthresh = 0;
1822		tp->undo_marker = 0;
1823		tcp_set_ca_state(sk, TCP_CA_Loss);
1824	}
1825	tcp_xmit_retransmit_queue(sk);
1826}
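
/*
 * Illustrative sketch, not part of the original file: the marking rule
 * applied by tcp_simple_retransmit() above.  After an MTU decrease,
 * every not-yet-SACKed segment longer than the new MSS is treated as
 * lost, to be re-fragmented and retransmitted.  Names are hypothetical
 * and the SACK bookkeeping is left out.
 */
static int example_count_lost_after_mtu_drop(const int *seg_len, int nsegs,
					     int new_mss)
{
	int i, lost = 0;

	for (i = 0; i < nsegs; i++)
		if (seg_len[i] > new_mss)
			lost++;
	return lost;
}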
1827
1828/* This retransmits one SKB.  Policy decisions and retransmit queue
1829 * state updates are done by the caller.  Returns non-zero if an
1830 * error occurred which prevented the send.
1831 */
1832int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1833{
1834	struct tcp_sock *tp = tcp_sk(sk);
1835	struct inet_connection_sock *icsk = inet_csk(sk);
1836	unsigned int cur_mss = tcp_current_mss(sk, 0);
1837	int err;
1838
1839	/* Inconclusive MTU probe */
1840	if (icsk->icsk_mtup.probe_size) {
1841		icsk->icsk_mtup.probe_size = 0;
1842	}
1843
1844	/* Do not send more than we have queued. 1/4 is reserved for possible
1845	 * copying overhead: fragmentation, tunneling, mangling etc.
1846	 */
1847	if (atomic_read(&sk->sk_wmem_alloc) >
1848	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1849		return -EAGAIN;
1850
1851	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1852		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1853			BUG();
1854		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1855			return -ENOMEM;
1856	}
1857
1858	/* If the receiver has shrunk its window and this skb is out of
1859	 * the new window, do not retransmit it. The exception is when
1860	 * the window has shrunk to zero; in that case our retransmit
1861	 * serves as a zero window probe.
1862	 */
1863	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
1864	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
1865		return -EAGAIN;
1866
1867	if (skb->len > cur_mss) {
1868		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1869			return -ENOMEM; /* We'll try again later. */
1870	}
1871
1872	/* Collapse two adjacent packets if worthwhile and we can. */
1873	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1874	    (skb->len < (cur_mss >> 1)) &&
1875	    (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1876	    (!tcp_skb_is_last(sk, skb)) &&
1877	    (skb_shinfo(skb)->nr_frags == 0 &&
1878	     skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1879	    (tcp_skb_pcount(skb) == 1 &&
1880	     tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1881	    (sysctl_tcp_retrans_collapse != 0))
1882		tcp_retrans_try_collapse(sk, skb, cur_mss);
1883
1884	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1885		return -EHOSTUNREACH; /* Routing failure or similar. */
1886
1887	/* Some Solaris stacks overoptimize and ignore the FIN on a
1888	 * retransmit when old data is attached.  So strip it off
1889	 * since it is cheap to do so and saves bytes on the network.
1890	 */
1891	if (skb->len > 0 &&
1892	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1893	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1894		if (!pskb_trim(skb, 0)) {
1895			/* Reuse, even though it does some unnecessary work */
1896			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
1897					     TCP_SKB_CB(skb)->flags);
1898			skb->ip_summed = CHECKSUM_NONE;
1899		}
1900	}
1901
1902	/* Make a copy if the first transmission SKB clone we made
1903	 * is still in somebody's hands, else make a clone.
1904	 */
1905	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1906
1907	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
1908
1909	if (err == 0) {
1910		/* Update global TCP statistics. */
1911		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1912
1913		tp->total_retrans++;
1914
1915#if FASTRETRANS_DEBUG > 0
1916		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1917			if (net_ratelimit())
1918				printk(KERN_DEBUG "retrans_out leaked.\n");
1919		}
1920#endif
1921		if (!tp->retrans_out)
1922			tp->lost_retrans_low = tp->snd_nxt;
1923		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1924		tp->retrans_out += tcp_skb_pcount(skb);
1925
1926		/* Save stamp of the first retransmit. */
1927		if (!tp->retrans_stamp)
1928			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1929
1930		tp->undo_retrans++;
1931
1932		/* snd_nxt is stored to detect loss of retransmitted segment,
1933		 * see tcp_input.c tcp_sacktag_write_queue().
1934		 */
1935		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1936	}
1937	return err;
1938}
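
/*
 * Illustrative sketch, not from the original source: the buffer budget
 * test at the top of tcp_retransmit_skb().  Roughly a quarter of the
 * queued bytes is reserved for copying overhead (fragmentation,
 * tunneling, mangling), and the budget never exceeds the send buffer.
 * Parameter names are hypothetical.
 */
static int example_retrans_would_overrun(unsigned int wmem_alloc,
					 unsigned int wmem_queued,
					 unsigned int sndbuf)
{
	unsigned int limit = wmem_queued + (wmem_queued >> 2);

	if (limit > sndbuf)
		limit = sndbuf;
	return wmem_alloc > limit;	/* non-zero: caller should return -EAGAIN */
}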
1939
1940/* This gets called after a retransmit timeout, and the initially
1941 * retransmitted data is acknowledged.  It tries to continue
1942 * resending the rest of the retransmit queue, until either
1943 * we've sent it all or the congestion window limit is reached.
1944 * If doing SACK, the first ACK which comes back for a timeout
1945 * based retransmit packet might feed us FACK information again.
1946 * If so, we use it to avoid unnecessary retransmissions.
1947 */
1948void tcp_xmit_retransmit_queue(struct sock *sk)
1949{
1950	const struct inet_connection_sock *icsk = inet_csk(sk);
1951	struct tcp_sock *tp = tcp_sk(sk);
1952	struct sk_buff *skb;
1953	int packet_cnt;
1954
1955	if (tp->retransmit_skb_hint) {
1956		skb = tp->retransmit_skb_hint;
1957		packet_cnt = tp->retransmit_cnt_hint;
1958	} else {
1959		skb = tcp_write_queue_head(sk);
1960		packet_cnt = 0;
1961	}
1962
1963	/* First pass: retransmit lost packets. */
1964	if (tp->lost_out) {
1965		tcp_for_write_queue_from(skb, sk) {
1966			__u8 sacked = TCP_SKB_CB(skb)->sacked;
1967
1968			if (skb == tcp_send_head(sk))
1969				break;
1970			/* we could do better than to assign each time */
1971			tp->retransmit_skb_hint = skb;
1972			tp->retransmit_cnt_hint = packet_cnt;
1973
1974			/* Assume this retransmit will generate
1975			 * only one packet for congestion window
1976			 * calculation purposes.  This works because
1977			 * tcp_retransmit_skb() will chop up the
1978			 * packet to be MSS sized and all the
1979			 * packet counting works out.
1980			 */
1981			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1982				return;
1983
1984			if (sacked & TCPCB_LOST) {
1985				if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1986					if (tcp_retransmit_skb(sk, skb)) {
1987						tp->retransmit_skb_hint = NULL;
1988						return;
1989					}
1990					if (icsk->icsk_ca_state != TCP_CA_Loss)
1991						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1992					else
1993						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1994
1995					if (skb == tcp_write_queue_head(sk))
1996						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1997									  inet_csk(sk)->icsk_rto,
1998									  TCP_RTO_MAX);
1999				}
2000
2001				packet_cnt += tcp_skb_pcount(skb);
2002				if (packet_cnt >= tp->lost_out)
2003					break;
2004			}
2005		}
2006	}
2007
2008	/* OK, demanded retransmission is finished. */
2009
2010	/* Forward retransmissions are possible only during Recovery. */
2011	if (icsk->icsk_ca_state != TCP_CA_Recovery)
2012		return;
2013
2014	/* No forward retransmissions in Reno are possible. */
2015	if (tcp_is_reno(tp))
2016		return;
2017
2018	/* Yeah, we have to make a difficult choice between forward transmission
2019	 * and retransmission... Both ways have their merits...
2020	 *
2021	 * For now we do not retransmit anything as long as we have new
2022	 * segments to send. Otherwise, follow rule 3 for NextSeg() as
2023	 * specified in RFC 3517.
2024	 */
2025
2026	if (tcp_may_send_now(sk))
2027		return;
2028
2029	/* If nothing is SACKed, highest_sack in the loop won't be valid */
2030	if (!tp->sacked_out)
2031		return;
2032
2033	if (tp->forward_skb_hint)
2034		skb = tp->forward_skb_hint;
2035	else
2036		skb = tcp_write_queue_head(sk);
2037
2038	tcp_for_write_queue_from(skb, sk) {
2039		if (skb == tcp_send_head(sk))
2040			break;
2041		tp->forward_skb_hint = skb;
2042
2043		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2044			break;
2045
2046		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2047			break;
2048
2049		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
2050			continue;
2051
2052		/* Ok, retransmit it. */
2053		if (tcp_retransmit_skb(sk, skb)) {
2054			tp->forward_skb_hint = NULL;
2055			break;
2056		}
2057
2058		if (skb == tcp_write_queue_head(sk))
2059			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2060						  inet_csk(sk)->icsk_rto,
2061						  TCP_RTO_MAX);
2062
2063		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
2064	}
2065}
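
/*
 * Illustrative sketch, not part of the original file: the in-flight
 * accounting that both loops above compare against snd_cwnd.  In this
 * kernel generation tcp_packets_in_flight() is packets_out minus the
 * segments that have left the network (SACKed or lost) plus the ones
 * retransmitted; the sketch just spells out that arithmetic with
 * hypothetical parameter names.
 */
static unsigned int example_packets_in_flight(unsigned int packets_out,
					      unsigned int sacked_out,
					      unsigned int lost_out,
					      unsigned int retrans_out)
{
	return packets_out - (sacked_out + lost_out) + retrans_out;
}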
2066
2067/* Send a FIN.  The caller locks the socket for us.  Queueing the FIN
2068 * frame must not be allowed to fail under any circumstances.
2069 */
2070void tcp_send_fin(struct sock *sk)
2071{
2072	struct tcp_sock *tp = tcp_sk(sk);
2073	struct sk_buff *skb = tcp_write_queue_tail(sk);
2074	int mss_now;
2075
2076	/* Optimization: tack on the FIN if we have a queue of
2077	 * unsent frames.  But be careful about outgoing SACKs
2078	 * and IP options.
2079	 */
2080	mss_now = tcp_current_mss(sk, 1);
2081
2082	if (tcp_send_head(sk) != NULL) {
2083		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
2084		TCP_SKB_CB(skb)->end_seq++;
2085		tp->write_seq++;
2086	} else {
2087		/* Socket is locked, keep trying until memory is available. */
2088		for (;;) {
2089			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
2090			if (skb)
2091				break;
2092			yield();
2093		}
2094
2095		/* Reserve space for headers and prepare control bits. */
2096		skb_reserve(skb, MAX_TCP_HEADER);
2097		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2098		tcp_init_nondata_skb(skb, tp->write_seq,
2099				     TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
2100		tcp_queue_skb(sk, skb);
2101	}
2102	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2103}
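
/*
 * Illustrative sketch, not from the original source: why tcp_send_fin()
 * bumps end_seq and write_seq by exactly one.  SYN and FIN each occupy
 * one unit of sequence space on top of the payload length.  Names are
 * hypothetical.
 */
static unsigned int example_end_seq(unsigned int seq, unsigned int len,
				    int syn, int fin)
{
	return seq + len + (syn ? 1 : 0) + (fin ? 1 : 0);
}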
2104
2105/* We get here when a process closes a file descriptor (either due to
2106 * an explicit close() or as a byproduct of exit()'ing) and there
2107 * was unread data in the receive queue.  This behavior is recommended
2108 * by RFC 2525, section 2.17.  -DaveM
2109 */
2110void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2111{
2112	struct sk_buff *skb;
2113
2114	/* NOTE: No TCP options attached and we never retransmit this. */
2115	skb = alloc_skb(MAX_TCP_HEADER, priority);
2116	if (!skb) {
2117		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
2118		return;
2119	}
2120
2121	/* Reserve space for headers and prepare control bits. */
2122	skb_reserve(skb, MAX_TCP_HEADER);
2123	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2124			     TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
2125	/* Send it off. */
2126	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2127	if (tcp_transmit_skb(sk, skb, 0, priority))
2128		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
2129}
2130
2131/* WARNING: This routine must only be called when we have already sent
2132 * a SYN packet that crossed the incoming SYN that caused this routine
2133 * to get called. If this assumption fails then the initial rcv_wnd
2134 * and rcv_wscale values will not be correct.
2135 */
2136int tcp_send_synack(struct sock *sk)
2137{
2138	struct sk_buff *skb;
2139
2140	skb = tcp_write_queue_head(sk);
2141	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
2142		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2143		return -EFAULT;
2144	}
2145	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
2146		if (skb_cloned(skb)) {
2147			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2148			if (nskb == NULL)
2149				return -ENOMEM;
2150			tcp_unlink_write_queue(skb, sk);
2151			skb_header_release(nskb);
2152			__tcp_add_write_queue_head(sk, nskb);
2153			sk_wmem_free_skb(sk, skb);
2154			sk->sk_wmem_queued += nskb->truesize;
2155			sk_mem_charge(sk, nskb->truesize);
2156			skb = nskb;
2157		}
2158
2159		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
2160		TCP_ECN_send_synack(tcp_sk(sk), skb);
2161	}
2162	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2163	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2164}
2165
2166/*
2167 * Prepare a SYN-ACK.
2168 */
2169struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2170				struct request_sock *req)
2171{
2172	struct inet_request_sock *ireq = inet_rsk(req);
2173	struct tcp_sock *tp = tcp_sk(sk);
2174	struct tcphdr *th;
2175	int tcp_header_size;
2176	struct sk_buff *skb;
2177#ifdef CONFIG_TCP_MD5SIG
2178	struct tcp_md5sig_key *md5;
2179	__u8 *md5_hash_location;
2180#endif
2181
2182	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2183	if (skb == NULL)
2184		return NULL;
2185
2186	/* Reserve space for headers. */
2187	skb_reserve(skb, MAX_TCP_HEADER);
2188
2189	skb->dst = dst_clone(dst);
2190
2191	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
2192			   (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
2193			   (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
2194			   /* SACK_PERM is in the place of NOP NOP of TS */
2195			   ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
2196
2197#ifdef CONFIG_TCP_MD5SIG
2198	/* Are we doing MD5 on this segment?  If so, make room for it. */
2199	md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
2200	if (md5)
2201		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
2202#endif
2203	skb_push(skb, tcp_header_size);
2204	skb_reset_transport_header(skb);
2205
2206	th = tcp_hdr(skb);
2207	memset(th, 0, sizeof(struct tcphdr));
2208	th->syn = 1;
2209	th->ack = 1;
2210	TCP_ECN_make_synack(req, th);
2211	th->source = inet_sk(sk)->sport;
2212	th->dest = ireq->rmt_port;
2213	/* Setting these flags is superfluous here for the callers (and ECE is
2214	 * not even correctly set).
2215	 */
2216	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2217			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
2218	th->seq = htonl(TCP_SKB_CB(skb)->seq);
2219	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2220	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2221		__u8 rcv_wscale;
2222		/* Set this up on the first call only */
2223		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2224		/* tcp_full_space because it is guaranteed to be the first packet */
2225		tcp_select_initial_window(tcp_full_space(sk),
2226			dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2227			&req->rcv_wnd,
2228			&req->window_clamp,
2229			ireq->wscale_ok,
2230			&rcv_wscale);
2231		ireq->rcv_wscale = rcv_wscale;
2232	}
2233
2234	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2235	th->window = htons(min(req->rcv_wnd, 65535U));
2236#ifdef CONFIG_SYN_COOKIES
2237	if (unlikely(req->cookie_ts))
2238		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
2239	else
2240#endif
2241	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2242	tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
2243			      ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
2244			      TCP_SKB_CB(skb)->when,
2245			      req->ts_recent,
2246			      (
2247#ifdef CONFIG_TCP_MD5SIG
2248			       md5 ? &md5_hash_location :
2249#endif
2250			       NULL)
2251			      );
2252
2253	th->doff = (tcp_header_size >> 2);
2254	TCP_INC_STATS(TCP_MIB_OUTSEGS);
2255
2256#ifdef CONFIG_TCP_MD5SIG
2257	/* Okay, we have all we need - do the md5 hash if needed */
2258	if (md5) {
2259		tp->af_specific->calc_md5_hash(md5_hash_location,
2260					       md5,
2261					       NULL, dst, req,
2262					       tcp_hdr(skb), sk->sk_protocol,
2263					       skb->len);
2264	}
2265#endif
2266
2267	return skb;
2268}
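
/*
 * Illustrative sketch, not part of the original file: how the SYN-ACK
 * header length above is assembled.  The literal sizes mirror the
 * aligned TCPOLEN_* values of this kernel generation (20-byte base
 * header, 4-byte MSS, 12-byte timestamps, 4-byte window scale, 4-byte
 * SACK-permitted, 20-byte MD5 signature); treat the exact numbers as an
 * assumption.
 */
static int example_synack_header_len(int tstamp_ok, int wscale_ok,
				     int sack_ok, int md5)
{
	int len = 20 /* struct tcphdr */ + 4 /* MSS option */;

	if (tstamp_ok)
		len += 12;	/* aligned timestamp option */
	if (wscale_ok)
		len += 4;	/* aligned window scale option */
	if (sack_ok && !tstamp_ok)
		len += 4;	/* SACK-perm; otherwise it rides in TS's NOPs */
	if (md5)
		len += 20;	/* aligned MD5 signature option */
	return len;
}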
2269
2270/*
2271 * Do all connect socket setups that can be done AF independent.
2272 */
2273static void tcp_connect_init(struct sock *sk)
2274{
2275	struct dst_entry *dst = __sk_dst_get(sk);
2276	struct tcp_sock *tp = tcp_sk(sk);
2277	__u8 rcv_wscale;
2278
2279	/* We'll fix this up when we get a response from the other end.
2280	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2281	 */
2282	tp->tcp_header_len = sizeof(struct tcphdr) +
2283		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2284
2285#ifdef CONFIG_TCP_MD5SIG
2286	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2287		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2288#endif
2289
2290	/* If the user set TCP_MAXSEG, record it as the MSS clamp */
2291	if (tp->rx_opt.user_mss)
2292		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2293	tp->max_window = 0;
2294	tcp_mtup_init(sk);
2295	tcp_sync_mss(sk, dst_mtu(dst));
2296
2297	if (!tp->window_clamp)
2298		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2299	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
2300	tcp_initialize_rcv_mss(sk);
2301
2302	tcp_select_initial_window(tcp_full_space(sk),
2303				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2304				  &tp->rcv_wnd,
2305				  &tp->window_clamp,
2306				  sysctl_tcp_window_scaling,
2307				  &rcv_wscale);
2308
2309	tp->rx_opt.rcv_wscale = rcv_wscale;
2310	tp->rcv_ssthresh = tp->rcv_wnd;
2311
2312	sk->sk_err = 0;
2313	sock_reset_flag(sk, SOCK_DONE);
2314	tp->snd_wnd = 0;
2315	tcp_init_wl(tp, tp->write_seq, 0);
2316	tp->snd_una = tp->write_seq;
2317	tp->snd_sml = tp->write_seq;
2318	tp->rcv_nxt = 0;
2319	tp->rcv_wup = 0;
2320	tp->copied_seq = 0;
2321
2322	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2323	inet_csk(sk)->icsk_retransmits = 0;
2324	tcp_clear_retrans(tp);
2325}
2326
2327/*
2328 * Build a SYN and send it off.
2329 */
2330int tcp_connect(struct sock *sk)
2331{
2332	struct tcp_sock *tp = tcp_sk(sk);
2333	struct sk_buff *buff;
2334
2335	tcp_connect_init(sk);
2336
2337	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
2338	if (unlikely(buff == NULL))
2339		return -ENOBUFS;
2340
2341	/* Reserve space for headers. */
2342	skb_reserve(buff, MAX_TCP_HEADER);
2343
2344	tp->snd_nxt = tp->write_seq;
2345	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
2346	TCP_ECN_send_syn(sk, buff);
2347
2348	/* Send it off. */
2349	TCP_SKB_CB(buff)->when = tcp_time_stamp;
2350	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2351	skb_header_release(buff);
2352	__tcp_add_write_queue_tail(sk, buff);
2353	sk->sk_wmem_queued += buff->truesize;
2354	sk_mem_charge(sk, buff->truesize);
2355	tp->packets_out += tcp_skb_pcount(buff);
2356	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
2357
2358	/* We change tp->snd_nxt after the tcp_transmit_skb() call
2359	 * in order to make this packet get counted in tcpOutSegs.
2360	 */
2361	tp->snd_nxt = tp->write_seq;
2362	tp->pushed_seq = tp->write_seq;
2363	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
2364
2365	/* Timer for repeating the SYN until an answer. */
2366	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2367				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2368	return 0;
2369}
2370
2371/* Send out a delayed ack, the caller does the policy checking
2372 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
2373 * for details.
2374 */
2375void tcp_send_delayed_ack(struct sock *sk)
2376{
2377	struct inet_connection_sock *icsk = inet_csk(sk);
2378	int ato = icsk->icsk_ack.ato;
2379	unsigned long timeout;
2380
2381	if (ato > TCP_DELACK_MIN) {
2382		const struct tcp_sock *tp = tcp_sk(sk);
2383		int max_ato = HZ / 2;
2384
2385		if (icsk->icsk_ack.pingpong ||
2386		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
2387			max_ato = TCP_DELACK_MAX;
2388
2389		/* Slow path, intersegment interval is "high". */
2390
2391		/* If some rtt estimate is known, use it to bound delayed ack.
2392		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
2393		 * directly.
2394		 */
2395		if (tp->srtt) {
2396			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
2397
2398			if (rtt < max_ato)
2399				max_ato = rtt;
2400		}
2401
2402		ato = min(ato, max_ato);
2403	}
2404
2405	/* Stay within the limit we were given */
2406	timeout = jiffies + ato;
2407
2408	/* Use the new timeout only if there wasn't an older one pending. */
2409	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
2410		/* If delack timer was blocked or is about to expire,
2411		 * send ACK now.
2412		 */
2413		if (icsk->icsk_ack.blocked ||
2414		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
2415			tcp_send_ack(sk);
2416			return;
2417		}
2418
2419		if (!time_before(timeout, icsk->icsk_ack.timeout))
2420			timeout = icsk->icsk_ack.timeout;
2421	}
2422	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
2423	icsk->icsk_ack.timeout = timeout;
2424	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
2425}
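
/*
 * Illustrative sketch, not from the original source: the clamping done
 * by tcp_send_delayed_ack() above, with the kernel constants passed in
 * as parameters and the smoothed RTT already converted to jiffies.
 * Names are hypothetical.
 */
static int example_clamp_ato(int ato, int max_ato, int srtt_jiffies,
			     int delack_min)
{
	if (srtt_jiffies) {
		/* A known RTT estimate tightens the upper bound, but never
		 * below the minimum delayed-ACK interval.
		 */
		int rtt = srtt_jiffies > delack_min ? srtt_jiffies : delack_min;

		if (rtt < max_ato)
			max_ato = rtt;
	}
	return ato < max_ato ? ato : max_ato;
}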
2426
2427/* This routine sends an ack and also updates the window. */
2428void tcp_send_ack(struct sock *sk)
2429{
2430	struct sk_buff *buff;
2431
2432	/* If we have been reset, we may not send again. */
2433	if (sk->sk_state == TCP_CLOSE)
2434		return;
2435
2436	/* We are not putting this on the write queue, so
2437	 * tcp_transmit_skb() will set the ownership to this
2438	 * sock.
2439	 */
2440	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2441	if (buff == NULL) {
2442		inet_csk_schedule_ack(sk);
2443		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
2444		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
2445					  TCP_DELACK_MAX, TCP_RTO_MAX);
2446		return;
2447	}
2448
2449	/* Reserve space for headers and prepare control bits. */
2450	skb_reserve(buff, MAX_TCP_HEADER);
2451	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
2452
2453	/* Send it off, this clears delayed acks for us. */
2454	TCP_SKB_CB(buff)->when = tcp_time_stamp;
2455	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
2456}
2457
2458/* This routine sends a packet with an out of date sequence
2459 * number. It assumes the other end will try to ack it.
2460 *
2461 * Question: what should we do while in urgent mode?
2462 * 4.4BSD forces sending single byte of data. We cannot send
2463 * out of window data, because we have SND.NXT==SND.MAX...
2464 *
2465 * Current solution: to send TWO zero-length segments in urgent mode:
2466 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, and another,
2467 * out-of-date with SEG.SEQ=SND.UNA-1, to probe the window.
2468 */
2469static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2470{
2471	struct tcp_sock *tp = tcp_sk(sk);
2472	struct sk_buff *skb;
2473
2474	/* We don't queue it, tcp_transmit_skb() sets ownership. */
2475	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2476	if (skb == NULL)
2477		return -1;
2478
2479	/* Reserve space for headers and set control bits. */
2480	skb_reserve(skb, MAX_TCP_HEADER);
2481	/* Use a previous sequence.  This should cause the other
2482	 * end to send an ack.  Don't queue or clone SKB, just
2483	 * send it.
2484	 */
2485	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
2486	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2487	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2488}
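
/*
 * Illustrative sketch, not part of the original file: the sequence
 * number picked for a window probe, plus the serial-number comparison
 * that makes it "out of date".  A non-urgent probe sits one byte before
 * SND.UNA, i.e. in already-acknowledged space, which forces the peer to
 * reply with a fresh ACK advertising its current window.  Names are
 * hypothetical.
 */
static int example_seq_before(unsigned int seq1, unsigned int seq2)
{
	/* Wrap-safe comparison on the 32-bit sequence space. */
	return (int)(seq1 - seq2) < 0;
}

static unsigned int example_probe_seq(unsigned int snd_una, int urgent)
{
	/* example_seq_before(result, snd_una) holds for the non-urgent case. */
	return snd_una - (urgent ? 0 : 1);
}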
2489
2490int tcp_write_wakeup(struct sock *sk)
2491{
2492	struct tcp_sock *tp = tcp_sk(sk);
2493	struct sk_buff *skb;
2494
2495	if (sk->sk_state == TCP_CLOSE)
2496		return -1;
2497
2498	if ((skb = tcp_send_head(sk)) != NULL &&
2499	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
2500		int err;
2501		unsigned int mss = tcp_current_mss(sk, 0);
2502		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2503
2504		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
2505			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
2506
2507		/* We are probing the opening of a window
2508		 * but the window size is != 0; this must have been
2509		 * the result of sender-side SWS avoidance.
2510		 */
2511		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2512		    skb->len > mss) {
2513			seg_size = min(seg_size, mss);
2514			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
2515			if (tcp_fragment(sk, skb, seg_size, mss))
2516				return -1;
2517		} else if (!tcp_skb_pcount(skb))
2518			tcp_set_skb_tso_segs(sk, skb, mss);
2519
2520		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
2521		TCP_SKB_CB(skb)->when = tcp_time_stamp;
2522		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2523		if (!err)
2524			tcp_event_new_data_sent(sk, skb);
2525		return err;
2526	} else {
2527		if (tp->urg_mode &&
2528		    between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
2529			tcp_xmit_probe_skb(sk, 1);
2530		return tcp_xmit_probe_skb(sk, 0);
2531	}
2532}
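
/*
 * Illustrative sketch, not from the original source: how many bytes
 * tcp_write_wakeup() pushes when the peer's window is open but smaller
 * than the head segment.  The probe is clamped to the remaining window
 * space and to one MSS, which is why the head segment may have to be
 * fragmented first.  Names are hypothetical.
 */
static unsigned int example_wakeup_bytes(unsigned int wnd_end,
					 unsigned int seq,
					 unsigned int skb_len,
					 unsigned int mss)
{
	unsigned int seg_size = wnd_end - seq;	/* window space left */

	if (seg_size > mss)
		seg_size = mss;
	if (seg_size > skb_len)
		seg_size = skb_len;
	return seg_size;
}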
2533
2534/* A window probe timeout has occurred.  If the window is not closed, send
2535 * a partial packet, else a zero window probe.
2536 */
2537void tcp_send_probe0(struct sock *sk)
2538{
2539	struct inet_connection_sock *icsk = inet_csk(sk);
2540	struct tcp_sock *tp = tcp_sk(sk);
2541	int err;
2542
2543	err = tcp_write_wakeup(sk);
2544
2545	if (tp->packets_out || !tcp_send_head(sk)) {
2546		/* Cancel probe timer, if it is not required. */
2547		icsk->icsk_probes_out = 0;
2548		icsk->icsk_backoff = 0;
2549		return;
2550	}
2551
2552	if (err <= 0) {
2553		if (icsk->icsk_backoff < sysctl_tcp_retries2)
2554			icsk->icsk_backoff++;
2555		icsk->icsk_probes_out++;
2556		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2557					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2558					  TCP_RTO_MAX);
2559	} else {
2560		/* If the packet was not sent due to local congestion,
2561		 * do not back off and do not remember icsk_probes_out.
2562		 * Let local senders fight for local resources.
2563		 *
2564		 * Still use the accumulated backoff, though.
2565		 */
2566		if (!icsk->icsk_probes_out)
2567			icsk->icsk_probes_out = 1;
2568		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2569					  min(icsk->icsk_rto << icsk->icsk_backoff,
2570					      TCP_RESOURCE_PROBE_INTERVAL),
2571					  TCP_RTO_MAX);
2572	}
2573}
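
/*
 * Illustrative sketch, not part of the original file: the probe timer
 * backoff used above.  The interval doubles with each backoff step and
 * is clamped to a cap: TCP_RTO_MAX in the normal case, or
 * TCP_RESOURCE_PROBE_INTERVAL when the probe failed because of local
 * congestion.  Parameter names are hypothetical.
 */
static unsigned long example_probe0_timeout(unsigned long rto,
					    unsigned int backoff,
					    unsigned long cap)
{
	unsigned long timeout = rto << backoff;

	return timeout < cap ? timeout : cap;
}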
2574
2575EXPORT_SYMBOL(tcp_select_initial_window);
2576EXPORT_SYMBOL(tcp_connect);
2577EXPORT_SYMBOL(tcp_make_synack);
2578EXPORT_SYMBOL(tcp_simple_retransmit);
2579EXPORT_SYMBOL(tcp_sync_mss);
2580EXPORT_SYMBOL(tcp_mtup_init);
2581