tcp_minisocks.c revision 463c84b97f24010a67cd871746d6a7e4c925a5f9
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

int sysctl_tcp_tw_recycle;
int sysctl_tcp_max_tw_buckets = NR_FILE * 2;

int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;

static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo);

static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return (seq == e_win && seq == end_seq);
}

/* New-style handling of TIME_WAIT sockets. */

int tcp_tw_count;

/*
 * * The main purpose of the TIME-WAIT state is to close the connection
 *   gracefully when one of the ends sits in LAST-ACK or CLOSING,
 *   retransmitting its FIN (and, probably, a tail of data) because one
 *   or more of our ACKs were lost.
 * * What is the TIME-WAIT timeout?  It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering off their
 *   path.  That is not quite correct.  This timeout is calculated so that
 *   it exceeds the maximal retransmission timeout by enough to allow the
 *   loss of one (or more) segments sent by the peer together with our
 *   ACKs.  This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed, and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT with
 *   this semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates (f.e. based
 *   on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read the section SEGMENT
 * ARRIVES from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket is
 * _not_ stateless.  Strictly speaking, that means we must spinlock it.
 * I do not want to!  The probability of misbehaviour is ridiculously
 * low and, it seems, we could use some mb() tricks to avoid misreading
 * sequence numbers, states etc.
 *							--ANK
 */
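/* A worked example of tcp_in_window() above (illustrative values, not
 * from the original file).  With rcv_nxt = 100 and rcv_wnd = 1000, so
 * s_win = 100 and e_win = 1100:
 *
 *	tcp_in_window(100, 100, 100, 1100)   -> 1  bare ACK at the left edge
 *	tcp_in_window(150, 300, 100, 1100)   -> 1  data overlapping the window
 *	tcp_in_window(1100, 1100, 100, 1100) -> 1  zero-length probe at the right edge
 *	tcp_in_window(50, 90, 100, 1100)     -> 0  entirely old data
 */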
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	struct tcp_options_received tmp_opt;
	int paws_reject = 0;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after a half-duplex
		 * close, reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			tcp_tw_deschedule(tw);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		/* I am shamed, but I failed to make it more elegant.
		 * Yes, it is a direct reference to IP, which is impossible
		 * to generalize to IPv6. Taking into account that IPv6
		 * does not understand recycling in any case, it is not
		 * a big problem in practice. --ANK */
		if (tw->tw_family == AF_INET &&
		    sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			tcp_tw_schedule(tw, tw->tw_timeout);
		else
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now the real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* An in-window segment; it may only be a reset or a bare ack. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				tcp_tw_deschedule(tw);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.
	 *
	 * All such segments are ACKed immediately.
	 *
	 * The only exception is a new SYN. We accept it, if it is not an
	 * old duplicate and we are not in danger of being killed by
	 * delayed old duplicates. The RFC check (that it carries a newer
	 * sequence number) works at rates <40Mbit/sec. However, if PAWS
	 * works, it is reliable AND, even more, we may relax the silly
	 * seq space cutoff.
	 *
	 * RED-PEN: we violate the main RFC requirement: if this SYN turns
	 * out to be an old duplicate (i.e. we receive an RST in reply to
	 * our SYN-ACK), we must return the socket to time-wait state. It
	 * is not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN it may be both an old duplicate
		 * and a new good SYN with a random sequence number <rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		if (paws_reject || th->ack)
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by the caller.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
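/* A minimal sketch of how a caller consumes these return codes, modeled
 * on the do_time_wait path of tcp_v4_rcv() (abbreviated and illustrative,
 * not the verbatim caller; lookups, label targets and error handling are
 * elided):
 *
 *	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 *	case TCP_TW_SYN:	// acceptable new SYN: find a matching
 *				// listener and reprocess, else fall through
 *	case TCP_TW_ACK:	// re-ACK from the timewait bucket
 *		tcp_v4_timewait_ack(sk, skb);
 *		break;
 *	case TCP_TW_RST:	// half-duplex close violated: send RST
 *	case TCP_TW_SUCCESS:	// segment fully consumed: just drop it
 *		;
 *	}
 */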
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;
	const struct tcp_sock *tp = tcp_sk(sk);
	int recycle_ok = 0;

	if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = tp->af_specific->remember_stamp(sk);

	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
		const struct inet_connection_sock *icsk = inet_csk(sk);
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);
			struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);

			ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
			ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
			tw->tw_ipv6only = np->ipv6only;
		}
#endif
		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		tcp_tw_schedule(tw, timeo);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		if (net_ratelimit())
			printk(KERN_INFO "TCP: time wait bucket table overflow\n");
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}
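/* Worked example of the rto shift arithmetic above (illustrative numbers,
 * assuming HZ = 1000 so one jiffy is 1 ms): with icsk_rto = 200 jiffies,
 *
 *	rto = (200 << 2) - (200 >> 1) = 800 - 100 = 700 jiffies,
 *
 * i.e. exactly 3.5 * RTO (700 ms), computed without a multiplication.
 * See the rationale comment in tcp_tw_schedule() below for why 3.5.
 */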
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot;

static void tcp_twkill(unsigned long);

/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS	8 /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD	(TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)

#define TCP_TWKILL_QUOTA	100

static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
static DEFINE_SPINLOCK(tw_death_lock);
static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
static void twkill_work(void *);
static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
static u32 twkill_thread_slots;

/* Returns non-zero if the quota was exceeded. */
static int tcp_do_twkill_work(int slot, unsigned int quota)
{
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	unsigned int killed;
	int ret;

	/* NOTE: compare this to the previous version, where the lock
	 * was released after detaching the chain.  It was racy,
	 * because tw buckets are scheduled in a non-serialized context
	 * in 2.3 (with netfilter), and with softnet it is common, because
	 * soft irqs are not sequenced.
	 */
	killed = 0;
	ret = 0;
rescan:
	inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
		__inet_twsk_del_dead_node(tw);
		spin_unlock(&tw_death_lock);
		__inet_twsk_kill(tw, &tcp_hashinfo);
		inet_twsk_put(tw);
		killed++;
		spin_lock(&tw_death_lock);
		if (killed > quota) {
			ret = 1;
			break;
		}

		/* While we dropped tw_death_lock, another cpu may have
		 * killed off the next TW bucket in the list, therefore
		 * do a fresh re-read of the hlist head node with the
		 * lock reacquired.  We still use the hlist traversal
		 * macro in order to get the prefetches.
		 */
		goto rescan;
	}

	tcp_tw_count -= killed;
	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);

	return ret;
}

static void tcp_twkill(unsigned long dummy)
{
	int need_timer, ret;

	spin_lock(&tw_death_lock);

	if (tcp_tw_count == 0)
		goto out;

	need_timer = 0;
	ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
	if (ret) {
		/* Quota exhausted: hand the rest of this slot to the
		 * workqueue and rearm the timer. */
		twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
		mb();
		schedule_work(&tcp_twkill_work);
		need_timer = 1;
	} else {
		/* We purged the entire slot, anything left? */
		if (tcp_tw_count)
			need_timer = 1;
	}
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
	if (need_timer)
		mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
out:
	spin_unlock(&tw_death_lock);
}
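/* A worked example of the slot timing above (illustrative, using the
 * defaults in this tree, where TCP_TIMEWAIT_LEN is 60*HZ): with
 * TCP_TWKILL_SLOTS == 8, the slow timer fires every
 *
 *	TCP_TWKILL_PERIOD = 60*HZ / 8 = 7.5 seconds,
 *
 * so a bucket parked in the farthest slot is reaped after
 * 8 * 7.5 s = 60 s, i.e. the full TIME_WAIT lifetime.
 */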
extern void twkill_slots_invalid(void);

static void twkill_work(void *dummy)
{
	int i;

	/* Link-time assertion: the condition is a compile-time constant,
	 * so the call to the undefined twkill_slots_invalid() is either
	 * optimized away or breaks the build if the slot bitmask cannot
	 * fit into twkill_thread_slots. */
	if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
		twkill_slots_invalid();

	while (twkill_thread_slots) {
		spin_lock_bh(&tw_death_lock);
		for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
			if (!(twkill_thread_slots & (1 << i)))
				continue;

			while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
				if (need_resched()) {
					spin_unlock_bh(&tw_death_lock);
					schedule();
					spin_lock_bh(&tw_death_lock);
				}
			}

			twkill_thread_slots &= ~(1 << i);
		}
		spin_unlock_bh(&tw_death_lock);
	}
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct inet_timewait_sock *tw)
{
	spin_lock(&tw_death_lock);
	if (inet_twsk_del_dead_node(tw)) {
		inet_twsk_put(tw);
		if (--tcp_tw_count == 0)
			del_timer(&tcp_tw_timer);
	}
	spin_unlock(&tw_death_lock);
	__inet_twsk_kill(tw, &tcp_hashinfo);
}
/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer =
		TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];

static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
{
	struct hlist_head *list;
	int slot;

	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost.  If N subsequent
	 * retransmitted FINs (or previous segments) are lost (the
	 * probability of such an event is p^(N+1), where p is the
	 * probability of losing a single packet), the time to detect
	 * the loss is about RTO*(2^N - 1) with exponential backoff.
	 * The normal timewait length is calculated so that we wait at
	 * least for one retransmitted FIN (the maximal RTO is 120 sec).
	 * [ BTW Linux, following BSD, violates this requirement by
	 *   waiting only 60 sec; we should wait at least 240 secs.
	 *   Well, 240 consumes too many resources 8)		]
	 * This interval is not reduced, so that we catch old duplicates
	 * and responses to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect old duplicates, we can
	 * reduce the interval to the bounds required by the RTO, rather
	 * than by the MSL.  So, if the peer understands PAWS, we kill
	 * the tw bucket after 3.5*RTO (it is important that this number
	 * be greater than the TS tick!) and detect old duplicates with
	 * the help of PAWS.
	 */
	slot = (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;

	spin_lock(&tw_death_lock);

	/* Unlink it, if it was scheduled */
	if (inet_twsk_del_dead_node(tw))
		tcp_tw_count--;
	else
		atomic_inc(&tw->tw_refcnt);

	if (slot >= TCP_TW_RECYCLE_SLOTS) {
		/* Schedule to the slow timer */
		if (timeo >= TCP_TIMEWAIT_LEN) {
			slot = TCP_TWKILL_SLOTS - 1;
		} else {
			slot = (timeo + TCP_TWKILL_PERIOD - 1) / TCP_TWKILL_PERIOD;
			if (slot >= TCP_TWKILL_SLOTS)
				slot = TCP_TWKILL_SLOTS - 1;
		}
		tw->tw_ttd = jiffies + timeo;
		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
		list = &tcp_tw_death_row[slot];
	} else {
		tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);

		if (tcp_twcal_hand < 0) {
			tcp_twcal_hand = 0;
			tcp_twcal_jiffie = jiffies;
			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot << TCP_TW_RECYCLE_TICK);
			add_timer(&tcp_twcal_timer);
		} else {
			if (time_after(tcp_twcal_timer.expires, jiffies + (slot << TCP_TW_RECYCLE_TICK)))
				mod_timer(&tcp_twcal_timer, jiffies + (slot << TCP_TW_RECYCLE_TICK));
			slot = (tcp_twcal_hand + slot) & (TCP_TW_RECYCLE_SLOTS - 1);
		}
		list = &tcp_twcal_row[slot];
	}

	hlist_add_head(&tw->tw_death_node, list);

	if (tcp_tw_count++ == 0)
		mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
	spin_unlock(&tw_death_lock);
}
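/* A worked example of the recycle-slot mapping above (illustrative; the
 * actual TCP_TW_RECYCLE_TICK value is derived from HZ in net/tcp.h, so
 * the numbers here are an assumption).  Taking TCP_TW_RECYCLE_TICK == 7,
 * i.e. 128-jiffy buckets, a 3.5*RTO timeout of 700 jiffies maps to
 *
 *	slot = (700 + 127) >> 7 = 6,
 *
 * so the bucket is reaped by tcp_twcal_tick() within 6 * 128 = 768
 * jiffies.  A timeout of TCP_TIMEWAIT_LEN would need a slot well beyond
 * TCP_TW_RECYCLE_SLOTS and therefore falls back to the slow
 * tcp_tw_death_row timer instead.
 */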
void tcp_twcal_tick(unsigned long dummy)
{
	int n, slot;
	unsigned long j;
	unsigned long now = jiffies;
	int killed = 0;
	int adv = 0;

	spin_lock(&tw_death_lock);
	if (tcp_twcal_hand < 0)
		goto out;

	slot = tcp_twcal_hand;
	j = tcp_twcal_jiffie;

	for (n = 0; n < TCP_TW_RECYCLE_SLOTS; n++) {
		if (time_before_eq(j, now)) {
			struct hlist_node *node, *safe;
			struct inet_timewait_sock *tw;

			inet_twsk_for_each_inmate_safe(tw, node, safe,
						       &tcp_twcal_row[slot]) {
				__inet_twsk_del_dead_node(tw);
				__inet_twsk_kill(tw, &tcp_hashinfo);
				inet_twsk_put(tw);
				killed++;
			}
		} else {
			if (!adv) {
				adv = 1;
				tcp_twcal_jiffie = j;
				tcp_twcal_hand = slot;
			}

			if (!hlist_empty(&tcp_twcal_row[slot])) {
				mod_timer(&tcp_twcal_timer, j);
				goto out;
			}
		}
		j += (1 << TCP_TW_RECYCLE_TICK);
		slot = (slot + 1) & (TCP_TW_RECYCLE_SLOTS - 1);
	}
	tcp_twcal_hand = -1;

out:
	if ((tcp_tw_count -= killed) == 0)
		del_timer(&tcp_tw_timer);
	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
	spin_unlock(&tw_death_lock);
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could save lots of memory writes here.  The tp of the
 * listening socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = sk_clone(sk, GFP_ATOMIC);

	if (newsk != NULL) {
		struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_sock *newinet = inet_sk(newsk);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp;

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		/* Clone the TCP header template */
		newinet->dport = ireq->rmt_port;
		newsk->sk_write_space = sk_stream_write_space;

		/* Now setup tcp_sock */
		newtp = tcp_sk(newsk);
		newtp->pred_flags = 0;
		newtp->rcv_nxt = treq->rcv_isn + 1;
		newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);

		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff = 0;
		newtp->srtt = 0;
		newtp->mdev = TCP_TIMEOUT_INIT;
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->left_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = 0x7fffffff;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = 2;
		newtp->snd_cwnd_cnt = 0;

		newtp->frto_counter = 0;
		newtp->frto_highmark = 0;

		newtp->ca_ops = &tcp_reno;

		tcp_set_ca_state(newtp, TCP_CA_Open);
		tcp_init_xmit_timers(newsk);
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->rcv_wup = treq->rcv_isn + 1;
		newtp->write_seq = treq->snt_isn + 1;
		newtp->pushed_seq = newtp->write_seq;
		newtp->copied_seq = treq->rcv_isn + 1;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.eff_sacks = 0;

		newtp->probes_out = 0;
		newtp->rx_opt.num_sacks = 0;
		newtp->urg_data = 0;
		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				newtp->rx_opt.sack_ok |= 2;
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
		newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
		if (skb->len >= TCP_MIN_RCVMSS + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		TCP_ECN_openreq_child(newtp, req);
		if (newtp->ecn_flags & TCP_ECN_OK)
			sock_set_flag(newsk, SOCK_NO_LARGESEND);

		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}
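/* A note on the snd_wnd assignment above (illustrative arithmetic): in
 * the normal passive-open path, skb here is the third segment of the
 * handshake, the peer's ACK, not its SYN, so per RFC 1323 the scale
 * factor already applies.  For example, a raw window field of 255 with
 * snd_wscale == 7 yields
 *
 *	snd_wnd = 255 << 7 = 32640 bytes.
 */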
/*
 *	Process an incoming packet for SYN_RECV sockets represented
 *	as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev)
{
	struct tcphdr *th = skb->h.th;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	struct tcp_options_received tmp_opt;
	struct sock *child;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr) >> 2)) {
		tcp_parse_options(skb, &tmp_opt, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true stamp, but it is not
			 * required; it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ) << req->retrans);
			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
		}
	}

	/* Check for a pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send an ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 * describe the SYN-RECV state.  All the description
		 * is wrong, we cannot believe it and should rely only
		 * on common sense and implementation experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 */
		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
		return NULL;
	}
	/* Further, this reproduces the section "SEGMENT ARRIVES"
	   for the SYN-RECEIVED state of RFC793.  That procedure is
	   broken, however; the only case in which it does not work
	   is when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   The malicious sender sends identical SYNs (and thus identical
	   sequence numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK; the ACK test passes.
	   So does the sequence test, the SYN is truncated, and thus we
	   consider it a bare ACK.

	   If tp->defer_accept is set, we silently drop this bare ACK.
	   Otherwise, we create an established connection.  Both ends
	   (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless and rare.  The probability is
	   about the same as us discovering intelligent life on another
	   planet tomorrow.

	   But generally, we should (the RFC lies!) accept an ACK on a
	   SYN-ACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not; hence, we do not either.

	   Note that this case is absolutely generic:
	   we cannot optimize anything here without
	   violating the protocol.  All the checks must be made
	   before we attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 * and the incoming segment acknowledges something not yet
	 * sent (the segment carries an unacceptable ACK) ...
	 * a reset is sent."
	 *
	 * Invalid ACK: a reset will be sent by the listening socket.
	 */
	if ((flg & TCP_FLAG_ACK) &&
	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too-early or too-late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
		goto embryonic_reset;
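	/* A note on the bare-ACK test below (illustrative arithmetic):
	 * TCP_SKB_CB(skb)->end_seq counts seq + data length + SYN + FIN,
	 * so with rcv_isn = 1000 the pure handshake ACK has seq = 1001
	 * and end_seq = 1001 == rcv_isn + 1, while an ACK carrying even
	 * one byte of data has end_seq >= 1002 and is not deferred.
	 */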
	/* The ACK sequence was verified above; just make sure the ACK bit
	 * is set.  If ACK is not set, silently drop the packet.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* If TCP_DEFER_ACCEPT is set, drop a bare ACK. */
	if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

	/* OK, the ACK is valid, create the big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
	 * ESTABLISHED STATE. If it is dropped after the
	 * socket is created, wait for troubles.
	 */
	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);
	return child;

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
	if (!(flg & TCP_FLAG_RST))
		req->rsk_ops->send_reset(skb);

	inet_csk_reqsk_queue_drop(sk, req, prev);
	return NULL;
}

/*
 * Queue the segment on the new socket if the new socket is active
 * (owned by user context); otherwise we just short-circuit this and
 * continue with the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);

		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent, 0);
	} else {
		/* Alas, it is possible again, because we do a lookup
		 * in the main socket hash table and the lock on the
		 * listening socket no longer protects us.
		 */
		sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}

EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
EXPORT_SYMBOL(tcp_tw_deschedule);
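/* A minimal sketch of how the receive path uses tcp_check_req() and
 * tcp_child_process() together, modeled on tcp_v4_do_rcv() (abbreviated
 * and illustrative; the real caller reaches tcp_check_req() through its
 * request-sock lookup helper and has more error handling):
 *
 *	if (sk->sk_state == TCP_LISTEN) {
 *		struct sock *nsk = ...;	// tcp_check_req() on a matching req
 *
 *		if (nsk == NULL)	// segment consumed (dropped/ACKed)
 *			goto discard;
 *		if (nsk != sk) {	// handshake completed: new child
 *			if (tcp_child_process(sk, nsk, skb))
 *				goto reset;	// child asked for a reset
 *			return 0;
 *		}
 *	}
 */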