tcp_minisocks.c revision 81166dd6fa8eb780b2132d32fbc77eb6ac04e44e
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

int sysctl_tcp_abort_on_overflow __read_mostly;

/* Global book-keeping for TIME-WAIT sockets: bucket limit, reaper timers
 * and the work item that batches timewait kills.  Shared by all of TCP.
 */
struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
	.hashinfo	= &tcp_hashinfo,
	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
					    (unsigned long)&tcp_death_row),
	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
					     inet_twdr_twkill_work),
/* Short-time timewait calendar */

	.twcal_hand	= -1,
	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
					    (unsigned long)&tcp_death_row),
};
EXPORT_SYMBOL_GPL(tcp_death_row);

/* RFC 793 "SEGMENT ARRIVES" acceptability test: does the sequence range
 * [seq, end_seq] intersect the receive window [s_win, e_win]?  The first
 * test accepts a segment starting exactly at the left window edge (this
 * also covers the zero-length, zero-window case); the last test accepts
 * a zero-length segment sitting exactly at the right edge.
 */
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

/*
 *
 * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc.  --ANK
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	bool paws_reject = false;

	tmp_opt.saw_tstamp = 0;
	/* Only bother parsing options if the header can carry any (doff
	 * beyond the bare header) and we have a stored timestamp to run
	 * the PAWS check against.
	 */
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		if (th->rst)
			goto kill;

		/* SYN at or above rcv_nxt: invalid in FIN-WAIT-2, answer
		 * with RST and tear the bucket down.
		 */
		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		/* With tw_recycle and a usable remembered stamp we may use
		 * the shorter per-socket timeout, otherwise the full
		 * TCP_TIMEWAIT_LEN.
		 */
		if (tcp_death_row.sysctl_tw_recycle &&
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 * Now real TIME-WAIT state.
	 *
	 * RFC 1122:
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1)  assigns its initial sequence number for the new
	 * connection to be larger than the largest sequence
	 * number it used on the previous connection incarnation,
	 * and
	 *
	 * (2)  returns to TIME-WAIT state if the SYN turns out
	 * to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw, &tcp_death_row);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All the segments are ACKed immediately.

	   The only exception is new SYN. We accept it, if it is
	   not old duplicate and we are not in danger to be killed
	   by delayed old duplicates. RFC check is that it has
	   newer sequence number works at rates <40Mbit/sec.
	   However, if paws works, it is reliable AND even more,
	   we even may relax silly seq space cutoff.

	   RED-PEN: we violate main RFC requirement, if this SYN will appear
	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
	   we must return socket to time-wait state. It is not good,
	   but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
		/* Reopen: pick an ISN above anything used by the previous
		 * incarnation (per RFC 1122 clause (1) above) and stash it
		 * in the cb for the caller to use; avoid an ISN of zero.
		 */
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is ACKless SYN it may be both old duplicate
		 * and new good SYN with random sequence number <rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	bool recycle_ok = false;
	bool recycle_on = false;

	/* tw_recycle: remember the peer's timestamp so a shortened
	 * TIME-WAIT interval can still reject old duplicates.
	 */
	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
		recycle_ok = tcp_remember_stamp(sk);
		recycle_on = true;
	}

	/* Refuse to allocate a bucket past the configured limit. */
	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
		/* 4*RTO - RTO/2 == 3.5 * RTO, the minimum timewait period. */
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
		struct inet_sock *inet = inet_sk(sk);
		struct inet_peer *peer = NULL;

		/* Snapshot the state the timewait bucket needs to answer
		 * further segments on this connection.
		 */
		tw->tw_transparent	= inet->transparent;
		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);
			struct inet6_timewait_sock *tw6;

			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
			tw6 = inet6_twsk((struct sock *)tw);
			tw6->tw_v6_daddr = np->daddr;
			tw6->tw_v6_rcv_saddr = np->rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_ipv6only = np->ipv6only;
		}
#endif

		/* The bucket holds its own reference on the inetpeer. */
		if (recycle_on)
			peer = icsk->icsk_af_ops->get_peer(sk);
		tcptw->tw_peer = peer;
		if (peer)
			atomic_inc(&peer->refcnt);

#ifdef CONFIG_TCP_MD5SIG
		/*
		 * The timewait bucket does not have the key DB from the
		 * sock structure. We just make a quick copy of the
		 * md5 key being used (if indeed we are using one)
		 * so the timewait ack generating code has the key.
		 */
		do {
			struct tcp_md5sig_key *key;
			tcptw->tw_md5_key = NULL;
			key = tp->af_specific->md5_lookup(sk, sk);
			if (key != NULL) {
				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
				if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL)
					BUG();
			}
		} while (0);
#endif

		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, &tcp_death_row, timeo,
				   TCP_TIMEWAIT_LEN);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

/* Release the resources a timewait bucket pinned in tcp_time_wait():
 * the inetpeer reference and (optionally) the copied MD5 key.
 */
void tcp_twsk_destructor(struct sock *sk)
{
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);

	if (twsk->tw_peer)
		inet_putpeer(twsk->tw_peer);
#ifdef CONFIG_TCP_MD5SIG
	if (twsk->tw_md5_key) {
		tcp_free_md5sig_pool();
		kfree_rcu(twsk->tw_md5_key, rcu);
	}
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

/* Propagate the listener's negotiated ECN capability to the child. */
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
					 struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

	if (newsk != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp = tcp_sk(newsk);
		struct tcp_sock *oldtp = tcp_sk(sk);
		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;

		newsk->sk_rx_dst = dst_clone(skb_dst(skb));

		/* TCP Cookie Transactions require space for the cookie pair,
		 * as it differs for each connection.  There is no need to
		 * copy any s_data_payload stored at the original socket.
		 * Failure will prevent resuming the connection.
		 *
		 * Presumed copied, in order of appearance:
		 *	cookie_in_always, cookie_out_never
		 */
		if (oldcvp != NULL) {
			struct tcp_cookie_values *newcvp =
				kzalloc(sizeof(*newtp->cookie_values),
					GFP_ATOMIC);

			if (newcvp != NULL) {
				kref_init(&newcvp->kref);
				newcvp->cookie_desired =
						oldcvp->cookie_desired;
				newtp->cookie_values = newcvp;
			} else {
				/* Not Yet Implemented */
				newtp->cookie_values = NULL;
			}
		}

		/* Now setup tcp_sock */
		newtp->pred_flags = 0;

		/* Receive side starts just past the peer's ISN... */
		newtp->rcv_wup = newtp->copied_seq =
		newtp->rcv_nxt = treq->rcv_isn + 1;

		/* ...and send side just past our SYN-ACK (plus any cookie
		 * s_data payload).
		 */
		newtp->snd_sml = newtp->snd_una =
		newtp->snd_nxt = newtp->snd_up =
			treq->snt_isn + 1 + tcp_s_data_size(oldtp);

		tcp_prequeue_init(newtp);

		tcp_init_wl(newtp, treq->rcv_isn);

		/* No RTT sample yet; start from the conservative default. */
		newtp->srtt = 0;
		newtp->mdev = TCP_TIMEOUT_INIT;
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
		tcp_enable_early_retrans(newtp);

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = TCP_INIT_CWND;
		newtp->snd_cwnd_cnt = 0;
		newtp->bytes_acked = 0;

		newtp->frto_counter = 0;
		newtp->frto_highmark = 0;

		/* Pin the inherited congestion-control module, falling back
		 * to the built-in one if the module reference cannot be taken.
		 */
		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
		    !try_module_get(newicsk->icsk_ca_ops->owner))
			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

		tcp_set_ca_state(newsk, TCP_CA_Open);
		tcp_init_xmit_timers(newsk);
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->write_seq = newtp->pushed_seq =
			treq->snt_isn + 1 + tcp_s_data_size(oldtp);

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.num_sacks = 0;

		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		/* Adopt the options negotiated during the handshake. */
		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				tcp_enable_fack(newtp);
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
				  newtp->rx_opt.snd_wscale);
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = get_seconds();
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
#ifdef CONFIG_TCP_MD5SIG
		newtp->md5sig_info = NULL;	/*XXX*/
		if (newtp->af_specific->md5_lookup(sk, newsk))
			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		TCP_ECN_openreq_child(newtp, req);

		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 *	Process an incoming packet for SYN_RECV sockets represented
 *	as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev)
{
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store true stamp, but it is not required,
			 * it can be estimated (approximately)
			 * from another data.
			 */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe SYN-RECV state. All the description
		 *  is wrong, we cannot believe to it and should
		 *  rely only on common sense and implementation
		 *  experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 */
		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however, it does not work only
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes.  So
	   does sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK.  Otherwise, we create an established connection.  Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless, and rare.  Possibility is about the
	   same as us discovering intelligent life on another planet tomorrow.

	   But generally, we should (RFC lies!) to accept ACK
	   from SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before attempt to create socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 * and the incoming segment acknowledges something not yet
	 * sent (the segment carries an unacceptable ACK) ...
	 * a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket
	 */
	if ((flg & TCP_FLAG_ACK) &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
		return sk;

	/* Also, it would be not so bad idea to check rcv_tsecr, which
	 * is essentially ACK extension and too early or too late values
	 * should cause reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}
	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
	else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
		tcp_rsk(req)->snt_synack = 0;

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	/* Move the request from the SYN queue onto the accept queue. */
	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);
	return child;

listen_overflow:
	/* Accept queue full: unless the sysctl demands an abort, keep the
	 * request around (marking it acked) and hope for a retransmit.
	 */
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	if (!(flg & TCP_FLAG_RST))
		req->rsk_ops->send_reset(sk, skb);

	inet_csk_reqsk_queue_drop(sk, req, prev);
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	/* Snapshot the state before processing so the SYN_RECV ->
	 * ESTABLISHED transition can be detected below.
	 */
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					    skb->len);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent, 0);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
EXPORT_SYMBOL(tcp_child_process);