inet_connection_sock.c revision a019d6fe2b9da68ea4ba6cf3c4e86fc1dbf554c3
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>

#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };

static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct inet_hashinfo *hashinfo,
		      struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&hashinfo->portalloc_lock);
		if (hashinfo->port_rover < low)
			rover = low;
		else
			rover = hashinfo->port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		hashinfo->port_rover = rover;
		spin_unlock(&hashinfo->portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its lock.
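		 * (The bucket lock taken in the scan above stays held
		 *  from here on; the success path falls through to
		 *  fail_unlock, where it is finally released.)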
		 */
		snum = rover;
	} else {
		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

EXPORT_SYMBOL_GPL(inet_csk_get_port);

/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk->sk_sleep, &wait);
	return err;
}

/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
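	 * If the accept queue is empty and the socket is blocking,
	 * we sleep in inet_csk_wait_for_connect() below.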
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non-blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}

	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
	release_sock(sk);
	return newsk;
out_err:
	newsk = NULL;
	*err = error;
	goto out;
}

EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	init_timer(&icsk->icsk_retransmit_timer);
	init_timer(&icsk->icsk_delack_timer);
	init_timer(&sk->sk_timer);

	icsk->icsk_retransmit_timer.function = retransmit_handler;
	icsk->icsk_delack_timer.function = delack_handler;
	sk->sk_timer.function = keepalive_handler;

	icsk->icsk_retransmit_timer.data =
		icsk->icsk_delack_timer.data =
			sk->sk_timer.data = (unsigned long)sk;

	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}

EXPORT_SYMBOL(inet_csk_init_xmit_timers);

void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}

EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}

EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}

EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

struct dst_entry *inet_csk_route_req(struct sock *sk,
				     const struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options *opt = inet_rsk(req)->opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
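						  /* source route: go to
						   * the recorded first hop,
						   * not the destination */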
						  opt->faddr :
						  ireq->rmt_addr),
					.saddr = ireq->loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = sk->sk_protocol,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = ireq->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

EXPORT_SYMBOL_GPL(inet_csk_route_req);

static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
				 const u32 rnd, const u16 synq_hsize)
{
	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
}

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) 1
#endif

struct request_sock *inet_csk_search_req(const struct sock *sk,
					 struct request_sock ***prevp,
					 const __u16 rport, const __u32 raddr,
					 const __u32 laddr)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
						    lopt->nr_table_entries)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

EXPORT_SYMBOL_GPL(inet_csk_search_req);

void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   const unsigned timeout)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
				     lopt->hash_rnd, lopt->nr_table_entries);

	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
	inet_csk_reqsk_queue_added(sk, timeout);
}

/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;

EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

void inet_csk_reqsk_queue_prune(struct sock *parent,
				const unsigned long interval,
				const unsigned long timeout,
				const unsigned long max_rto)
{
	struct inet_connection_sock *icsk = inet_csk(parent);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct listen_sock *lopt = queue->listen_opt;
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first
	 * timeout. If a synack was not acknowledged for 3 seconds,
	 * it means one of the following things: the synack was lost,
	 * the ack was lost, rtt is high or nobody planned to ack
	 * (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with
	 * old open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to
	 * zero after several minutes of work.
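	 * (Each stale openreq lingers while its synack is retried
	 *  with exponential backoff, so a dead entry can occupy a
	 *  queue slot for minutes.)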
	 * This is not a synflood; it is normal operation. The
	 * solution is to prune entries that are too old, overriding
	 * the normal timeout, when the situation becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
		int young = (lopt->qlen_young << 1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	/* Visit enough buckets per run that the whole table is swept
	 * about twice per 'timeout' interval.
	 */
	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				if ((req->retrans < thresh ||
				     (inet_rsk(req)->acked && req->retrans < max_retries))
				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((timeout << req->retrans), max_rto);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				inet_csk_reqsk_queue_unlink(parent, req, reqp);
				reqsk_queue_removed(queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i + 1) & (lopt->nr_table_entries - 1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(parent, interval);
}

EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);

struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
			    const unsigned int __nocast priority)
{
	struct sock *newsk = sk_clone(sk, priority);

	if (newsk != NULL) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
		newsk->sk_write_space = sk_stream_write_space;

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
	}
	return newsk;
}

EXPORT_SYMBOL_GPL(inet_csk_clone);

/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all. Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
	BUG_TRAP(sk->sk_state == TCP_CLOSE);
	BUG_TRAP(sock_flag(sk, SOCK_DEAD));

	/* It cannot be in the hash table!
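	 * The caller is responsible for unhashing it first.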
	 */
	BUG_TRAP(sk_unhashed(sk));

	/* If it has a non-zero inet_sk(sk)->num, it must be bound */
	BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	atomic_dec(sk->sk_prot->orphan_count);
	sock_put(sk);
}

EXPORT_SYMBOL(inet_csk_destroy_sock);

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

	if (rc != 0)
		return rc;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* There is a race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters the hash table only
	 * after validation is complete.
	 */
	sk->sk_state = TCP_LISTEN;
	if (!sk->sk_prot->get_port(sk, inet->num)) {
		inet->sport = htons(inet->num);

		sk_dst_reset(sk);
		sk->sk_prot->hash(sk);

		return 0;
	}

	sk->sk_state = TCP_CLOSE;
	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
	return -EADDRINUSE;
}

EXPORT_SYMBOL_GPL(inet_csk_listen_start);

/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted.
 */
void inet_csk_listen_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock *acc_req;
	struct request_sock *req;

	inet_csk_delete_keepalive_timer(sk);

	/* make all the listen_opt local to us */
	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);

	/* Following the specs, it would be better either to send a FIN
	 * (and enter FIN-WAIT-1, it is a normal close)
	 * or to send an active reset (abort).
	 * Certainly, it is pretty dangerous during a synflood, but that
	 * is a bad justification for our negligence 8)
	 * To be honest, we are not able to make either
	 * of the variants now. --ANK
	 */
	reqsk_queue_destroy(&icsk->icsk_accept_queue);

	while ((req = acc_req) != NULL) {
		struct sock *child = req->sk;

		acc_req = req->dl_next;

		local_bh_disable();
		bh_lock_sock(child);
		BUG_TRAP(!sock_owned_by_user(child));
		sock_hold(child);

		sk->sk_prot->disconnect(child, O_NONBLOCK);

		sock_orphan(child);

		atomic_inc(sk->sk_prot->orphan_count);

		inet_csk_destroy_sock(child);

		bh_unlock_sock(child);
		local_bh_enable();
		sock_put(child);

		sk_acceptq_removed(sk);
		__reqsk_free(req);
	}
	BUG_TRAP(!sk->sk_ack_backlog);
}

EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
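/*
 * Illustrative usage sketch, not part of this file: a connection-
 * oriented protocol is expected to hook its timer callbacks up via
 * inet_csk_init_xmit_timers() when a socket is initialized.  TCP,
 * for instance, does roughly this from tcp_init_xmit_timers() in
 * net/ipv4/tcp_timer.c:
 *
 *	inet_csk_init_xmit_timers(sk, &tcp_write_timer,
 *				  &tcp_delack_timer,
 *				  &tcp_keepalive_timer);
 *
 * and undoes it with inet_csk_clear_xmit_timers() at teardown.
 */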