tcp_metrics.c revision 8a59359cb80f448923a7bc9f555d477e74547d7a
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

int sysctl_tcp_nometrics_save __read_mostly;

struct tcp_fastopen_metrics {
        u16 mss;
        u16 syn_loss:10;              /* Recurring Fast Open SYN losses */
        unsigned long last_syn_loss;  /* Last Fast Open SYN loss */
        struct tcp_fastopen_cookie cookie;
};

struct tcp_metrics_block {
        struct tcp_metrics_block __rcu *tcpm_next;
        struct inetpeer_addr           tcpm_saddr;
        struct inetpeer_addr           tcpm_daddr;
        unsigned long                  tcpm_stamp;
        u32                            tcpm_ts;
        u32                            tcpm_ts_stamp;
        u32                            tcpm_lock;
        u32                            tcpm_vals[TCP_METRIC_MAX + 1];
        struct tcp_fastopen_metrics    tcpm_fastopen;

        struct rcu_head                rcu_head;
};

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
                              enum tcp_metric_index idx)
{
        return tm->tcpm_lock & (1 << idx);
}

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
                          enum tcp_metric_index idx)
{
        return tm->tcpm_vals[idx];
}

static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
                                  enum tcp_metric_index idx)
{
        return msecs_to_jiffies(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
                           enum tcp_metric_index idx,
                           u32 val)
{
        tm->tcpm_vals[idx] = val;
}

static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
                                 enum tcp_metric_index idx,
                                 u32 val)
{
        tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}

static bool addr_same(const struct inetpeer_addr *a,
                      const struct inetpeer_addr *b)
{
        const struct in6_addr *a6, *b6;

        if (a->family != b->family)
                return false;
        if (a->family == AF_INET)
                return a->addr.a4 == b->addr.a4;

        a6 = (const struct in6_addr *) &a->addr.a6[0];
        b6 = (const struct in6_addr *) &b->addr.a6[0];

        return ipv6_addr_equal(a6, b6);
}

struct tcpm_hash_bucket {
        struct tcp_metrics_block __rcu *chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);
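/* (Re)initialize a cache entry from the routing metrics of @dst.
 * The lock bits mirror dst_metric_locked(), so locked route metrics
 * are never overwritten by values learned from live sessions.  Fast
 * Open state is cleared only for fresh entries; the periodic refresh
 * in tcpm_check_stamp() passes fastopen_clear == false to preserve a
 * still-valid cookie.
 */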
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
                          bool fastopen_clear)
{
        u32 val;

        tm->tcpm_stamp = jiffies;

        val = 0;
        if (dst_metric_locked(dst, RTAX_RTT))
                val |= 1 << TCP_METRIC_RTT;
        if (dst_metric_locked(dst, RTAX_RTTVAR))
                val |= 1 << TCP_METRIC_RTTVAR;
        if (dst_metric_locked(dst, RTAX_SSTHRESH))
                val |= 1 << TCP_METRIC_SSTHRESH;
        if (dst_metric_locked(dst, RTAX_CWND))
                val |= 1 << TCP_METRIC_CWND;
        if (dst_metric_locked(dst, RTAX_REORDERING))
                val |= 1 << TCP_METRIC_REORDERING;
        tm->tcpm_lock = val;

        tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
        tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
        tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
        tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
        tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
        tm->tcpm_ts = 0;
        tm->tcpm_ts_stamp = 0;
        if (fastopen_clear) {
                tm->tcpm_fastopen.mss = 0;
                tm->tcpm_fastopen.syn_loss = 0;
                tm->tcpm_fastopen.cookie.len = 0;
        }
}

static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
                                          struct inetpeer_addr *saddr,
                                          struct inetpeer_addr *daddr,
                                          unsigned int hash,
                                          bool reclaim)
{
        struct tcp_metrics_block *tm;
        struct net *net;

        spin_lock_bh(&tcp_metrics_lock);
        net = dev_net(dst->dev);
        if (unlikely(reclaim)) {
                struct tcp_metrics_block *oldest;

                oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
                for (tm = rcu_dereference(oldest->tcpm_next); tm;
                     tm = rcu_dereference(tm->tcpm_next)) {
                        if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
                                oldest = tm;
                }
                tm = oldest;
        } else {
                tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
                if (!tm)
                        goto out_unlock;
        }
        tm->tcpm_saddr = *saddr;
        tm->tcpm_daddr = *daddr;

        tcpm_suck_dst(tm, dst, true);

        if (likely(!reclaim)) {
                tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
                rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
        }

out_unlock:
        spin_unlock_bh(&tcp_metrics_lock);
        return tm;
}

#define TCP_METRICS_TIMEOUT     (60 * 60 * HZ)

static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
        if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
                tcpm_suck_dst(tm, dst, false);
}

#define TCP_METRICS_RECLAIM_DEPTH       5
#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL

static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
        if (tm)
                return tm;
        if (depth > TCP_METRICS_RECLAIM_DEPTH)
                return TCP_METRICS_RECLAIM_PTR;
        return NULL;
}
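/* Chains are hashed on the destination address only; a chain walk
 * then matches on the full (saddr, daddr) pair.  When a lookup misses
 * on a chain deeper than TCP_METRICS_RECLAIM_DEPTH, tcp_get_encode()
 * returns the TCP_METRICS_RECLAIM_PTR sentinel instead of NULL, so
 * the caller recycles the oldest entry in the bucket rather than
 * letting the chain grow further.
 */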
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
                                                   const struct inetpeer_addr *daddr,
                                                   struct net *net, unsigned int hash)
{
        struct tcp_metrics_block *tm;
        int depth = 0;

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, saddr) &&
                    addr_same(&tm->tcpm_daddr, daddr))
                        break;
                depth++;
        }
        return tcp_get_encode(tm, depth);
}

static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
                                                       struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        saddr.family = req->rsk_ops->family;
        daddr.family = req->rsk_ops->family;
        switch (daddr.family) {
        case AF_INET:
                saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
                daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
                hash = (__force unsigned int) daddr.addr.a4;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
                *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
                hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
                break;
#endif
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, &saddr) &&
                    addr_same(&tm->tcpm_daddr, &daddr))
                        break;
        }
        tcpm_check_stamp(tm, dst);
        return tm;
}

static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        saddr.family = tw->tw_family;
        daddr.family = tw->tw_family;
        switch (daddr.family) {
        case AF_INET:
                saddr.addr.a4 = tw->tw_rcv_saddr;
                daddr.addr.a4 = tw->tw_daddr;
                hash = (__force unsigned int) daddr.addr.a4;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
                *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
                hash = ipv6_addr_hash(&tw->tw_v6_daddr);
                break;
#endif
        default:
                return NULL;
        }

        net = twsk_net(tw);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, &saddr) &&
                    addr_same(&tm->tcpm_daddr, &daddr))
                        break;
        }
        return tm;
}
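/* Look up (and optionally create) the entry for a full socket.  A
 * TCP_METRICS_RECLAIM_PTR result from the lookup is translated into
 * reclaim == true, which makes tcpm_new() reuse the oldest entry in
 * the bucket instead of allocating a new one.
 */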
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
                                                 struct dst_entry *dst,
                                                 bool create)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;
        bool reclaim;

        saddr.family = sk->sk_family;
        daddr.family = sk->sk_family;
        switch (daddr.family) {
        case AF_INET:
                saddr.addr.a4 = inet_sk(sk)->inet_saddr;
                daddr.addr.a4 = inet_sk(sk)->inet_daddr;
                hash = (__force unsigned int) daddr.addr.a4;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
                *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
                hash = ipv6_addr_hash(&sk->sk_v6_daddr);
                break;
#endif
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
        reclaim = false;
        if (tm == TCP_METRICS_RECLAIM_PTR) {
                reclaim = true;
                tm = NULL;
        }
        if (!tm && create)
                tm = tcpm_new(dst, &saddr, &daddr, hash, reclaim);
        else
                tcpm_check_stamp(tm, dst);

        return tm;
}

/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        unsigned long rtt;
        u32 val;
        int m;

        if (sysctl_tcp_nometrics_save || !dst)
                return;

        if (dst->flags & DST_HOST)
                dst_confirm(dst);

        rcu_read_lock();
        if (icsk->icsk_backoff || !tp->srtt) {
                /* This session failed to estimate rtt.  Why?
                 * Probably no packets returned in time.  Reset our
                 * results.
                 */
                tm = tcp_get_metrics(sk, dst, false);
                if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
                        tcp_metric_set(tm, TCP_METRIC_RTT, 0);
                goto out_unlock;
        } else
                tm = tcp_get_metrics(sk, dst, true);

        if (!tm)
                goto out_unlock;

        rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
        m = rtt - tp->srtt;

        /* If the newly calculated rtt is larger than the stored one,
         * store the new one.  Otherwise, use EWMA.  Remember, rtt
         * overestimation is always better than underestimation.
         */
        if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
                if (m <= 0)
                        rtt = tp->srtt;
                else
                        rtt -= (m >> 3);
                tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
        }

        if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
                unsigned long var;

                if (m < 0)
                        m = -m;

                /* Scale deviation to rttvar fixed point */
                m >>= 1;
                if (m < tp->mdev)
                        m = tp->mdev;

                var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
                if (m >= var)
                        var = m;
                else
                        var -= (var - m) >> 2;

                tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
        }

        if (tcp_in_initial_slowstart(tp)) {
                /* Slow start still did not finish. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && (tp->snd_cwnd >> 1) > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_cwnd >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        if (tp->snd_cwnd > val)
                                tcp_metric_set(tm, TCP_METRIC_CWND,
                                               tp->snd_cwnd);
                }
        } else if (tp->snd_cwnd > tp->snd_ssthresh &&
                   icsk->icsk_ca_state == TCP_CA_Open) {
                /* Cong. avoidance phase, cwnd is reliable. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
                        tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
                }
        } else {
                /* Else slow start did not finish, cwnd is not
                 * reliable, and ssthresh may also be invalid.
                 */
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND,
                                       (val + tp->snd_ssthresh) >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && tp->snd_ssthresh > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_ssthresh);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
                        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
                        if (val < tp->reordering &&
                            tp->reordering != sysctl_tcp_reordering)
                                tcp_metric_set(tm, TCP_METRIC_REORDERING,
                                               tp->reordering);
                }
        }
        tm->tcpm_stamp = jiffies;
out_unlock:
        rcu_read_unlock();
}
/* Initialize metrics on socket. */
void tcp_init_metrics(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        u32 val, crtt = 0; /* cached RTT scaled by 8 */

        if (dst == NULL)
                goto reset;

        dst_confirm(dst);

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (!tm) {
                rcu_read_unlock();
                goto reset;
        }

        if (tcp_metric_locked(tm, TCP_METRIC_CWND))
                tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
        if (val) {
                tp->snd_ssthresh = val;
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        } else {
                /* ssthresh may have been reduced unnecessarily during
                 * 3WHS.  Restore it back to its initial default.
                 */
                tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        }
        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
        if (val && tp->reordering != val) {
                tcp_disable_fack(tp);
                tcp_disable_early_retrans(tp);
                tp->reordering = val;
        }

        crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
        rcu_read_unlock();
reset:
        /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
         * to seed the RTO for later data packets because SYN packets are
         * small.  Use the per-dst cached values to seed the RTO but keep
         * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
         * Later the RTO will be updated immediately upon obtaining the first
         * data RTT sample (tcp_rtt_estimator()).  Hence the cached RTT only
         * influences the first RTO but not later RTT estimation.
         *
         * But if RTT is not available from the SYN (due to retransmits or
         * syn cookies) or the cache, force a conservative 3secs timeout.
         *
         * A bit of theory.  RTT is the time that passes after a "normal"
         * sized packet is sent until it is ACKed.  In normal circumstances,
         * sending small packets forces the peer to delay ACKs, so the
         * calculation is correct there too.  The algorithm is adaptive and,
         * provided we follow the specs, it NEVER underestimates RTT.  BUT!
         * If the peer plays clever tricks, sort of "quick acks", for long
         * enough to drive RTT down to a low value, and then abruptly stops
         * and starts delaying ACKs, expect trouble.
         */
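        /* Worked example (assuming the default 200ms tcp_rto_min): the
         * cached RTT is stored scaled by 8, like srtt, so for a 200ms
         * cached RTT the crtt >>= 3 below recovers 200ms, and the seeded
         * RTO becomes 200ms + max(2 * 200ms, 200ms) = 600ms -- the same
         * shape as tcp_rtt_estimator(), without touching srtt/mdev.
         */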
        if (crtt > tp->srtt) {
                /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
                crtt >>= 3;
                inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
        } else if (tp->srtt == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
                 * 3WHS.  This is most likely due to retransmission,
                 * including spurious ones.  Reset the RTO back to 3secs
                 * from the more aggressive 1sec to avoid more spurious
                 * retransmission.
                 */
                tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
                inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
        }
        /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
         * retransmitted.  In light of RFC6298's more aggressive 1sec
         * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
         * retransmission has occurred.
         */
        if (tp->total_retrans > 1)
                tp->snd_cwnd = 1;
        else
                tp->snd_cwnd = tcp_init_cwnd(tp, dst);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
{
        struct tcp_metrics_block *tm;
        bool ret;

        if (!dst)
                return false;

        rcu_read_lock();
        tm = __tcp_get_metrics_req(req, dst);
        if (paws_check) {
                if (tm &&
                    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
                    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
                        ret = false;
                else
                        ret = true;
        } else {
                if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
                        ret = true;
                else
                        ret = false;
        }
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_peer_is_proven);

void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_sock *tp = tcp_sk(sk);

                if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
                        tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
                        tp->rx_opt.ts_recent = tm->tcpm_ts;
                }
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);

/* VJ's idea.  Save last timestamp seen from this destination and hold
 * it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */
bool tcp_remember_stamp(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        bool ret = false;

        if (dst) {
                struct tcp_metrics_block *tm;

                rcu_read_lock();
                tm = tcp_get_metrics(sk, dst, true);
                if (tm) {
                        struct tcp_sock *tp = tcp_sk(sk);

                        if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
                            ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                             tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
                                tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
                                tm->tcpm_ts = tp->rx_opt.ts_recent;
                        }
                        ret = true;
                }
                rcu_read_unlock();
        }
        return ret;
}

bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct tcp_metrics_block *tm;
        bool ret = false;

        rcu_read_lock();
        tm = __tcp_get_metrics_tw(tw);
        if (tm) {
                const struct tcp_timewait_sock *tcptw;
                struct sock *sk = (struct sock *) tw;

                tcptw = tcp_twsk(sk);
                if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
                    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
                        tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
                        tm->tcpm_ts = tcptw->tw_ts_recent;
                }
                ret = true;
        }
        rcu_read_unlock();

        return ret;
}
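/* Fast Open state is wider than a single word, so readers snapshot it
 * under a seqlock: tcp_fastopen_cache_get() retries until it sees a
 * consistent copy, while writers update under write_seqlock_bh().
 */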
static DEFINE_SEQLOCK(fastopen_seqlock);

void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie,
                            int *syn_loss, unsigned long *last_syn_loss)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        if (tfom->mss)
                                *mss = tfom->mss;
                        *cookie = tfom->cookie;
                        *syn_loss = tfom->syn_loss;
                        *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
                } while (read_seqretry(&fastopen_seqlock, seq));
        }
        rcu_read_unlock();
}

void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_metrics_block *tm;

        if (!dst)
                return;
        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

                write_seqlock_bh(&fastopen_seqlock);
                if (mss)
                        tfom->mss = mss;
                if (cookie && cookie->len > 0)
                        tfom->cookie = *cookie;
                if (syn_lost) {
                        ++tfom->syn_loss;
                        tfom->last_syn_loss = jiffies;
                } else
                        tfom->syn_loss = 0;
                write_sequnlock_bh(&fastopen_seqlock);
        }
        rcu_read_unlock();
}
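/* Generic netlink interface for inspecting and flushing the cache.
 * Userspace speaks the "tcp_metrics" genl family; iproute2's
 * "ip tcp_metrics show" and "ip tcp_metrics flush", for example, are
 * built on the GET and DEL commands registered below.
 */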
static struct genl_family tcp_metrics_nl_family = {
        .id             = GENL_ID_GENERATE,
        .hdrsize        = 0,
        .name           = TCP_METRICS_GENL_NAME,
        .version        = TCP_METRICS_GENL_VERSION,
        .maxattr        = TCP_METRICS_ATTR_MAX,
        .netnsok        = true,
};

static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
        [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
                                            .len = sizeof(struct in6_addr), },
        /* The following attributes are not received for GET/DEL;
         * we keep them for reference.
         */
#if 0
        [TCP_METRICS_ATTR_AGE]          = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_TW_TSVAL]     = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_TW_TS_STAMP]  = { .type = NLA_S32, },
        [TCP_METRICS_ATTR_VALS]         = { .type = NLA_NESTED, },
        [TCP_METRICS_ATTR_FOPEN_MSS]    = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]      = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]    = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
                                            .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure */
static int tcp_metrics_fill_info(struct sk_buff *msg,
                                 struct tcp_metrics_block *tm)
{
        struct nlattr *nest;
        int i;

        switch (tm->tcpm_daddr.family) {
        case AF_INET:
                if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
                                 tm->tcpm_daddr.addr.a4) < 0)
                        goto nla_put_failure;
                if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
                                 tm->tcpm_saddr.addr.a4) < 0)
                        goto nla_put_failure;
                break;
        case AF_INET6:
                if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
                            tm->tcpm_daddr.addr.a6) < 0)
                        goto nla_put_failure;
                if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
                            tm->tcpm_saddr.addr.a6) < 0)
                        goto nla_put_failure;
                break;
        default:
                return -EAFNOSUPPORT;
        }

        if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
                          jiffies - tm->tcpm_stamp) < 0)
                goto nla_put_failure;
        if (tm->tcpm_ts_stamp) {
                if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
                                (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
                        goto nla_put_failure;
                if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
                                tm->tcpm_ts) < 0)
                        goto nla_put_failure;
        }

        {
                int n = 0;

                nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
                if (!nest)
                        goto nla_put_failure;
                for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
                        if (!tm->tcpm_vals[i])
                                continue;
                        if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
                                goto nla_put_failure;
                        n++;
                }
                if (n)
                        nla_nest_end(msg, nest);
                else
                        nla_nest_cancel(msg, nest);
        }

        {
                struct tcp_fastopen_metrics tfom_copy[1], *tfom;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        tfom_copy[0] = tm->tcpm_fastopen;
                } while (read_seqretry(&fastopen_seqlock, seq));

                tfom = tfom_copy;
                if (tfom->mss &&
                    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
                                tfom->mss) < 0)
                        goto nla_put_failure;
                if (tfom->syn_loss &&
                    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
                                 tfom->syn_loss) < 0 ||
                     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
                                   jiffies - tfom->last_syn_loss) < 0))
                        goto nla_put_failure;
                if (tfom->cookie.len > 0 &&
                    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
                            tfom->cookie.len, tfom->cookie.val) < 0)
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static int tcp_metrics_dump_info(struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 struct tcp_metrics_block *tm)
{
        void *hdr;

        hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                          &tcp_metrics_nl_family, NLM_F_MULTI,
                          TCP_METRICS_CMD_GET);
        if (!hdr)
                return -EMSGSIZE;

        if (tcp_metrics_fill_info(skb, tm) < 0)
                goto nla_put_failure;

        return genlmsg_end(skb, hdr);

nla_put_failure:
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

static int tcp_metrics_nl_dump(struct sk_buff *skb,
                               struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        unsigned int row, s_row = cb->args[0];
        int s_col = cb->args[1], col = s_col;

        for (row = s_row; row < max_rows; row++, s_col = 0) {
                struct tcp_metrics_block *tm;
                struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;

                rcu_read_lock();
                for (col = 0, tm = rcu_dereference(hb->chain); tm;
                     tm = rcu_dereference(tm->tcpm_next), col++) {
                        if (col < s_col)
                                continue;
                        if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
                                rcu_read_unlock();
                                goto done;
                        }
                }
                rcu_read_unlock();
        }

done:
        cb->args[0] = row;
        cb->args[1] = col;
        return skb->len;
}
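/* Parse the destination address attribute from a GET/DEL request.
 * Returns 0 with *addr and *hash filled in, a positive value when no
 * address attribute is present and @optional is set (the DEL handler
 * treats this as "flush everything"), or a negative errno.
 */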
static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                         unsigned int *hash, int optional)
{
        struct nlattr *a;

        a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
        if (a) {
                addr->family = AF_INET;
                addr->addr.a4 = nla_get_be32(a);
                *hash = (__force unsigned int) addr->addr.a4;
                return 0;
        }
        a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
        if (a) {
                if (nla_len(a) != sizeof(struct in6_addr))
                        return -EINVAL;
                addr->family = AF_INET6;
                memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
                *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
                return 0;
        }
        return optional ? 1 : -EAFNOSUPPORT;
}

static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr daddr;
        unsigned int hash;
        struct sk_buff *msg;
        struct net *net = genl_info_net(info);
        void *reply;
        int ret;

        ret = parse_nl_addr(info, &daddr, &hash, 0);
        if (ret < 0)
                return ret;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
                                  info->genlhdr->cmd);
        if (!reply)
                goto nla_put_failure;

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        ret = -ESRCH;
        rcu_read_lock();
        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_daddr, &daddr)) {
                        ret = tcp_metrics_fill_info(msg, tm);
                        break;
                }
        }
        rcu_read_unlock();
        if (ret < 0)
                goto out_free;

        genlmsg_end(msg, reply);
        return genlmsg_reply(msg, info);

nla_put_failure:
        ret = -EMSGSIZE;

out_free:
        nlmsg_free(msg);
        return ret;
}

#define deref_locked_genl(p)    \
        rcu_dereference_protected(p, lockdep_genl_is_held() && \
                                     lockdep_is_held(&tcp_metrics_lock))

#define deref_genl(p)   rcu_dereference_protected(p, lockdep_genl_is_held())

static int tcp_metrics_flush_all(struct net *net)
{
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
        struct tcp_metrics_block *tm;
        unsigned int row;

        for (row = 0; row < max_rows; row++, hb++) {
                spin_lock_bh(&tcp_metrics_lock);
                tm = deref_locked_genl(hb->chain);
                if (tm)
                        hb->chain = NULL;
                spin_unlock_bh(&tcp_metrics_lock);
                while (tm) {
                        struct tcp_metrics_block *next;

                        next = deref_genl(tm->tcpm_next);
                        kfree_rcu(tm, rcu_head);
                        tm = next;
                }
        }
        return 0;
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct tcpm_hash_bucket *hb;
        struct tcp_metrics_block *tm;
        struct tcp_metrics_block __rcu **pp;
        struct inetpeer_addr daddr;
        unsigned int hash;
        struct net *net = genl_info_net(info);
        int ret;

        ret = parse_nl_addr(info, &daddr, &hash, 1);
        if (ret < 0)
                return ret;
        if (ret > 0)
                return tcp_metrics_flush_all(net);

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        hb = net->ipv4.tcp_metrics_hash + hash;
        pp = &hb->chain;
        spin_lock_bh(&tcp_metrics_lock);
        for (tm = deref_locked_genl(*pp); tm;
             pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
                if (addr_same(&tm->tcpm_daddr, &daddr)) {
                        *pp = tm->tcpm_next;
                        break;
                }
        }
        spin_unlock_bh(&tcp_metrics_lock);
        if (!tm)
                return -ESRCH;
        kfree_rcu(tm, rcu_head);
        return 0;
}

static const struct genl_ops tcp_metrics_nl_ops[] = {
        {
                .cmd = TCP_METRICS_CMD_GET,
                .doit = tcp_metrics_nl_cmd_get,
                .dumpit = tcp_metrics_nl_dump,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = TCP_METRICS_CMD_DEL,
                .doit = tcp_metrics_nl_cmd_del,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
};
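/* The hash table size defaults to 8K or 16K buckets depending on
 * available memory (see tcp_net_metrics_init() below) and can be
 * overridden at boot, e.g. "tcpmhash_entries=4096" on the kernel
 * command line; the value is rounded up to a power of two.
 */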
static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
        ssize_t ret;

        if (!str)
                return 0;

        ret = kstrtouint(str, 0, &tcpmhash_entries);
        if (ret)
                return 0;

        return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);

static int __net_init tcp_net_metrics_init(struct net *net)
{
        size_t size;
        unsigned int slots;

        slots = tcpmhash_entries;
        if (!slots) {
                if (totalram_pages >= 128 * 1024)
                        slots = 16 * 1024;
                else
                        slots = 8 * 1024;
        }

        net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
        size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;

        net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
        if (!net->ipv4.tcp_metrics_hash)
                net->ipv4.tcp_metrics_hash = vzalloc(size);

        if (!net->ipv4.tcp_metrics_hash)
                return -ENOMEM;

        return 0;
}

static void __net_exit tcp_net_metrics_exit(struct net *net)
{
        unsigned int i;

        for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log); i++) {
                struct tcp_metrics_block *tm, *next;

                tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
                while (tm) {
                        next = rcu_dereference_protected(tm->tcpm_next, 1);
                        kfree(tm);
                        tm = next;
                }
        }
        if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
                vfree(net->ipv4.tcp_metrics_hash);
        else
                kfree(net->ipv4.tcp_metrics_hash);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
        .init = tcp_net_metrics_init,
        .exit = tcp_net_metrics_exit,
};

void __init tcp_metrics_init(void)
{
        int ret;

        ret = register_pernet_subsys(&tcp_net_metrics_ops);
        if (ret < 0)
                goto cleanup;
        ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
                                            tcp_metrics_nl_ops);
        if (ret < 0)
                goto cleanup_subsys;
        return;

cleanup_subsys:
        unregister_pernet_subsys(&tcp_net_metrics_ops);

cleanup:
        return;
}