tcp_ipv4.c revision 0b040829952d84bf2a62526f0e24b624e0699447

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used as
	   the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying a new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, f.e., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(len, inet->saddr,
					  inet->daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, and if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP. So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb->rtable->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_key tw_key;
#endif

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * The SKB holds an incoming packet, but may not have a valid ->sk
	 * pointer. This is especially the case when we're dealing with a
	 * TIME_WAIT ack, because the sk structure is long gone, and only
	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
	 * structure, and we use it in preference.  I believe that (twsk ||
	 * skb->sk) holds true, but we program defensively.
	 */
	if (!twsk && skb->sk) {
		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
	} else if (twsk && twsk->tw_md5_keylen) {
		tw_key.key = twsk->tw_md5_key;
		tw_key.keylen = twsk->tw_md5_keylen;
		key = &tw_key;
	} else
		key = NULL;

	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (twsk)
		arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

	ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
				struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	return __tcp_v4_send_synack(sk, req, NULL);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(tcp_hdr(skb)->dest));
	}
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		}
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of keys,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen)
{
	struct scatterlist sg[4];
	__u16 data_len;
	int block = 0;
	__sum16 old_checksum;
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	struct hash_desc *desc;
	int err;
	unsigned int nbytes = 0;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;
	desc = &hp->md5_desc;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = protocol;
	bp->len = htons(tcplen);

	sg_init_table(sg, 4);

	sg_set_buf(&sg[block++], bp, sizeof(*bp));
	nbytes += sizeof(*bp);

	/* 2. the TCP header, excluding options, and assuming a
	 * checksum of zero.
	 */
	old_checksum = th->check;
	th->check = 0;
	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
	nbytes += sizeof(struct tcphdr);

	/* 3. the TCP segment data (if any) */
	data_len = tcplen - (th->doff << 2);
	if (data_len > 0) {
		unsigned char *data = (unsigned char *)th + (th->doff << 2);
		sg_set_buf(&sg[block++], data, data_len);
		nbytes += data_len;
	}
	/* 4. an independently-specified key or password, known to both
	 * TCPs and presumably connection-specific
	 */
	sg_set_buf(&sg[block++], key->key, key->keylen);
	nbytes += key->keylen;

	sg_mark_end(&sg[block - 1]);

	/* Now store the Hash into the packet */
	err = crypto_hash_init(desc);
	if (err)
		goto clear_hash;
	err = crypto_hash_update(desc, sg, nbytes);
	if (err)
		goto clear_hash;
	err = crypto_hash_final(desc, md5_hash);
	if (err)
		goto clear_hash;

	/* Reset header, and free up the crypto */
	tcp_put_md5sig_pool();
	th->check = old_checksum;

out:
	return 0;
clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	goto out;
}

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
			 struct sock *sk,
			 struct dst_entry *dst,
			 struct request_sock *req,
			 struct tcphdr *th, int protocol,
			 unsigned int tcplen)
{
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->saddr;
		daddr = inet_sk(sk)->daddr;
	} else {
		struct rtable *rt = (struct rtable *)dst;
		BUG_ON(!rt);
		saddr = rt->rt_src;
		daddr = rt->rt_dst;
	}
	return tcp_v4_do_calc_md5_hash(md5_hash, key,
				       saddr, daddr,
				       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int length = (th->doff << 2) - sizeof(struct tcphdr);
	int genhash;
	unsigned char *ptr;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

	/*
	 * If the TCP option length is less than the TCP_MD5SIG
	 * option length, then we can shortcut
	 */
	if (length < TCPOLEN_MD5SIG) {
		if (hash_expected)
			return 1;
		else
			return 0;
	}

	/* Okay, we can't shortcut - we have to grub through the options */
	ptr = (unsigned char *)(th + 1);
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			goto done_opts;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2)
				goto done_opts;
			if (opsize > length)
				goto done_opts;

			if (opcode == TCPOPT_MD5SIG) {
				hash_location = ptr;
				goto done_opts;
			}
		}
		ptr += opsize - 2;
		length -= opsize;
	}
done_opts:
	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	if (!hash_expected && hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_do_calc_md5_hash(newhash,
					  hash_expected,
					  iph->saddr, iph->daddr,
					  th, sk->sk_protocol,
					  skb->len);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on a web server
		 * which contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
		req->cookie_ts = tmp_opt.tstamp_ok;
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered at the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from " NIPQUAD_FMT "/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		   = tcp_sk(newsk);
	newinet		   = inet_sk(newsk);
	ireq		   = inet_rsk(req);
	newinet->daddr	   = ireq->rmt_addr;
	newinet->rcv_saddr = ireq->loc_addr;
	newinet->saddr	   = ireq->loc_addr;
	newinet->opt	   = ireq->opt;
	ireq->opt	   = NULL;
	newinet->mc_index  = inet_iif(skb);
	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
					  newkey, key->keylen);
	}
#endif

	__inet_hash_nolisten(newsk);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	if (tp->defer_tcp_accept.request) {
		reqsk_free(tp->defer_tcp_accept.request);
		sock_put(tp->defer_tcp_accept.listen_sk);
		sock_put(sk);
		tp->defer_tcp_accept.listen_sk = NULL;
		tp->defer_tcp_accept.request = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

NULL : 1940 list_entry(head->first, struct inet_timewait_sock, tw_node); 1941} 1942 1943static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) 1944{ 1945 return tw->tw_node.next ? 1946 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1947} 1948 1949static void *listening_get_next(struct seq_file *seq, void *cur) 1950{ 1951 struct inet_connection_sock *icsk; 1952 struct hlist_node *node; 1953 struct sock *sk = cur; 1954 struct tcp_iter_state* st = seq->private; 1955 struct net *net = seq_file_net(seq); 1956 1957 if (!sk) { 1958 st->bucket = 0; 1959 sk = sk_head(&tcp_hashinfo.listening_hash[0]); 1960 goto get_sk; 1961 } 1962 1963 ++st->num; 1964 1965 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1966 struct request_sock *req = cur; 1967 1968 icsk = inet_csk(st->syn_wait_sk); 1969 req = req->dl_next; 1970 while (1) { 1971 while (req) { 1972 if (req->rsk_ops->family == st->family && 1973 net_eq(sock_net(req->sk), net)) { 1974 cur = req; 1975 goto out; 1976 } 1977 req = req->dl_next; 1978 } 1979 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 1980 break; 1981get_req: 1982 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 1983 } 1984 sk = sk_next(st->syn_wait_sk); 1985 st->state = TCP_SEQ_STATE_LISTENING; 1986 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1987 } else { 1988 icsk = inet_csk(sk); 1989 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1990 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 1991 goto start_req; 1992 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1993 sk = sk_next(sk); 1994 } 1995get_sk: 1996 sk_for_each_from(sk, node) { 1997 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 1998 cur = sk; 1999 goto out; 2000 } 2001 icsk = inet_csk(sk); 2002 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2003 if (reqsk_queue_len(&icsk->icsk_accept_queue)) { 2004start_req: 2005 st->uid = sock_i_uid(sk); 2006 st->syn_wait_sk = sk; 2007 st->state = TCP_SEQ_STATE_OPENREQ; 2008 st->sbucket = 0; 2009 goto get_req; 2010 } 2011 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2012 } 2013 if (++st->bucket < INET_LHTABLE_SIZE) { 2014 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); 2015 goto get_sk; 2016 } 2017 cur = NULL; 2018out: 2019 return cur; 2020} 2021 2022static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2023{ 2024 void *rc = listening_get_next(seq, NULL); 2025 2026 while (rc && *pos) { 2027 rc = listening_get_next(seq, rc); 2028 --*pos; 2029 } 2030 return rc; 2031} 2032 2033static void *established_get_first(struct seq_file *seq) 2034{ 2035 struct tcp_iter_state* st = seq->private; 2036 struct net *net = seq_file_net(seq); 2037 void *rc = NULL; 2038 2039 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 2040 struct sock *sk; 2041 struct hlist_node *node; 2042 struct inet_timewait_sock *tw; 2043 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2044 2045 read_lock_bh(lock); 2046 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2047 if (sk->sk_family != st->family || 2048 !net_eq(sock_net(sk), net)) { 2049 continue; 2050 } 2051 rc = sk; 2052 goto out; 2053 } 2054 st->state = TCP_SEQ_STATE_TIME_WAIT; 2055 inet_twsk_for_each(tw, node, 2056 &tcp_hashinfo.ehash[st->bucket].twchain) { 2057 if (tw->tw_family != st->family || 2058 !net_eq(twsk_net(tw), net)) { 2059 continue; 2060 } 2061 rc = tw; 2062 goto out; 2063 } 2064 read_unlock_bh(lock); 2065 st->state = TCP_SEQ_STATE_ESTABLISHED; 2066 } 
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			      !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
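/*
 * tcp_get_idx() stitches the two walks together: a single position 'pos'
 * first counts through every listening socket (and pending open request)
 * and only then continues into the established/TIME_WAIT hash, so each
 * 'pos' value names exactly one record of the eventual /proc output.
 */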
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* fall through: the listening lock is held here as well */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
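/*
 * For reference, the seq_file core drives the three callbacks above
 * roughly as follows (a simplified sketch, not the literal code of
 * fs/seq_file.c, which also restarts iteration across buffer refills):
 *
 *	void *p = ops->start(seq, &pos);
 *	while (p) {
 *		if (ops->show(seq, p))
 *			break;
 *		p = ops->next(seq, p, &pos);
 *	}
 *	ops->stop(seq, p);
 *
 * Every start() is thus paired with a stop(), which is why tcp_seq_stop()
 * must release whatever lock the iterator state still holds.
 */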
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->daddr;
	__be32 src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
					     (tp->rcv_nxt - tp->copied_seq),
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
		len);
}
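/*
 * The format string above defines the well-known /proc/net/tcp record
 * layout.  A hedged user-space sketch of reading the first few fields
 * back (note that the addresses are raw __be32 values printed as hex, so
 * on little-endian machines they appear byte-swapped relative to the
 * dotted-quad order, while the ports were already converted with ntohs):
 *
 *	unsigned int slot, lport, rport, state;
 *	unsigned int laddr, raddr;
 *
 *	if (sscanf(line, "%u: %8X:%4X %8X:%4X %2X",
 *		   &slot, &laddr, &lport, &raddr, &rport, &state) == 6)
 *		...
 *
 * ('line' is one non-header line of /proc/net/tcp; the names are
 * illustrative only.)
 */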
static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
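/*
 * tcp_prot is not registered here; inet_init() in net/ipv4/af_inet.c does
 * it early during boot, roughly (a sketch; see af_inet.c for the real
 * sequence and error handling):
 *
 *	rc = proto_register(&tcp_prot, 1);
 *	if (rc)
 *		goto out;
 *
 * where the second argument asks proto_register() to allocate a slab
 * cache of obj_size (sizeof(struct tcp_sock)) for the sockets themselves.
 */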
static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init = tcp_sk_init,
	.exit = tcp_sk_exit,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_device(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);
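/*
 * A note on the register_pernet_device() call in tcp_v4_init() above: it
 * runs tcp_sk_init() for the initial namespace immediately and again for
 * every network namespace created afterwards, with tcp_sk_exit() invoked
 * on namespace teardown.  A module following the same pattern would do,
 * roughly (my_ops, my_init_net and my_exit_net are illustrative names):
 *
 *	static struct pernet_operations my_ops = {
 *		.init = my_init_net,
 *		.exit = my_exit_net,
 *	};
 *
 *	err = register_pernet_subsys(&my_ops);
 *	...
 *	unregister_pernet_subsys(&my_ops);
 */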