tcp_ipv4.c revision 3b401a81c0d50ea9c718cf837f62cc2e6e79cc30
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

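/* Worked example for the reuse rule in tcp_twsk_unique() above
 * (illustration with hypothetical numbers, not taken from this file):
 * with sysctl_tcp_tw_reuse set and the TIME-WAIT bucket's last
 * timestamp more than one second old, the four-tuple may be reused for
 * a new outgoing connection.  If the old connection ended at
 * tw_snd_nxt == 1000, the new socket starts at
 * write_seq == 1000 + 65535 + 2 == 66537, beyond the largest window the
 * old peer could still honour, so stray old segments cannot alias new
 * data.
 */
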
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

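/* Illustrative user-space counterpart (sketch, not part of this file):
 * tcp_v4_connect() above is what runs when an application issues a
 * plain connect() on an AF_INET stream socket.  Address and port are
 * hypothetical:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The -EINVAL/-EAFNOSUPPORT checks at the top of tcp_v4_connect() are
 * what reject a short addr_len or a non-AF_INET family from such a
 * call.
 */
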
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else if (sock_owned_by_user(sk)) {
			/* RTO revert clocked out retransmission,
			 * but socket is locked. Will defer. */
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  HZ/20, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now. */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, f.e., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

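	/* Worked example of the RTO-revert arithmetic in the
	 * ICMP_DEST_UNREACH case above (illustration, hypothetical
	 * numbers): suppose the smoothed-RTT based RTO from
	 * __tcp_set_rto() is 200 ms and icsk_backoff had grown to 3
	 * (RTO backed off to 1600 ms).  Removing one backoff level gives
	 * 200 ms << 2 = 800 ms; if part of that interval has already
	 * elapsed since the head skb was stamped, only the remainder is
	 * re-armed, and if it has fully elapsed we retransmit right away
	 * (or after HZ/20 when the socket is locked by the user).
	 */
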
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(len, inet->saddr,
					  inet->daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset.
 *	Answer: if a packet caused a RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP.  So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

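/* Worked example for the rep.th.ack_seq computation in
 * tcp_v4_send_reset() above (illustration, hypothetical numbers): for
 * an incoming segment with seq = 5000, SYN set, FIN clear,
 * skb->len = 140 and a 20-byte TCP header (doff = 5), the RST
 * acknowledges 5000 + 1 + 0 + 140 - 20 = 5121, i.e. one past the last
 * sequence number the offending segment occupied.
 */
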
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
				struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial(th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	return __tcp_v4_send_synack(sk, req, NULL);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(tcp_hdr(skb)->dest));
	}
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		}
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

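/* Illustrative user-space usage (sketch, not part of this file): the
 * add/delete helpers above back the TCP_MD5SIG socket option, which an
 * application (typically a BGP daemon) drives like this, with a
 * hypothetical peer address and key:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.2", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key, which is how user space reaches
 * tcp_v4_md5_do_del() through tcp_v4_parse_md5_keys() below.
 */
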
static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the key array itself,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}

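/* Layout of the pseudo-header hashed first, per RFC 2385: MD5 runs
 * over this 12-byte block, then the TCP header with a zeroed checksum
 * field, then the payload (when one is hashed), and finally the key:
 *
 *	+---------------------+
 *	| source address      | 4 bytes
 *	| destination address | 4 bytes
 *	| zero                | 1 byte
 *	| protocol (6 = TCP)  | 1 byte
 *	| segment length      | 2 bytes
 *	+---------------------+
 */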
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->saddr;
		daddr = inet_sk(sk)->daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

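/* On the wire (per RFC 2385) the signature travels as TCP option kind
 * 19 (TCPOPT_MD5SIG) with length 18: two bytes of kind/length followed
 * by the 16-byte digest computed by tcp_v4_md5_hash_skb() above.
 * tcp_parse_md5sig_option(), used below, returns a pointer to that
 * digest, or NULL when the option is absent.
 */
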
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the SYN queue, drop the request. It is better
	 * than clogging the SYN queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
		req->cookie_ts = tmp_opt.tstamp_ok;
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}

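/* Operational note (illustration, not from this file): the want_cookie
 * path in tcp_v4_conn_request() above only triggers when the SYN queue
 * is full and the administrator has enabled syncookies:
 *
 *	sysctl -w net.ipv4.tcp_syncookies=1
 *
 * With cookies, the ISN from cookie_v4_init_sequence() encodes the
 * connection state, so the request is deliberately not hashed into the
 * SYN queue; note the "|| want_cookie" after __tcp_v4_send_synack(),
 * which frees the request_sock once the SYN-ACK has been sent.
 */
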
/*
 * The three way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->daddr,
					  newkey, key->keylen);
		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}
#endif

	__inet_hash_nolisten(newsk);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	percpu_counter_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

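/* Note on the CONFIG_PROC_FS block below: the iterators walk the
 * listening hash first (descending into each listener's SYN queue) and
 * the combined established/TIME-WAIT hash second, which is why
 * tcp_seq_next() switches from listening_get_next() to
 * established_get_first() when the former runs dry.  User space reads
 * the result as, e.g.:
 *
 *	cat /proc/net/tcp
 */
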
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		st->bucket = 0;
		ilb = &tcp_hashinfo.listening_hash[0];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
goto out; 2020 } 2021 st->state = TCP_SEQ_STATE_TIME_WAIT; 2022 inet_twsk_for_each(tw, node, 2023 &tcp_hashinfo.ehash[st->bucket].twchain) { 2024 if (tw->tw_family != st->family || 2025 !net_eq(twsk_net(tw), net)) { 2026 continue; 2027 } 2028 rc = tw; 2029 goto out; 2030 } 2031 spin_unlock_bh(lock); 2032 st->state = TCP_SEQ_STATE_ESTABLISHED; 2033 } 2034out: 2035 return rc; 2036} 2037 2038static void *established_get_next(struct seq_file *seq, void *cur) 2039{ 2040 struct sock *sk = cur; 2041 struct inet_timewait_sock *tw; 2042 struct hlist_nulls_node *node; 2043 struct tcp_iter_state *st = seq->private; 2044 struct net *net = seq_file_net(seq); 2045 2046 ++st->num; 2047 2048 if (st->state == TCP_SEQ_STATE_TIME_WAIT) { 2049 tw = cur; 2050 tw = tw_next(tw); 2051get_tw: 2052 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { 2053 tw = tw_next(tw); 2054 } 2055 if (tw) { 2056 cur = tw; 2057 goto out; 2058 } 2059 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2060 st->state = TCP_SEQ_STATE_ESTABLISHED; 2061 2062 /* Look for next non empty bucket */ 2063 while (++st->bucket < tcp_hashinfo.ehash_size && 2064 empty_bucket(st)) 2065 ; 2066 if (st->bucket >= tcp_hashinfo.ehash_size) 2067 return NULL; 2068 2069 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2070 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); 2071 } else 2072 sk = sk_nulls_next(sk); 2073 2074 sk_nulls_for_each_from(sk, node) { 2075 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 2076 goto found; 2077 } 2078 2079 st->state = TCP_SEQ_STATE_TIME_WAIT; 2080 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); 2081 goto get_tw; 2082found: 2083 cur = sk; 2084out: 2085 return cur; 2086} 2087 2088static void *established_get_idx(struct seq_file *seq, loff_t pos) 2089{ 2090 void *rc = established_get_first(seq); 2091 2092 while (rc && pos) { 2093 rc = established_get_next(seq, rc); 2094 --pos; 2095 } 2096 return rc; 2097} 2098 2099static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2100{ 2101 void *rc; 2102 struct tcp_iter_state *st = seq->private; 2103 2104 st->state = TCP_SEQ_STATE_LISTENING; 2105 rc = listening_get_idx(seq, &pos); 2106 2107 if (!rc) { 2108 st->state = TCP_SEQ_STATE_ESTABLISHED; 2109 rc = established_get_idx(seq, pos); 2110 } 2111 2112 return rc; 2113} 2114 2115static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2116{ 2117 struct tcp_iter_state *st = seq->private; 2118 st->state = TCP_SEQ_STATE_LISTENING; 2119 st->num = 0; 2120 return *pos ? 
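	/* Position 0 of the dump is the header token, so a resume at a
	 * non-zero *pos re-walks the tables to entry *pos - 1; the
	 * ->show() handler prints the column header when it is handed
	 * SEQ_START_TOKEN.
	 */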
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2121} 2122 2123static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2124{ 2125 void *rc = NULL; 2126 struct tcp_iter_state *st; 2127 2128 if (v == SEQ_START_TOKEN) { 2129 rc = tcp_get_idx(seq, 0); 2130 goto out; 2131 } 2132 st = seq->private; 2133 2134 switch (st->state) { 2135 case TCP_SEQ_STATE_OPENREQ: 2136 case TCP_SEQ_STATE_LISTENING: 2137 rc = listening_get_next(seq, v); 2138 if (!rc) { 2139 st->state = TCP_SEQ_STATE_ESTABLISHED; 2140 rc = established_get_first(seq); 2141 } 2142 break; 2143 case TCP_SEQ_STATE_ESTABLISHED: 2144 case TCP_SEQ_STATE_TIME_WAIT: 2145 rc = established_get_next(seq, v); 2146 break; 2147 } 2148out: 2149 ++*pos; 2150 return rc; 2151} 2152 2153static void tcp_seq_stop(struct seq_file *seq, void *v) 2154{ 2155 struct tcp_iter_state *st = seq->private; 2156 2157 switch (st->state) { 2158 case TCP_SEQ_STATE_OPENREQ: 2159 if (v) { 2160 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); 2161 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2162 } 2163 case TCP_SEQ_STATE_LISTENING: 2164 if (v != SEQ_START_TOKEN) 2165 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); 2166 break; 2167 case TCP_SEQ_STATE_TIME_WAIT: 2168 case TCP_SEQ_STATE_ESTABLISHED: 2169 if (v) 2170 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2171 break; 2172 } 2173} 2174 2175static int tcp_seq_open(struct inode *inode, struct file *file) 2176{ 2177 struct tcp_seq_afinfo *afinfo = PDE(inode)->data; 2178 struct tcp_iter_state *s; 2179 int err; 2180 2181 err = seq_open_net(inode, file, &afinfo->seq_ops, 2182 sizeof(struct tcp_iter_state)); 2183 if (err < 0) 2184 return err; 2185 2186 s = ((struct seq_file *)file->private_data)->private; 2187 s->family = afinfo->family; 2188 return 0; 2189} 2190 2191int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) 2192{ 2193 int rc = 0; 2194 struct proc_dir_entry *p; 2195 2196 afinfo->seq_fops.open = tcp_seq_open; 2197 afinfo->seq_fops.read = seq_read; 2198 afinfo->seq_fops.llseek = seq_lseek; 2199 afinfo->seq_fops.release = seq_release_net; 2200 2201 afinfo->seq_ops.start = tcp_seq_start; 2202 afinfo->seq_ops.next = tcp_seq_next; 2203 afinfo->seq_ops.stop = tcp_seq_stop; 2204 2205 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2206 &afinfo->seq_fops, afinfo); 2207 if (!p) 2208 rc = -ENOMEM; 2209 return rc; 2210} 2211 2212void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) 2213{ 2214 proc_net_remove(net, afinfo->name); 2215} 2216 2217static void get_openreq4(struct sock *sk, struct request_sock *req, 2218 struct seq_file *f, int i, int uid, int *len) 2219{ 2220 const struct inet_request_sock *ireq = inet_rsk(req); 2221 int ttd = req->expires - jiffies; 2222 2223 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2224 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2225 i, 2226 ireq->loc_addr, 2227 ntohs(inet_sk(sk)->sport), 2228 ireq->rmt_addr, 2229 ntohs(ireq->rmt_port), 2230 TCP_SYN_RECV, 2231 0, 0, /* could print option size, but that is af dependent. 
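		      The two zeroes fill the tx_queue:rx_queue columns,
		      which carry no meaning for a request socket that
		      has not been accepted yet.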
*/ 2232 1, /* timers active (only the expire timer) */ 2233 jiffies_to_clock_t(ttd), 2234 req->retrans, 2235 uid, 2236 0, /* non standard timer */ 2237 0, /* open_requests have no inode */ 2238 atomic_read(&sk->sk_refcnt), 2239 req, 2240 len); 2241} 2242 2243static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) 2244{ 2245 int timer_active; 2246 unsigned long timer_expires; 2247 struct tcp_sock *tp = tcp_sk(sk); 2248 const struct inet_connection_sock *icsk = inet_csk(sk); 2249 struct inet_sock *inet = inet_sk(sk); 2250 __be32 dest = inet->daddr; 2251 __be32 src = inet->rcv_saddr; 2252 __u16 destp = ntohs(inet->dport); 2253 __u16 srcp = ntohs(inet->sport); 2254 2255 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 2256 timer_active = 1; 2257 timer_expires = icsk->icsk_timeout; 2258 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2259 timer_active = 4; 2260 timer_expires = icsk->icsk_timeout; 2261 } else if (timer_pending(&sk->sk_timer)) { 2262 timer_active = 2; 2263 timer_expires = sk->sk_timer.expires; 2264 } else { 2265 timer_active = 0; 2266 timer_expires = jiffies; 2267 } 2268 2269 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2270 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2271 i, src, srcp, dest, destp, sk->sk_state, 2272 tp->write_seq - tp->snd_una, 2273 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : 2274 (tp->rcv_nxt - tp->copied_seq), 2275 timer_active, 2276 jiffies_to_clock_t(timer_expires - jiffies), 2277 icsk->icsk_retransmits, 2278 sock_i_uid(sk), 2279 icsk->icsk_probes_out, 2280 sock_i_ino(sk), 2281 atomic_read(&sk->sk_refcnt), sk, 2282 jiffies_to_clock_t(icsk->icsk_rto), 2283 jiffies_to_clock_t(icsk->icsk_ack.ato), 2284 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2285 tp->snd_cwnd, 2286 tp->snd_ssthresh >= 0xFFFF ? 
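		   /* An ssthresh at or above 0xFFFF is still effectively
		    * the "infinite" initial value, so it is reported as
		    * -1 instead of a real threshold.
		    */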
-1 : tp->snd_ssthresh, 2287 len); 2288} 2289 2290static void get_timewait4_sock(struct inet_timewait_sock *tw, 2291 struct seq_file *f, int i, int *len) 2292{ 2293 __be32 dest, src; 2294 __u16 destp, srcp; 2295 int ttd = tw->tw_ttd - jiffies; 2296 2297 if (ttd < 0) 2298 ttd = 0; 2299 2300 dest = tw->tw_daddr; 2301 src = tw->tw_rcv_saddr; 2302 destp = ntohs(tw->tw_dport); 2303 srcp = ntohs(tw->tw_sport); 2304 2305 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2306 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", 2307 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2308 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2309 atomic_read(&tw->tw_refcnt), tw, len); 2310} 2311 2312#define TMPSZ 150 2313 2314static int tcp4_seq_show(struct seq_file *seq, void *v) 2315{ 2316 struct tcp_iter_state *st; 2317 int len; 2318 2319 if (v == SEQ_START_TOKEN) { 2320 seq_printf(seq, "%-*s\n", TMPSZ - 1, 2321 " sl local_address rem_address st tx_queue " 2322 "rx_queue tr tm->when retrnsmt uid timeout " 2323 "inode"); 2324 goto out; 2325 } 2326 st = seq->private; 2327 2328 switch (st->state) { 2329 case TCP_SEQ_STATE_LISTENING: 2330 case TCP_SEQ_STATE_ESTABLISHED: 2331 get_tcp4_sock(v, seq, st->num, &len); 2332 break; 2333 case TCP_SEQ_STATE_OPENREQ: 2334 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); 2335 break; 2336 case TCP_SEQ_STATE_TIME_WAIT: 2337 get_timewait4_sock(v, seq, st->num, &len); 2338 break; 2339 } 2340 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); 2341out: 2342 return 0; 2343} 2344 2345static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2346 .name = "tcp", 2347 .family = AF_INET, 2348 .seq_fops = { 2349 .owner = THIS_MODULE, 2350 }, 2351 .seq_ops = { 2352 .show = tcp4_seq_show, 2353 }, 2354}; 2355 2356static int tcp4_proc_init_net(struct net *net) 2357{ 2358 return tcp_proc_register(net, &tcp4_seq_afinfo); 2359} 2360 2361static void tcp4_proc_exit_net(struct net *net) 2362{ 2363 tcp_proc_unregister(net, &tcp4_seq_afinfo); 2364} 2365 2366static struct pernet_operations tcp4_net_ops = { 2367 .init = tcp4_proc_init_net, 2368 .exit = tcp4_proc_exit_net, 2369}; 2370 2371int __init tcp4_proc_init(void) 2372{ 2373 return register_pernet_subsys(&tcp4_net_ops); 2374} 2375 2376void tcp4_proc_exit(void) 2377{ 2378 unregister_pernet_subsys(&tcp4_net_ops); 2379} 2380#endif /* CONFIG_PROC_FS */ 2381 2382struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2383{ 2384 struct iphdr *iph = skb_gro_network_header(skb); 2385 2386 switch (skb->ip_summed) { 2387 case CHECKSUM_COMPLETE: 2388 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, 2389 skb->csum)) { 2390 skb->ip_summed = CHECKSUM_UNNECESSARY; 2391 break; 2392 } 2393 2394 /* fall through */ 2395 case CHECKSUM_NONE: 2396 NAPI_GRO_CB(skb)->flush = 1; 2397 return NULL; 2398 } 2399 2400 return tcp_gro_receive(head, skb); 2401} 2402EXPORT_SYMBOL(tcp4_gro_receive); 2403 2404int tcp4_gro_complete(struct sk_buff *skb) 2405{ 2406 struct iphdr *iph = ip_hdr(skb); 2407 struct tcphdr *th = tcp_hdr(skb); 2408 2409 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), 2410 iph->saddr, iph->daddr, 0); 2411 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 2412 2413 return tcp_gro_complete(skb); 2414} 2415EXPORT_SYMBOL(tcp4_gro_complete); 2416 2417struct proto tcp_prot = { 2418 .name = "TCP", 2419 .owner = THIS_MODULE, 2420 .close = tcp_close, 2421 .connect = tcp_v4_connect, 2422 .disconnect = tcp_disconnect, 2423 .accept = inet_csk_accept, 2424 .ioctl = tcp_ioctl, 2425 .init = tcp_v4_init_sock, 2426 .destroy = 
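	/* releases TCP-private state (queued skbs, MD5 keys, the
	 * congestion control module) when the socket is torn down */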
tcp_v4_destroy_sock, 2427 .shutdown = tcp_shutdown, 2428 .setsockopt = tcp_setsockopt, 2429 .getsockopt = tcp_getsockopt, 2430 .recvmsg = tcp_recvmsg, 2431 .backlog_rcv = tcp_v4_do_rcv, 2432 .hash = inet_hash, 2433 .unhash = inet_unhash, 2434 .get_port = inet_csk_get_port, 2435 .enter_memory_pressure = tcp_enter_memory_pressure, 2436 .sockets_allocated = &tcp_sockets_allocated, 2437 .orphan_count = &tcp_orphan_count, 2438 .memory_allocated = &tcp_memory_allocated, 2439 .memory_pressure = &tcp_memory_pressure, 2440 .sysctl_mem = sysctl_tcp_mem, 2441 .sysctl_wmem = sysctl_tcp_wmem, 2442 .sysctl_rmem = sysctl_tcp_rmem, 2443 .max_header = MAX_TCP_HEADER, 2444 .obj_size = sizeof(struct tcp_sock), 2445 .slab_flags = SLAB_DESTROY_BY_RCU, 2446 .twsk_prot = &tcp_timewait_sock_ops, 2447 .rsk_prot = &tcp_request_sock_ops, 2448 .h.hashinfo = &tcp_hashinfo, 2449#ifdef CONFIG_COMPAT 2450 .compat_setsockopt = compat_tcp_setsockopt, 2451 .compat_getsockopt = compat_tcp_getsockopt, 2452#endif 2453}; 2454 2455 2456static int __net_init tcp_sk_init(struct net *net) 2457{ 2458 return inet_ctl_sock_create(&net->ipv4.tcp_sock, 2459 PF_INET, SOCK_RAW, IPPROTO_TCP, net); 2460} 2461 2462static void __net_exit tcp_sk_exit(struct net *net) 2463{ 2464 inet_ctl_sock_destroy(net->ipv4.tcp_sock); 2465 inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); 2466} 2467 2468static struct pernet_operations __net_initdata tcp_sk_ops = { 2469 .init = tcp_sk_init, 2470 .exit = tcp_sk_exit, 2471}; 2472 2473void __init tcp_v4_init(void) 2474{ 2475 inet_hashinfo_init(&tcp_hashinfo); 2476 if (register_pernet_subsys(&tcp_sk_ops)) 2477 panic("Failed to create the TCP control socket.\n"); 2478} 2479 2480EXPORT_SYMBOL(ipv4_specific); 2481EXPORT_SYMBOL(tcp_hashinfo); 2482EXPORT_SYMBOL(tcp_prot); 2483EXPORT_SYMBOL(tcp_v4_conn_request); 2484EXPORT_SYMBOL(tcp_v4_connect); 2485EXPORT_SYMBOL(tcp_v4_do_rcv); 2486EXPORT_SYMBOL(tcp_v4_remember_stamp); 2487EXPORT_SYMBOL(tcp_v4_send_check); 2488EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 2489 2490#ifdef CONFIG_PROC_FS 2491EXPORT_SYMBOL(tcp_proc_register); 2492EXPORT_SYMBOL(tcp_proc_unregister); 2493#endif 2494EXPORT_SYMBOL(sysctl_tcp_low_latency); 2495 2496
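/*
 * For illustration only, not part of tcp_ipv4.c: a minimal userspace
 * sketch of decoding one line of the /proc/net/tcp output generated by
 * get_tcp4_sock() above.  The field layout follows the header printed
 * by tcp4_seq_show(); the function name and the sample line below are
 * hypothetical.
 */
#include <stdio.h>
#include <arpa/inet.h>

static void decode_tcp_line(const char *line)
{
	unsigned int sl, laddr, lport, raddr, rport, state;
	struct in_addr addr;

	/* "sl" is decimal; addresses, ports and state are hex.  The
	 * kernel prints the raw __be32 address with %08X and ntohs()ed
	 * ports with %04X, so on the machine that produced the file the
	 * parsed address word can be stored straight into s_addr.
	 */
	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &sl, &laddr, &lport, &raddr, &rport, &state) != 6)
		return;

	addr.s_addr = laddr;
	printf("local %s:%u state %02X\n", inet_ntoa(addr), lport, state);
}

int main(void)
{
	/* hypothetical sample (as printed on a little-endian host):
	 * a socket listening on 127.0.0.1:22, state 0A == TCP_LISTEN */
	decode_tcp_line("   0: 0100007F:0016 00000000:0000 0A "
			"00000000:00000000 00:00000000 00000000 "
			"0 0 1234 1 0 100 0 0 10 -1\n");
	return 0;
}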