1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * PF_INET protocol family socket handler. 7 * 8 * Authors: Ross Biro 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 10 * Florian La Roche, <flla@stud.uni-sb.de> 11 * Alan Cox, <A.Cox@swansea.ac.uk> 12 * 13 * Changes (see also sock.c) 14 * 15 * piggy, 16 * Karl Knutson : Socket protocol table 17 * A.N.Kuznetsov : Socket death error in accept(). 18 * John Richardson : Fix non blocking error in connect() 19 * so sockets that fail to connect 20 * don't return -EINPROGRESS. 21 * Alan Cox : Asynchronous I/O support 22 * Alan Cox : Keep correct socket pointer on sock 23 * structures 24 * when accept() ed 25 * Alan Cox : Semantics of SO_LINGER aren't state 26 * moved to close when you look carefully. 27 * With this fixed and the accept bug fixed 28 * some RPC stuff seems happier. 29 * Niibe Yutaka : 4.4BSD style write async I/O 30 * Alan Cox, 31 * Tony Gale : Fixed reuse semantics. 32 * Alan Cox : bind() shouldn't abort existing but dead 33 * sockets. Stops FTP netin:.. I hope. 34 * Alan Cox : bind() works correctly for RAW sockets. 35 * Note that FreeBSD at least was broken 36 * in this respect so be careful with 37 * compatibility tests... 38 * Alan Cox : routing cache support 39 * Alan Cox : memzero the socket structure for 40 * compactness. 41 * Matt Day : nonblock connect error handler 42 * Alan Cox : Allow large numbers of pending sockets 43 * (eg for big web sites), but only if 44 * specifically application requested. 45 * Alan Cox : New buffering throughout IP. Used 46 * dumbly. 47 * Alan Cox : New buffering now used smartly. 48 * Alan Cox : BSD rather than common sense 49 * interpretation of listen. 50 * Germano Caronni : Assorted small races. 51 * Alan Cox : sendmsg/recvmsg basic support. 52 * Alan Cox : Only sendmsg/recvmsg now supported. 53 * Alan Cox : Locked down bind (see security list). 54 * Alan Cox : Loosened bind a little. 55 * Mike McLagan : ADD/DEL DLCI Ioctls 56 * Willy Konynenberg : Transparent proxying support. 57 * David S. Miller : New socket lookup architecture. 58 * Some other random speedups. 59 * Cyrus Durgin : Cleaned up file for kmod hacks. 60 * Andi Kleen : Fix inet_stream_connect TCP race. 61 * 62 * This program is free software; you can redistribute it and/or 63 * modify it under the terms of the GNU General Public License 64 * as published by the Free Software Foundation; either version 65 * 2 of the License, or (at your option) any later version. 66 */ 67 68#define pr_fmt(fmt) "IPv4: " fmt 69 70#include <linux/err.h> 71#include <linux/errno.h> 72#include <linux/types.h> 73#include <linux/socket.h> 74#include <linux/in.h> 75#include <linux/kernel.h> 76#include <linux/module.h> 77#include <linux/sched.h> 78#include <linux/timer.h> 79#include <linux/string.h> 80#include <linux/sockios.h> 81#include <linux/net.h> 82#include <linux/capability.h> 83#include <linux/fcntl.h> 84#include <linux/mm.h> 85#include <linux/interrupt.h> 86#include <linux/stat.h> 87#include <linux/init.h> 88#include <linux/poll.h> 89#include <linux/netfilter_ipv4.h> 90#include <linux/random.h> 91#include <linux/slab.h> 92 93#include <asm/uaccess.h> 94 95#include <linux/inet.h> 96#include <linux/igmp.h> 97#include <linux/inetdevice.h> 98#include <linux/netdevice.h> 99#include <net/checksum.h> 100#include <net/ip.h> 101#include <net/protocol.h> 102#include <net/arp.h> 103#include <net/route.h> 104#include <net/ip_fib.h> 105#include <net/inet_connection_sock.h> 106#include <net/tcp.h> 107#include <net/udp.h> 108#include <net/udplite.h> 109#include <net/ping.h> 110#include <linux/skbuff.h> 111#include <net/sock.h> 112#include <net/raw.h> 113#include <net/icmp.h> 114#include <net/inet_common.h> 115#include <net/xfrm.h> 116#include <net/net_namespace.h> 117#include <net/secure_seq.h> 118#ifdef CONFIG_IP_MROUTE 119#include <linux/mroute.h> 120#endif 121 122#ifdef CONFIG_ANDROID_PARANOID_NETWORK 123#include <linux/android_aid.h> 124 125static inline int current_has_network(void) 126{ 127 return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); 128} 129#else 130static inline int current_has_network(void) 131{ 132 return 1; 133} 134#endif 135 136/* The inetsw table contains everything that inet_create needs to 137 * build a new socket. 138 */ 139static struct list_head inetsw[SOCK_MAX]; 140static DEFINE_SPINLOCK(inetsw_lock); 141 142/* New destruction routine */ 143 144void inet_sock_destruct(struct sock *sk) 145{ 146 struct inet_sock *inet = inet_sk(sk); 147 148 __skb_queue_purge(&sk->sk_receive_queue); 149 __skb_queue_purge(&sk->sk_error_queue); 150 151 sk_mem_reclaim(sk); 152 153 if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { 154 pr_err("Attempt to release TCP socket in state %d %p\n", 155 sk->sk_state, sk); 156 return; 157 } 158 if (!sock_flag(sk, SOCK_DEAD)) { 159 pr_err("Attempt to release alive inet socket %p\n", sk); 160 return; 161 } 162 163 WARN_ON(atomic_read(&sk->sk_rmem_alloc)); 164 WARN_ON(atomic_read(&sk->sk_wmem_alloc)); 165 WARN_ON(sk->sk_wmem_queued); 166 WARN_ON(sk->sk_forward_alloc); 167 168 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 169 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 170 dst_release(sk->sk_rx_dst); 171 sk_refcnt_debug_dec(sk); 172} 173EXPORT_SYMBOL(inet_sock_destruct); 174 175/* 176 * The routines beyond this point handle the behaviour of an AF_INET 177 * socket object. Mostly it punts to the subprotocols of IP to do 178 * the work. 179 */ 180 181/* 182 * Automatically bind an unbound socket. 183 */ 184 185static int inet_autobind(struct sock *sk) 186{ 187 struct inet_sock *inet; 188 /* We may need to bind the socket. */ 189 lock_sock(sk); 190 inet = inet_sk(sk); 191 if (!inet->inet_num) { 192 if (sk->sk_prot->get_port(sk, 0)) { 193 release_sock(sk); 194 return -EAGAIN; 195 } 196 inet->inet_sport = htons(inet->inet_num); 197 } 198 release_sock(sk); 199 return 0; 200} 201 202/* 203 * Move a socket into listening state. 204 */ 205int inet_listen(struct socket *sock, int backlog) 206{ 207 struct sock *sk = sock->sk; 208 unsigned char old_state; 209 int err; 210 211 lock_sock(sk); 212 213 err = -EINVAL; 214 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) 215 goto out; 216 217 old_state = sk->sk_state; 218 if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) 219 goto out; 220 221 /* Really, if the socket is already in listen state 222 * we can only allow the backlog to be adjusted. 223 */ 224 if (old_state != TCP_LISTEN) { 225 /* Check special setups for testing purpose to enable TFO w/o 226 * requiring TCP_FASTOPEN sockopt. 227 * Note that only TCP sockets (SOCK_STREAM) will reach here. 228 * Also fastopenq may already been allocated because this 229 * socket was in TCP_LISTEN state previously but was 230 * shutdown() (rather than close()). 231 */ 232 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && 233 inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { 234 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) 235 err = fastopen_init_queue(sk, backlog); 236 else if ((sysctl_tcp_fastopen & 237 TFO_SERVER_WO_SOCKOPT2) != 0) 238 err = fastopen_init_queue(sk, 239 ((uint)sysctl_tcp_fastopen) >> 16); 240 else 241 err = 0; 242 if (err) 243 goto out; 244 } 245 err = inet_csk_listen_start(sk, backlog); 246 if (err) 247 goto out; 248 } 249 sk->sk_max_ack_backlog = backlog; 250 err = 0; 251 252out: 253 release_sock(sk); 254 return err; 255} 256EXPORT_SYMBOL(inet_listen); 257 258/* 259 * Create an inet socket. 260 */ 261 262static int inet_create(struct net *net, struct socket *sock, int protocol, 263 int kern) 264{ 265 struct sock *sk; 266 struct inet_protosw *answer; 267 struct inet_sock *inet; 268 struct proto *answer_prot; 269 unsigned char answer_flags; 270 int try_loading_module = 0; 271 int err; 272 273 if (!current_has_network()) 274 return -EACCES; 275 276 sock->state = SS_UNCONNECTED; 277 278 /* Look for the requested type/protocol pair. */ 279lookup_protocol: 280 err = -ESOCKTNOSUPPORT; 281 rcu_read_lock(); 282 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { 283 284 err = 0; 285 /* Check the non-wild match. */ 286 if (protocol == answer->protocol) { 287 if (protocol != IPPROTO_IP) 288 break; 289 } else { 290 /* Check for the two wild cases. */ 291 if (IPPROTO_IP == protocol) { 292 protocol = answer->protocol; 293 break; 294 } 295 if (IPPROTO_IP == answer->protocol) 296 break; 297 } 298 err = -EPROTONOSUPPORT; 299 } 300 301 if (unlikely(err)) { 302 if (try_loading_module < 2) { 303 rcu_read_unlock(); 304 /* 305 * Be more specific, e.g. net-pf-2-proto-132-type-1 306 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) 307 */ 308 if (++try_loading_module == 1) 309 request_module("net-pf-%d-proto-%d-type-%d", 310 PF_INET, protocol, sock->type); 311 /* 312 * Fall back to generic, e.g. net-pf-2-proto-132 313 * (net-pf-PF_INET-proto-IPPROTO_SCTP) 314 */ 315 else 316 request_module("net-pf-%d-proto-%d", 317 PF_INET, protocol); 318 goto lookup_protocol; 319 } else 320 goto out_rcu_unlock; 321 } 322 323 err = -EPERM; 324 if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) 325 goto out_rcu_unlock; 326 327 sock->ops = answer->ops; 328 answer_prot = answer->prot; 329 answer_flags = answer->flags; 330 rcu_read_unlock(); 331 332 WARN_ON(answer_prot->slab == NULL); 333 334 err = -ENOBUFS; 335 sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); 336 if (sk == NULL) 337 goto out; 338 339 err = 0; 340 if (INET_PROTOSW_REUSE & answer_flags) 341 sk->sk_reuse = SK_CAN_REUSE; 342 343 inet = inet_sk(sk); 344 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; 345 346 inet->nodefrag = 0; 347 348 if (SOCK_RAW == sock->type) { 349 inet->inet_num = protocol; 350 if (IPPROTO_RAW == protocol) 351 inet->hdrincl = 1; 352 } 353 354 if (net->ipv4.sysctl_ip_no_pmtu_disc) 355 inet->pmtudisc = IP_PMTUDISC_DONT; 356 else 357 inet->pmtudisc = IP_PMTUDISC_WANT; 358 359 inet->inet_id = 0; 360 361 sock_init_data(sock, sk); 362 363 sk->sk_destruct = inet_sock_destruct; 364 sk->sk_protocol = protocol; 365 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; 366 367 inet->uc_ttl = -1; 368 inet->mc_loop = 1; 369 inet->mc_ttl = 1; 370 inet->mc_all = 1; 371 inet->mc_index = 0; 372 inet->mc_list = NULL; 373 inet->rcv_tos = 0; 374 375 sk_refcnt_debug_inc(sk); 376 377 if (inet->inet_num) { 378 /* It assumes that any protocol which allows 379 * the user to assign a number at socket 380 * creation time automatically 381 * shares. 382 */ 383 inet->inet_sport = htons(inet->inet_num); 384 /* Add to protocol hash chains. */ 385 sk->sk_prot->hash(sk); 386 } 387 388 if (sk->sk_prot->init) { 389 err = sk->sk_prot->init(sk); 390 if (err) 391 sk_common_release(sk); 392 } 393out: 394 return err; 395out_rcu_unlock: 396 rcu_read_unlock(); 397 goto out; 398} 399 400 401/* 402 * The peer socket should always be NULL (or else). When we call this 403 * function we are destroying the object and from then on nobody 404 * should refer to it. 405 */ 406int inet_release(struct socket *sock) 407{ 408 struct sock *sk = sock->sk; 409 410 if (sk) { 411 long timeout; 412 413 sock_rps_reset_flow(sk); 414 415 /* Applications forget to leave groups before exiting */ 416 ip_mc_drop_socket(sk); 417 418 /* If linger is set, we don't return until the close 419 * is complete. Otherwise we return immediately. The 420 * actually closing is done the same either way. 421 * 422 * If the close is due to the process exiting, we never 423 * linger.. 424 */ 425 timeout = 0; 426 if (sock_flag(sk, SOCK_LINGER) && 427 !(current->flags & PF_EXITING)) 428 timeout = sk->sk_lingertime; 429 sock->sk = NULL; 430 sk->sk_prot->close(sk, timeout); 431 } 432 return 0; 433} 434EXPORT_SYMBOL(inet_release); 435 436int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 437{ 438 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 439 struct sock *sk = sock->sk; 440 struct inet_sock *inet = inet_sk(sk); 441 struct net *net = sock_net(sk); 442 unsigned short snum; 443 int chk_addr_ret; 444 int err; 445 446 /* If the socket has its own bind function then use it. (RAW) */ 447 if (sk->sk_prot->bind) { 448 err = sk->sk_prot->bind(sk, uaddr, addr_len); 449 goto out; 450 } 451 err = -EINVAL; 452 if (addr_len < sizeof(struct sockaddr_in)) 453 goto out; 454 455 if (addr->sin_family != AF_INET) { 456 /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) 457 * only if s_addr is INADDR_ANY. 458 */ 459 err = -EAFNOSUPPORT; 460 if (addr->sin_family != AF_UNSPEC || 461 addr->sin_addr.s_addr != htonl(INADDR_ANY)) 462 goto out; 463 } 464 465 chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); 466 467 /* Not specified by any standard per-se, however it breaks too 468 * many applications when removed. It is unfortunate since 469 * allowing applications to make a non-local bind solves 470 * several problems with systems using dynamic addressing. 471 * (ie. your servers still start up even if your ISDN link 472 * is temporarily down) 473 */ 474 err = -EADDRNOTAVAIL; 475 if (!net->ipv4.sysctl_ip_nonlocal_bind && 476 !(inet->freebind || inet->transparent) && 477 addr->sin_addr.s_addr != htonl(INADDR_ANY) && 478 chk_addr_ret != RTN_LOCAL && 479 chk_addr_ret != RTN_MULTICAST && 480 chk_addr_ret != RTN_BROADCAST) 481 goto out; 482 483 snum = ntohs(addr->sin_port); 484 err = -EACCES; 485 if (snum && snum < PROT_SOCK && 486 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 487 goto out; 488 489 /* We keep a pair of addresses. rcv_saddr is the one 490 * used by hash lookups, and saddr is used for transmit. 491 * 492 * In the BSD API these are the same except where it 493 * would be illegal to use them (multicast/broadcast) in 494 * which case the sending device address is used. 495 */ 496 lock_sock(sk); 497 498 /* Check these errors (active socket, double bind). */ 499 err = -EINVAL; 500 if (sk->sk_state != TCP_CLOSE || inet->inet_num) 501 goto out_release_sock; 502 503 inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; 504 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 505 inet->inet_saddr = 0; /* Use device */ 506 507 /* Make sure we are allowed to bind here. */ 508 if (sk->sk_prot->get_port(sk, snum)) { 509 inet->inet_saddr = inet->inet_rcv_saddr = 0; 510 err = -EADDRINUSE; 511 goto out_release_sock; 512 } 513 514 if (inet->inet_rcv_saddr) 515 sk->sk_userlocks |= SOCK_BINDADDR_LOCK; 516 if (snum) 517 sk->sk_userlocks |= SOCK_BINDPORT_LOCK; 518 inet->inet_sport = htons(inet->inet_num); 519 inet->inet_daddr = 0; 520 inet->inet_dport = 0; 521 sk_dst_reset(sk); 522 err = 0; 523out_release_sock: 524 release_sock(sk); 525out: 526 return err; 527} 528EXPORT_SYMBOL(inet_bind); 529 530int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, 531 int addr_len, int flags) 532{ 533 struct sock *sk = sock->sk; 534 535 if (addr_len < sizeof(uaddr->sa_family)) 536 return -EINVAL; 537 if (uaddr->sa_family == AF_UNSPEC) 538 return sk->sk_prot->disconnect(sk, flags); 539 540 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 541 return -EAGAIN; 542 return sk->sk_prot->connect(sk, uaddr, addr_len); 543} 544EXPORT_SYMBOL(inet_dgram_connect); 545 546static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) 547{ 548 DEFINE_WAIT(wait); 549 550 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 551 sk->sk_write_pending += writebias; 552 553 /* Basic assumption: if someone sets sk->sk_err, he _must_ 554 * change state of the socket from TCP_SYN_*. 555 * Connect() does not allow to get error notifications 556 * without closing the socket. 557 */ 558 while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 559 release_sock(sk); 560 timeo = schedule_timeout(timeo); 561 lock_sock(sk); 562 if (signal_pending(current) || !timeo) 563 break; 564 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 565 } 566 finish_wait(sk_sleep(sk), &wait); 567 sk->sk_write_pending -= writebias; 568 return timeo; 569} 570 571/* 572 * Connect to a remote host. There is regrettably still a little 573 * TCP 'magic' in here. 574 */ 575int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 576 int addr_len, int flags) 577{ 578 struct sock *sk = sock->sk; 579 int err; 580 long timeo; 581 582 if (addr_len < sizeof(uaddr->sa_family)) 583 return -EINVAL; 584 585 if (uaddr->sa_family == AF_UNSPEC) { 586 err = sk->sk_prot->disconnect(sk, flags); 587 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 588 goto out; 589 } 590 591 switch (sock->state) { 592 default: 593 err = -EINVAL; 594 goto out; 595 case SS_CONNECTED: 596 err = -EISCONN; 597 goto out; 598 case SS_CONNECTING: 599 err = -EALREADY; 600 /* Fall out of switch with err, set for this state */ 601 break; 602 case SS_UNCONNECTED: 603 err = -EISCONN; 604 if (sk->sk_state != TCP_CLOSE) 605 goto out; 606 607 err = sk->sk_prot->connect(sk, uaddr, addr_len); 608 if (err < 0) 609 goto out; 610 611 sock->state = SS_CONNECTING; 612 613 /* Just entered SS_CONNECTING state; the only 614 * difference is that return value in non-blocking 615 * case is EINPROGRESS, rather than EALREADY. 616 */ 617 err = -EINPROGRESS; 618 break; 619 } 620 621 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 622 623 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 624 int writebias = (sk->sk_protocol == IPPROTO_TCP) && 625 tcp_sk(sk)->fastopen_req && 626 tcp_sk(sk)->fastopen_req->data ? 1 : 0; 627 628 /* Error code is set above */ 629 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) 630 goto out; 631 632 err = sock_intr_errno(timeo); 633 if (signal_pending(current)) 634 goto out; 635 } 636 637 /* Connection was closed by RST, timeout, ICMP error 638 * or another process disconnected us. 639 */ 640 if (sk->sk_state == TCP_CLOSE) 641 goto sock_error; 642 643 /* sk->sk_err may be not zero now, if RECVERR was ordered by user 644 * and error was received after socket entered established state. 645 * Hence, it is handled normally after connect() return successfully. 646 */ 647 648 sock->state = SS_CONNECTED; 649 err = 0; 650out: 651 return err; 652 653sock_error: 654 err = sock_error(sk) ? : -ECONNABORTED; 655 sock->state = SS_UNCONNECTED; 656 if (sk->sk_prot->disconnect(sk, flags)) 657 sock->state = SS_DISCONNECTING; 658 goto out; 659} 660EXPORT_SYMBOL(__inet_stream_connect); 661 662int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 663 int addr_len, int flags) 664{ 665 int err; 666 667 lock_sock(sock->sk); 668 err = __inet_stream_connect(sock, uaddr, addr_len, flags); 669 release_sock(sock->sk); 670 return err; 671} 672EXPORT_SYMBOL(inet_stream_connect); 673 674/* 675 * Accept a pending connection. The TCP layer now gives BSD semantics. 676 */ 677 678int inet_accept(struct socket *sock, struct socket *newsock, int flags) 679{ 680 struct sock *sk1 = sock->sk; 681 int err = -EINVAL; 682 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); 683 684 if (!sk2) 685 goto do_err; 686 687 lock_sock(sk2); 688 689 sock_rps_record_flow(sk2); 690 WARN_ON(!((1 << sk2->sk_state) & 691 (TCPF_ESTABLISHED | TCPF_SYN_RECV | 692 TCPF_CLOSE_WAIT | TCPF_CLOSE))); 693 694 sock_graft(sk2, newsock); 695 696 newsock->state = SS_CONNECTED; 697 err = 0; 698 release_sock(sk2); 699do_err: 700 return err; 701} 702EXPORT_SYMBOL(inet_accept); 703 704 705/* 706 * This does both peername and sockname. 707 */ 708int inet_getname(struct socket *sock, struct sockaddr *uaddr, 709 int *uaddr_len, int peer) 710{ 711 struct sock *sk = sock->sk; 712 struct inet_sock *inet = inet_sk(sk); 713 DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); 714 715 sin->sin_family = AF_INET; 716 if (peer) { 717 if (!inet->inet_dport || 718 (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && 719 peer == 1)) 720 return -ENOTCONN; 721 sin->sin_port = inet->inet_dport; 722 sin->sin_addr.s_addr = inet->inet_daddr; 723 } else { 724 __be32 addr = inet->inet_rcv_saddr; 725 if (!addr) 726 addr = inet->inet_saddr; 727 sin->sin_port = inet->inet_sport; 728 sin->sin_addr.s_addr = addr; 729 } 730 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 731 *uaddr_len = sizeof(*sin); 732 return 0; 733} 734EXPORT_SYMBOL(inet_getname); 735 736int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 737 size_t size) 738{ 739 struct sock *sk = sock->sk; 740 741 sock_rps_record_flow(sk); 742 743 /* We may need to bind the socket. */ 744 if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && 745 inet_autobind(sk)) 746 return -EAGAIN; 747 748 return sk->sk_prot->sendmsg(iocb, sk, msg, size); 749} 750EXPORT_SYMBOL(inet_sendmsg); 751 752ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, 753 size_t size, int flags) 754{ 755 struct sock *sk = sock->sk; 756 757 sock_rps_record_flow(sk); 758 759 /* We may need to bind the socket. */ 760 if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && 761 inet_autobind(sk)) 762 return -EAGAIN; 763 764 if (sk->sk_prot->sendpage) 765 return sk->sk_prot->sendpage(sk, page, offset, size, flags); 766 return sock_no_sendpage(sock, page, offset, size, flags); 767} 768EXPORT_SYMBOL(inet_sendpage); 769 770int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 771 size_t size, int flags) 772{ 773 struct sock *sk = sock->sk; 774 int addr_len = 0; 775 int err; 776 777 sock_rps_record_flow(sk); 778 779 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, 780 flags & ~MSG_DONTWAIT, &addr_len); 781 if (err >= 0) 782 msg->msg_namelen = addr_len; 783 return err; 784} 785EXPORT_SYMBOL(inet_recvmsg); 786 787int inet_shutdown(struct socket *sock, int how) 788{ 789 struct sock *sk = sock->sk; 790 int err = 0; 791 792 /* This should really check to make sure 793 * the socket is a TCP socket. (WHY AC...) 794 */ 795 how++; /* maps 0->1 has the advantage of making bit 1 rcvs and 796 1->2 bit 2 snds. 797 2->3 */ 798 if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ 799 return -EINVAL; 800 801 lock_sock(sk); 802 if (sock->state == SS_CONNECTING) { 803 if ((1 << sk->sk_state) & 804 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) 805 sock->state = SS_DISCONNECTING; 806 else 807 sock->state = SS_CONNECTED; 808 } 809 810 switch (sk->sk_state) { 811 case TCP_CLOSE: 812 err = -ENOTCONN; 813 /* Hack to wake up other listeners, who can poll for 814 POLLHUP, even on eg. unconnected UDP sockets -- RR */ 815 default: 816 sk->sk_shutdown |= how; 817 if (sk->sk_prot->shutdown) 818 sk->sk_prot->shutdown(sk, how); 819 break; 820 821 /* Remaining two branches are temporary solution for missing 822 * close() in multithreaded environment. It is _not_ a good idea, 823 * but we have no choice until close() is repaired at VFS level. 824 */ 825 case TCP_LISTEN: 826 if (!(how & RCV_SHUTDOWN)) 827 break; 828 /* Fall through */ 829 case TCP_SYN_SENT: 830 err = sk->sk_prot->disconnect(sk, O_NONBLOCK); 831 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 832 break; 833 } 834 835 /* Wake up anyone sleeping in poll. */ 836 sk->sk_state_change(sk); 837 release_sock(sk); 838 return err; 839} 840EXPORT_SYMBOL(inet_shutdown); 841 842/* 843 * ioctl() calls you can issue on an INET socket. Most of these are 844 * device configuration and stuff and very rarely used. Some ioctls 845 * pass on to the socket itself. 846 * 847 * NOTE: I like the idea of a module for the config stuff. ie ifconfig 848 * loads the devconfigure module does its configuring and unloads it. 849 * There's a good 20K of config code hanging around the kernel. 850 */ 851 852int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 853{ 854 struct sock *sk = sock->sk; 855 int err = 0; 856 struct net *net = sock_net(sk); 857 858 switch (cmd) { 859 case SIOCGSTAMP: 860 err = sock_get_timestamp(sk, (struct timeval __user *)arg); 861 break; 862 case SIOCGSTAMPNS: 863 err = sock_get_timestampns(sk, (struct timespec __user *)arg); 864 break; 865 case SIOCADDRT: 866 case SIOCDELRT: 867 case SIOCRTMSG: 868 err = ip_rt_ioctl(net, cmd, (void __user *)arg); 869 break; 870 case SIOCDARP: 871 case SIOCGARP: 872 case SIOCSARP: 873 err = arp_ioctl(net, cmd, (void __user *)arg); 874 break; 875 case SIOCGIFADDR: 876 case SIOCSIFADDR: 877 case SIOCGIFBRDADDR: 878 case SIOCSIFBRDADDR: 879 case SIOCGIFNETMASK: 880 case SIOCSIFNETMASK: 881 case SIOCGIFDSTADDR: 882 case SIOCSIFDSTADDR: 883 case SIOCSIFPFLAGS: 884 case SIOCGIFPFLAGS: 885 case SIOCSIFFLAGS: 886 case SIOCKILLADDR: 887 err = devinet_ioctl(net, cmd, (void __user *)arg); 888 break; 889 default: 890 if (sk->sk_prot->ioctl) 891 err = sk->sk_prot->ioctl(sk, cmd, arg); 892 else 893 err = -ENOIOCTLCMD; 894 break; 895 } 896 return err; 897} 898EXPORT_SYMBOL(inet_ioctl); 899 900#ifdef CONFIG_COMPAT 901static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 902{ 903 struct sock *sk = sock->sk; 904 int err = -ENOIOCTLCMD; 905 906 if (sk->sk_prot->compat_ioctl) 907 err = sk->sk_prot->compat_ioctl(sk, cmd, arg); 908 909 return err; 910} 911#endif 912 913const struct proto_ops inet_stream_ops = { 914 .family = PF_INET, 915 .owner = THIS_MODULE, 916 .release = inet_release, 917 .bind = inet_bind, 918 .connect = inet_stream_connect, 919 .socketpair = sock_no_socketpair, 920 .accept = inet_accept, 921 .getname = inet_getname, 922 .poll = tcp_poll, 923 .ioctl = inet_ioctl, 924 .listen = inet_listen, 925 .shutdown = inet_shutdown, 926 .setsockopt = sock_common_setsockopt, 927 .getsockopt = sock_common_getsockopt, 928 .sendmsg = inet_sendmsg, 929 .recvmsg = inet_recvmsg, 930 .mmap = sock_no_mmap, 931 .sendpage = inet_sendpage, 932 .splice_read = tcp_splice_read, 933#ifdef CONFIG_COMPAT 934 .compat_setsockopt = compat_sock_common_setsockopt, 935 .compat_getsockopt = compat_sock_common_getsockopt, 936 .compat_ioctl = inet_compat_ioctl, 937#endif 938}; 939EXPORT_SYMBOL(inet_stream_ops); 940 941const struct proto_ops inet_dgram_ops = { 942 .family = PF_INET, 943 .owner = THIS_MODULE, 944 .release = inet_release, 945 .bind = inet_bind, 946 .connect = inet_dgram_connect, 947 .socketpair = sock_no_socketpair, 948 .accept = sock_no_accept, 949 .getname = inet_getname, 950 .poll = udp_poll, 951 .ioctl = inet_ioctl, 952 .listen = sock_no_listen, 953 .shutdown = inet_shutdown, 954 .setsockopt = sock_common_setsockopt, 955 .getsockopt = sock_common_getsockopt, 956 .sendmsg = inet_sendmsg, 957 .recvmsg = inet_recvmsg, 958 .mmap = sock_no_mmap, 959 .sendpage = inet_sendpage, 960#ifdef CONFIG_COMPAT 961 .compat_setsockopt = compat_sock_common_setsockopt, 962 .compat_getsockopt = compat_sock_common_getsockopt, 963 .compat_ioctl = inet_compat_ioctl, 964#endif 965}; 966EXPORT_SYMBOL(inet_dgram_ops); 967 968/* 969 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without 970 * udp_poll 971 */ 972static const struct proto_ops inet_sockraw_ops = { 973 .family = PF_INET, 974 .owner = THIS_MODULE, 975 .release = inet_release, 976 .bind = inet_bind, 977 .connect = inet_dgram_connect, 978 .socketpair = sock_no_socketpair, 979 .accept = sock_no_accept, 980 .getname = inet_getname, 981 .poll = datagram_poll, 982 .ioctl = inet_ioctl, 983 .listen = sock_no_listen, 984 .shutdown = inet_shutdown, 985 .setsockopt = sock_common_setsockopt, 986 .getsockopt = sock_common_getsockopt, 987 .sendmsg = inet_sendmsg, 988 .recvmsg = inet_recvmsg, 989 .mmap = sock_no_mmap, 990 .sendpage = inet_sendpage, 991#ifdef CONFIG_COMPAT 992 .compat_setsockopt = compat_sock_common_setsockopt, 993 .compat_getsockopt = compat_sock_common_getsockopt, 994 .compat_ioctl = inet_compat_ioctl, 995#endif 996}; 997 998static const struct net_proto_family inet_family_ops = { 999 .family = PF_INET, 1000 .create = inet_create, 1001 .owner = THIS_MODULE, 1002}; 1003 1004/* Upon startup we insert all the elements in inetsw_array[] into 1005 * the linked list inetsw. 1006 */ 1007static struct inet_protosw inetsw_array[] = 1008{ 1009 { 1010 .type = SOCK_STREAM, 1011 .protocol = IPPROTO_TCP, 1012 .prot = &tcp_prot, 1013 .ops = &inet_stream_ops, 1014 .flags = INET_PROTOSW_PERMANENT | 1015 INET_PROTOSW_ICSK, 1016 }, 1017 1018 { 1019 .type = SOCK_DGRAM, 1020 .protocol = IPPROTO_UDP, 1021 .prot = &udp_prot, 1022 .ops = &inet_dgram_ops, 1023 .flags = INET_PROTOSW_PERMANENT, 1024 }, 1025 1026 { 1027 .type = SOCK_DGRAM, 1028 .protocol = IPPROTO_ICMP, 1029 .prot = &ping_prot, 1030 .ops = &inet_dgram_ops, 1031 .flags = INET_PROTOSW_REUSE, 1032 }, 1033 1034 { 1035 .type = SOCK_RAW, 1036 .protocol = IPPROTO_IP, /* wild card */ 1037 .prot = &raw_prot, 1038 .ops = &inet_sockraw_ops, 1039 .flags = INET_PROTOSW_REUSE, 1040 } 1041}; 1042 1043#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) 1044 1045void inet_register_protosw(struct inet_protosw *p) 1046{ 1047 struct list_head *lh; 1048 struct inet_protosw *answer; 1049 int protocol = p->protocol; 1050 struct list_head *last_perm; 1051 1052 spin_lock_bh(&inetsw_lock); 1053 1054 if (p->type >= SOCK_MAX) 1055 goto out_illegal; 1056 1057 /* If we are trying to override a permanent protocol, bail. */ 1058 answer = NULL; 1059 last_perm = &inetsw[p->type]; 1060 list_for_each(lh, &inetsw[p->type]) { 1061 answer = list_entry(lh, struct inet_protosw, list); 1062 1063 /* Check only the non-wild match. */ 1064 if (INET_PROTOSW_PERMANENT & answer->flags) { 1065 if (protocol == answer->protocol) 1066 break; 1067 last_perm = lh; 1068 } 1069 1070 answer = NULL; 1071 } 1072 if (answer) 1073 goto out_permanent; 1074 1075 /* Add the new entry after the last permanent entry if any, so that 1076 * the new entry does not override a permanent entry when matched with 1077 * a wild-card protocol. But it is allowed to override any existing 1078 * non-permanent entry. This means that when we remove this entry, the 1079 * system automatically returns to the old behavior. 1080 */ 1081 list_add_rcu(&p->list, last_perm); 1082out: 1083 spin_unlock_bh(&inetsw_lock); 1084 1085 return; 1086 1087out_permanent: 1088 pr_err("Attempt to override permanent protocol %d\n", protocol); 1089 goto out; 1090 1091out_illegal: 1092 pr_err("Ignoring attempt to register invalid socket type %d\n", 1093 p->type); 1094 goto out; 1095} 1096EXPORT_SYMBOL(inet_register_protosw); 1097 1098void inet_unregister_protosw(struct inet_protosw *p) 1099{ 1100 if (INET_PROTOSW_PERMANENT & p->flags) { 1101 pr_err("Attempt to unregister permanent protocol %d\n", 1102 p->protocol); 1103 } else { 1104 spin_lock_bh(&inetsw_lock); 1105 list_del_rcu(&p->list); 1106 spin_unlock_bh(&inetsw_lock); 1107 1108 synchronize_net(); 1109 } 1110} 1111EXPORT_SYMBOL(inet_unregister_protosw); 1112 1113/* 1114 * Shall we try to damage output packets if routing dev changes? 1115 */ 1116 1117int sysctl_ip_dynaddr __read_mostly; 1118 1119static int inet_sk_reselect_saddr(struct sock *sk) 1120{ 1121 struct inet_sock *inet = inet_sk(sk); 1122 __be32 old_saddr = inet->inet_saddr; 1123 __be32 daddr = inet->inet_daddr; 1124 struct flowi4 *fl4; 1125 struct rtable *rt; 1126 __be32 new_saddr; 1127 struct ip_options_rcu *inet_opt; 1128 1129 inet_opt = rcu_dereference_protected(inet->inet_opt, 1130 sock_owned_by_user(sk)); 1131 if (inet_opt && inet_opt->opt.srr) 1132 daddr = inet_opt->opt.faddr; 1133 1134 /* Query new route. */ 1135 fl4 = &inet->cork.fl.u.ip4; 1136 rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), 1137 sk->sk_bound_dev_if, sk->sk_protocol, 1138 inet->inet_sport, inet->inet_dport, sk); 1139 if (IS_ERR(rt)) 1140 return PTR_ERR(rt); 1141 1142 sk_setup_caps(sk, &rt->dst); 1143 1144 new_saddr = fl4->saddr; 1145 1146 if (new_saddr == old_saddr) 1147 return 0; 1148 1149 if (sysctl_ip_dynaddr > 1) { 1150 pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", 1151 __func__, &old_saddr, &new_saddr); 1152 } 1153 1154 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; 1155 1156 /* 1157 * XXX The only one ugly spot where we need to 1158 * XXX really change the sockets identity after 1159 * XXX it has entered the hashes. -DaveM 1160 * 1161 * Besides that, it does not check for connection 1162 * uniqueness. Wait for troubles. 1163 */ 1164 __sk_prot_rehash(sk); 1165 return 0; 1166} 1167 1168int inet_sk_rebuild_header(struct sock *sk) 1169{ 1170 struct inet_sock *inet = inet_sk(sk); 1171 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); 1172 __be32 daddr; 1173 struct ip_options_rcu *inet_opt; 1174 struct flowi4 *fl4; 1175 int err; 1176 1177 /* Route is OK, nothing to do. */ 1178 if (rt) 1179 return 0; 1180 1181 /* Reroute. */ 1182 rcu_read_lock(); 1183 inet_opt = rcu_dereference(inet->inet_opt); 1184 daddr = inet->inet_daddr; 1185 if (inet_opt && inet_opt->opt.srr) 1186 daddr = inet_opt->opt.faddr; 1187 rcu_read_unlock(); 1188 fl4 = &inet->cork.fl.u.ip4; 1189 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, 1190 inet->inet_dport, inet->inet_sport, 1191 sk->sk_protocol, RT_CONN_FLAGS(sk), 1192 sk->sk_bound_dev_if); 1193 if (!IS_ERR(rt)) { 1194 err = 0; 1195 sk_setup_caps(sk, &rt->dst); 1196 } else { 1197 err = PTR_ERR(rt); 1198 1199 /* Routing failed... */ 1200 sk->sk_route_caps = 0; 1201 /* 1202 * Other protocols have to map its equivalent state to TCP_SYN_SENT. 1203 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme 1204 */ 1205 if (!sysctl_ip_dynaddr || 1206 sk->sk_state != TCP_SYN_SENT || 1207 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || 1208 (err = inet_sk_reselect_saddr(sk)) != 0) 1209 sk->sk_err_soft = -err; 1210 } 1211 1212 return err; 1213} 1214EXPORT_SYMBOL(inet_sk_rebuild_header); 1215 1216static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1217 netdev_features_t features) 1218{ 1219 struct sk_buff *segs = ERR_PTR(-EINVAL); 1220 const struct net_offload *ops; 1221 unsigned int offset = 0; 1222 bool udpfrag, encap; 1223 struct iphdr *iph; 1224 int proto; 1225 int nhoff; 1226 int ihl; 1227 int id; 1228 1229 if (unlikely(skb_shinfo(skb)->gso_type & 1230 ~(SKB_GSO_TCPV4 | 1231 SKB_GSO_UDP | 1232 SKB_GSO_DODGY | 1233 SKB_GSO_TCP_ECN | 1234 SKB_GSO_GRE | 1235 SKB_GSO_GRE_CSUM | 1236 SKB_GSO_IPIP | 1237 SKB_GSO_SIT | 1238 SKB_GSO_TCPV6 | 1239 SKB_GSO_UDP_TUNNEL | 1240 SKB_GSO_UDP_TUNNEL_CSUM | 1241 SKB_GSO_MPLS | 1242 0))) 1243 goto out; 1244 1245 skb_reset_network_header(skb); 1246 nhoff = skb_network_header(skb) - skb_mac_header(skb); 1247 if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) 1248 goto out; 1249 1250 iph = ip_hdr(skb); 1251 ihl = iph->ihl * 4; 1252 if (ihl < sizeof(*iph)) 1253 goto out; 1254 1255 id = ntohs(iph->id); 1256 proto = iph->protocol; 1257 1258 /* Warning: after this point, iph might be no longer valid */ 1259 if (unlikely(!pskb_may_pull(skb, ihl))) 1260 goto out; 1261 __skb_pull(skb, ihl); 1262 1263 encap = SKB_GSO_CB(skb)->encap_level > 0; 1264 if (encap) 1265 features &= skb->dev->hw_enc_features; 1266 SKB_GSO_CB(skb)->encap_level += ihl; 1267 1268 skb_reset_transport_header(skb); 1269 1270 segs = ERR_PTR(-EPROTONOSUPPORT); 1271 1272 if (skb->encapsulation && 1273 skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) 1274 udpfrag = proto == IPPROTO_UDP && encap; 1275 else 1276 udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; 1277 1278 ops = rcu_dereference(inet_offloads[proto]); 1279 if (likely(ops && ops->callbacks.gso_segment)) 1280 segs = ops->callbacks.gso_segment(skb, features); 1281 1282 if (IS_ERR_OR_NULL(segs)) 1283 goto out; 1284 1285 skb = segs; 1286 do { 1287 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); 1288 if (udpfrag) { 1289 iph->id = htons(id); 1290 iph->frag_off = htons(offset >> 3); 1291 if (skb->next != NULL) 1292 iph->frag_off |= htons(IP_MF); 1293 offset += skb->len - nhoff - ihl; 1294 } else { 1295 iph->id = htons(id++); 1296 } 1297 iph->tot_len = htons(skb->len - nhoff); 1298 ip_send_check(iph); 1299 if (encap) 1300 skb_reset_inner_headers(skb); 1301 skb->network_header = (u8 *)iph - skb->head; 1302 } while ((skb = skb->next)); 1303 1304out: 1305 return segs; 1306} 1307 1308static struct sk_buff **inet_gro_receive(struct sk_buff **head, 1309 struct sk_buff *skb) 1310{ 1311 const struct net_offload *ops; 1312 struct sk_buff **pp = NULL; 1313 struct sk_buff *p; 1314 const struct iphdr *iph; 1315 unsigned int hlen; 1316 unsigned int off; 1317 unsigned int id; 1318 int flush = 1; 1319 int proto; 1320 1321 off = skb_gro_offset(skb); 1322 hlen = off + sizeof(*iph); 1323 iph = skb_gro_header_fast(skb, off); 1324 if (skb_gro_header_hard(skb, hlen)) { 1325 iph = skb_gro_header_slow(skb, hlen, off); 1326 if (unlikely(!iph)) 1327 goto out; 1328 } 1329 1330 proto = iph->protocol; 1331 1332 rcu_read_lock(); 1333 ops = rcu_dereference(inet_offloads[proto]); 1334 if (!ops || !ops->callbacks.gro_receive) 1335 goto out_unlock; 1336 1337 if (*(u8 *)iph != 0x45) 1338 goto out_unlock; 1339 1340 if (unlikely(ip_fast_csum((u8 *)iph, 5))) 1341 goto out_unlock; 1342 1343 id = ntohl(*(__be32 *)&iph->id); 1344 flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); 1345 id >>= 16; 1346 1347 for (p = *head; p; p = p->next) { 1348 struct iphdr *iph2; 1349 1350 if (!NAPI_GRO_CB(p)->same_flow) 1351 continue; 1352 1353 iph2 = (struct iphdr *)(p->data + off); 1354 /* The above works because, with the exception of the top 1355 * (inner most) layer, we only aggregate pkts with the same 1356 * hdr length so all the hdrs we'll need to verify will start 1357 * at the same offset. 1358 */ 1359 if ((iph->protocol ^ iph2->protocol) | 1360 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | 1361 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { 1362 NAPI_GRO_CB(p)->same_flow = 0; 1363 continue; 1364 } 1365 1366 /* All fields must match except length and checksum. */ 1367 NAPI_GRO_CB(p)->flush |= 1368 (iph->ttl ^ iph2->ttl) | 1369 (iph->tos ^ iph2->tos) | 1370 ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); 1371 1372 /* Save the IP ID check to be included later when we get to 1373 * the transport layer so only the inner most IP ID is checked. 1374 * This is because some GSO/TSO implementations do not 1375 * correctly increment the IP ID for the outer hdrs. 1376 */ 1377 NAPI_GRO_CB(p)->flush_id = 1378 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); 1379 NAPI_GRO_CB(p)->flush |= flush; 1380 } 1381 1382 NAPI_GRO_CB(skb)->flush |= flush; 1383 skb_set_network_header(skb, off); 1384 /* The above will be needed by the transport layer if there is one 1385 * immediately following this IP hdr. 1386 */ 1387 1388 /* Note : No need to call skb_gro_postpull_rcsum() here, 1389 * as we already checked checksum over ipv4 header was 0 1390 */ 1391 skb_gro_pull(skb, sizeof(*iph)); 1392 skb_set_transport_header(skb, skb_gro_offset(skb)); 1393 1394 pp = ops->callbacks.gro_receive(head, skb); 1395 1396out_unlock: 1397 rcu_read_unlock(); 1398 1399out: 1400 NAPI_GRO_CB(skb)->flush |= flush; 1401 1402 return pp; 1403} 1404 1405int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) 1406{ 1407 if (sk->sk_family == AF_INET) 1408 return ip_recv_error(sk, msg, len, addr_len); 1409#if IS_ENABLED(CONFIG_IPV6) 1410 if (sk->sk_family == AF_INET6) 1411 return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); 1412#endif 1413 return -EINVAL; 1414} 1415 1416static int inet_gro_complete(struct sk_buff *skb, int nhoff) 1417{ 1418 __be16 newlen = htons(skb->len - nhoff); 1419 struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); 1420 const struct net_offload *ops; 1421 int proto = iph->protocol; 1422 int err = -ENOSYS; 1423 1424 if (skb->encapsulation) 1425 skb_set_inner_network_header(skb, nhoff); 1426 1427 csum_replace2(&iph->check, iph->tot_len, newlen); 1428 iph->tot_len = newlen; 1429 1430 rcu_read_lock(); 1431 ops = rcu_dereference(inet_offloads[proto]); 1432 if (WARN_ON(!ops || !ops->callbacks.gro_complete)) 1433 goto out_unlock; 1434 1435 /* Only need to add sizeof(*iph) to get to the next hdr below 1436 * because any hdr with option will have been flushed in 1437 * inet_gro_receive(). 1438 */ 1439 err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); 1440 1441out_unlock: 1442 rcu_read_unlock(); 1443 1444 return err; 1445} 1446 1447int inet_ctl_sock_create(struct sock **sk, unsigned short family, 1448 unsigned short type, unsigned char protocol, 1449 struct net *net) 1450{ 1451 struct socket *sock; 1452 int rc = sock_create_kern(family, type, protocol, &sock); 1453 1454 if (rc == 0) { 1455 *sk = sock->sk; 1456 (*sk)->sk_allocation = GFP_ATOMIC; 1457 /* 1458 * Unhash it so that IP input processing does not even see it, 1459 * we do not wish this socket to see incoming packets. 1460 */ 1461 (*sk)->sk_prot->unhash(*sk); 1462 1463 sk_change_net(*sk, net); 1464 } 1465 return rc; 1466} 1467EXPORT_SYMBOL_GPL(inet_ctl_sock_create); 1468 1469unsigned long snmp_fold_field(void __percpu *mib, int offt) 1470{ 1471 unsigned long res = 0; 1472 int i; 1473 1474 for_each_possible_cpu(i) 1475 res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); 1476 return res; 1477} 1478EXPORT_SYMBOL_GPL(snmp_fold_field); 1479 1480#if BITS_PER_LONG==32 1481 1482u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) 1483{ 1484 u64 res = 0; 1485 int cpu; 1486 1487 for_each_possible_cpu(cpu) { 1488 void *bhptr; 1489 struct u64_stats_sync *syncp; 1490 u64 v; 1491 unsigned int start; 1492 1493 bhptr = per_cpu_ptr(mib, cpu); 1494 syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); 1495 do { 1496 start = u64_stats_fetch_begin_irq(syncp); 1497 v = *(((u64 *) bhptr) + offt); 1498 } while (u64_stats_fetch_retry_irq(syncp, start)); 1499 1500 res += v; 1501 } 1502 return res; 1503} 1504EXPORT_SYMBOL_GPL(snmp_fold_field64); 1505#endif 1506 1507#ifdef CONFIG_IP_MULTICAST 1508static const struct net_protocol igmp_protocol = { 1509 .handler = igmp_rcv, 1510 .netns_ok = 1, 1511}; 1512#endif 1513 1514static const struct net_protocol tcp_protocol = { 1515 .early_demux = tcp_v4_early_demux, 1516 .handler = tcp_v4_rcv, 1517 .err_handler = tcp_v4_err, 1518 .no_policy = 1, 1519 .netns_ok = 1, 1520 .icmp_strict_tag_validation = 1, 1521}; 1522 1523static const struct net_protocol udp_protocol = { 1524 .early_demux = udp_v4_early_demux, 1525 .handler = udp_rcv, 1526 .err_handler = udp_err, 1527 .no_policy = 1, 1528 .netns_ok = 1, 1529}; 1530 1531static const struct net_protocol icmp_protocol = { 1532 .handler = icmp_rcv, 1533 .err_handler = icmp_err, 1534 .no_policy = 1, 1535 .netns_ok = 1, 1536}; 1537 1538static __net_init int ipv4_mib_init_net(struct net *net) 1539{ 1540 int i; 1541 1542 net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); 1543 if (!net->mib.tcp_statistics) 1544 goto err_tcp_mib; 1545 net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); 1546 if (!net->mib.ip_statistics) 1547 goto err_ip_mib; 1548 1549 for_each_possible_cpu(i) { 1550 struct ipstats_mib *af_inet_stats; 1551 af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); 1552 u64_stats_init(&af_inet_stats->syncp); 1553 } 1554 1555 net->mib.net_statistics = alloc_percpu(struct linux_mib); 1556 if (!net->mib.net_statistics) 1557 goto err_net_mib; 1558 net->mib.udp_statistics = alloc_percpu(struct udp_mib); 1559 if (!net->mib.udp_statistics) 1560 goto err_udp_mib; 1561 net->mib.udplite_statistics = alloc_percpu(struct udp_mib); 1562 if (!net->mib.udplite_statistics) 1563 goto err_udplite_mib; 1564 net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); 1565 if (!net->mib.icmp_statistics) 1566 goto err_icmp_mib; 1567 net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), 1568 GFP_KERNEL); 1569 if (!net->mib.icmpmsg_statistics) 1570 goto err_icmpmsg_mib; 1571 1572 tcp_mib_init(net); 1573 return 0; 1574 1575err_icmpmsg_mib: 1576 free_percpu(net->mib.icmp_statistics); 1577err_icmp_mib: 1578 free_percpu(net->mib.udplite_statistics); 1579err_udplite_mib: 1580 free_percpu(net->mib.udp_statistics); 1581err_udp_mib: 1582 free_percpu(net->mib.net_statistics); 1583err_net_mib: 1584 free_percpu(net->mib.ip_statistics); 1585err_ip_mib: 1586 free_percpu(net->mib.tcp_statistics); 1587err_tcp_mib: 1588 return -ENOMEM; 1589} 1590 1591static __net_exit void ipv4_mib_exit_net(struct net *net) 1592{ 1593 kfree(net->mib.icmpmsg_statistics); 1594 free_percpu(net->mib.icmp_statistics); 1595 free_percpu(net->mib.udplite_statistics); 1596 free_percpu(net->mib.udp_statistics); 1597 free_percpu(net->mib.net_statistics); 1598 free_percpu(net->mib.ip_statistics); 1599 free_percpu(net->mib.tcp_statistics); 1600} 1601 1602static __net_initdata struct pernet_operations ipv4_mib_ops = { 1603 .init = ipv4_mib_init_net, 1604 .exit = ipv4_mib_exit_net, 1605}; 1606 1607static int __init init_ipv4_mibs(void) 1608{ 1609 return register_pernet_subsys(&ipv4_mib_ops); 1610} 1611 1612static __net_init int inet_init_net(struct net *net) 1613{ 1614 /* 1615 * Set defaults for local port range 1616 */ 1617 seqlock_init(&net->ipv4.ip_local_ports.lock); 1618 net->ipv4.ip_local_ports.range[0] = 32768; 1619 net->ipv4.ip_local_ports.range[1] = 61000; 1620 1621 seqlock_init(&net->ipv4.ping_group_range.lock); 1622 /* 1623 * Sane defaults - nobody may create ping sockets. 1624 * Boot scripts should set this to distro-specific group. 1625 */ 1626 net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); 1627 net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); 1628 return 0; 1629} 1630 1631static __net_exit void inet_exit_net(struct net *net) 1632{ 1633} 1634 1635static __net_initdata struct pernet_operations af_inet_ops = { 1636 .init = inet_init_net, 1637 .exit = inet_exit_net, 1638}; 1639 1640static int __init init_inet_pernet_ops(void) 1641{ 1642 return register_pernet_subsys(&af_inet_ops); 1643} 1644 1645static int ipv4_proc_init(void); 1646 1647/* 1648 * IP protocol layer initialiser 1649 */ 1650 1651static struct packet_offload ip_packet_offload __read_mostly = { 1652 .type = cpu_to_be16(ETH_P_IP), 1653 .callbacks = { 1654 .gso_segment = inet_gso_segment, 1655 .gro_receive = inet_gro_receive, 1656 .gro_complete = inet_gro_complete, 1657 }, 1658}; 1659 1660static const struct net_offload ipip_offload = { 1661 .callbacks = { 1662 .gso_segment = inet_gso_segment, 1663 .gro_receive = inet_gro_receive, 1664 .gro_complete = inet_gro_complete, 1665 }, 1666}; 1667 1668static int __init ipv4_offload_init(void) 1669{ 1670 /* 1671 * Add offloads 1672 */ 1673 if (udpv4_offload_init() < 0) 1674 pr_crit("%s: Cannot add UDP protocol offload\n", __func__); 1675 if (tcpv4_offload_init() < 0) 1676 pr_crit("%s: Cannot add TCP protocol offload\n", __func__); 1677 1678 dev_add_offload(&ip_packet_offload); 1679 inet_add_offload(&ipip_offload, IPPROTO_IPIP); 1680 return 0; 1681} 1682 1683fs_initcall(ipv4_offload_init); 1684 1685static struct packet_type ip_packet_type __read_mostly = { 1686 .type = cpu_to_be16(ETH_P_IP), 1687 .func = ip_rcv, 1688}; 1689 1690static int __init inet_init(void) 1691{ 1692 struct inet_protosw *q; 1693 struct list_head *r; 1694 int rc = -EINVAL; 1695 1696 BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); 1697 1698 rc = proto_register(&tcp_prot, 1); 1699 if (rc) 1700 goto out; 1701 1702 rc = proto_register(&udp_prot, 1); 1703 if (rc) 1704 goto out_unregister_tcp_proto; 1705 1706 rc = proto_register(&raw_prot, 1); 1707 if (rc) 1708 goto out_unregister_udp_proto; 1709 1710 rc = proto_register(&ping_prot, 1); 1711 if (rc) 1712 goto out_unregister_raw_proto; 1713 1714 /* 1715 * Tell SOCKET that we are alive... 1716 */ 1717 1718 (void)sock_register(&inet_family_ops); 1719 1720#ifdef CONFIG_SYSCTL 1721 ip_static_sysctl_init(); 1722#endif 1723 1724 /* 1725 * Add all the base protocols. 1726 */ 1727 1728 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) 1729 pr_crit("%s: Cannot add ICMP protocol\n", __func__); 1730 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) 1731 pr_crit("%s: Cannot add UDP protocol\n", __func__); 1732 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) 1733 pr_crit("%s: Cannot add TCP protocol\n", __func__); 1734#ifdef CONFIG_IP_MULTICAST 1735 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) 1736 pr_crit("%s: Cannot add IGMP protocol\n", __func__); 1737#endif 1738 1739 /* Register the socket-side information for inet_create. */ 1740 for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) 1741 INIT_LIST_HEAD(r); 1742 1743 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) 1744 inet_register_protosw(q); 1745 1746 /* 1747 * Set the ARP module up 1748 */ 1749 1750 arp_init(); 1751 1752 /* 1753 * Set the IP module up 1754 */ 1755 1756 ip_init(); 1757 1758 tcp_v4_init(); 1759 1760 /* Setup TCP slab cache for open requests. */ 1761 tcp_init(); 1762 1763 /* Setup UDP memory threshold */ 1764 udp_init(); 1765 1766 /* Add UDP-Lite (RFC 3828) */ 1767 udplite4_register(); 1768 1769 ping_init(); 1770 1771 /* 1772 * Set the ICMP layer up 1773 */ 1774 1775 if (icmp_init() < 0) 1776 panic("Failed to create the ICMP control socket.\n"); 1777 1778 /* 1779 * Initialise the multicast router 1780 */ 1781#if defined(CONFIG_IP_MROUTE) 1782 if (ip_mr_init()) 1783 pr_crit("%s: Cannot init ipv4 mroute\n", __func__); 1784#endif 1785 1786 if (init_inet_pernet_ops()) 1787 pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); 1788 /* 1789 * Initialise per-cpu ipv4 mibs 1790 */ 1791 1792 if (init_ipv4_mibs()) 1793 pr_crit("%s: Cannot init ipv4 mibs\n", __func__); 1794 1795 ipv4_proc_init(); 1796 1797 ipfrag_init(); 1798 1799 dev_add_pack(&ip_packet_type); 1800 1801 rc = 0; 1802out: 1803 return rc; 1804out_unregister_raw_proto: 1805 proto_unregister(&raw_prot); 1806out_unregister_udp_proto: 1807 proto_unregister(&udp_prot); 1808out_unregister_tcp_proto: 1809 proto_unregister(&tcp_prot); 1810 goto out; 1811} 1812 1813fs_initcall(inet_init); 1814 1815/* ------------------------------------------------------------------------ */ 1816 1817#ifdef CONFIG_PROC_FS 1818static int __init ipv4_proc_init(void) 1819{ 1820 int rc = 0; 1821 1822 if (raw_proc_init()) 1823 goto out_raw; 1824 if (tcp4_proc_init()) 1825 goto out_tcp; 1826 if (udp4_proc_init()) 1827 goto out_udp; 1828 if (ping_proc_init()) 1829 goto out_ping; 1830 if (ip_misc_proc_init()) 1831 goto out_misc; 1832out: 1833 return rc; 1834out_misc: 1835 ping_proc_exit(); 1836out_ping: 1837 udp4_proc_exit(); 1838out_udp: 1839 tcp4_proc_exit(); 1840out_tcp: 1841 raw_proc_exit(); 1842out_raw: 1843 rc = -ENOMEM; 1844 goto out; 1845} 1846 1847#else /* CONFIG_PROC_FS */ 1848static int __init ipv4_proc_init(void) 1849{ 1850 return 0; 1851} 1852#endif /* CONFIG_PROC_FS */ 1853 1854MODULE_ALIAS_NETPROTO(PF_INET); 1855 1856