sock.c revision 5dbe7c178d3f0a4634f088d9e729f1909b9ddcd1
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#if defined(CONFIG_MEMCG_KMEM)
struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);
#endif

/*
 * Make lock validator output more readable.
 * (we pre-construct these strings build-time, so that runtime
 * initialization of socket locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations there is a risk that the user of the
	 * socket cannot make forward progress due to exceeding the rmem
	 * limits. By rights, sk_clear_memalloc() should only be called
	 * on sockets being torn down but warn and reset the accounting if
	 * that assumption breaks.
	 */
	if (WARN_ON(sk->sk_forward_alloc))
		sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
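
/* Example (illustrative sketch only, not part of this file): a caller
 * that moves swap traffic, such as a hypothetical swap-over-network
 * transport, would mark its socket right after creating it and rely on
 * sk_clear_memalloc() during teardown. "myswap_*" names are made up:
 *
 *	static int myswap_open_transport(struct socket **res)
 *	{
 *		struct socket *sock;
 *		int err;
 *
 *		err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *		if (err < 0)
 *			return err;
 *		sk_set_memalloc(sock->sk);	// may now dip into reserves
 *		*res = sock;
 *		return 0;
 *	}
 */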

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
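
/* Example (sketch, not part of this file): a datagram protocol's input
 * path typically hands a received skb to the matching socket with
 * sock_queue_rcv_skb() and turns its errors into a drop.
 * "myproto_lookup_sock" is a hypothetical demux helper:
 *
 *	static int myproto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = myproto_lookup_sock(skb);
 *
 *		if (!sk) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			// -ENOMEM, -ENOBUFS or a filter verdict
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */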

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
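
/* Example (sketch): a transmit path revalidates its cached route before
 * each send; sk_dst_check() hands back NULL once the dst is obsolete,
 * and the caller then re-resolves. The route lookup below is schematic
 * and the cookie value is protocol specific:
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		dst = myproto_route_output(sk);	// hypothetical re-lookup
 *		if (IS_ERR(dst))
 *			return PTR_ERR(dst);
 *		sk_dst_set(sk, dst);
 *	}
 */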

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason...
	 * -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
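
/* Example (user space, illustrative only): the SO_RCVBUF doubling done
 * in sock_setsockopt() above is visible to applications; the value read
 * back includes the kernel overhead allowance:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is now 131072 (2 * 65536), assuming the request was
 *	// within sysctl_rmem_max and above SOCK_MIN_RCVBUF
 */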

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	classid = task_cls_classid(current);
	if (classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
#endif

#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
void sock_update_netprioidx(struct sock *sk)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}
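
/* Example (sketch, not part of this file): a protocol's af create hook
 * pairs sk_alloc() with sk_free() on its error paths. "PF_MYPROTO",
 * "myproto_prot" and "myproto_init_private" are hypothetical:
 *
 *	static int myproto_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_prot);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		if (myproto_init_private(sk)) {
 *			sk_free(sk);
 *			return -ENOMEM;
 *		}
 *		return 0;
 *	}
 */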

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
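
/* Example (sketch): transmit paths charge each skb to the socket with
 * skb_set_owner_w(), which sets skb->destructor = sock_wfree; the
 * accounting is then undone automatically when the skb is freed,
 * wherever in the stack that happens:
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);  // charges skb->truesize to sk_wmem_alloc
 *	...
 *	kfree_skb(skb);            // sock_wfree() uncharges, may wake writers
 */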

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

void sock_edemux(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

#ifdef CONFIG_INET
	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_put(inet_twsk(sk));
	else
#endif
		sock_put(sk);
}
EXPORT_SYMBOL(sock_edemux);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
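
/* Example (sketch): per-socket option state lives in "omem" and is thus
 * bounded by sysctl_optmem_max. The free must quote the same size as the
 * allocation, since only the caller knows it. "struct my_opts" is a
 * hypothetical placeholder:
 *
 *	struct my_opts *opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *
 *	if (!opts)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));	// uncharges sk_omem_alloc
 */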

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;
	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;

	err = -EMSGSIZE;
	if (npages > MAX_SKB_FRAGS)
		goto failure;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
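
/* Example (sketch): a datagram sendmsg() implementation can let
 * sock_alloc_send_skb() handle both sndbuf accounting and blocking;
 * MSG_DONTWAIT maps onto the "noblock" argument. "MYPROTO_HLEN" is a
 * hypothetical header size:
 *
 *	skb = sock_alloc_send_skb(sk, len + MYPROTO_HLEN,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;	// -EAGAIN, -EPIPE, -EMSGSIZE or sk_err
 *	skb_reserve(skb, MYPROTO_HLEN);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 */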

/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	int order;

	if (pfrag->page) {
		if (atomic_read(&pfrag->page->_count) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset < pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	/* We restrict high order allocations to users that can afford to wait */
	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;

	do {
		gfp_t gfp = sk->sk_allocation;

		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN;
		pfrag->page = alloc_pages(gfp, order);
		if (likely(pfrag->page)) {
			pfrag->offset = 0;
			pfrag->size = PAGE_SIZE << order;
			return true;
		}
	} while (--order >= 0);

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
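
/* Example (sketch): a recvmsg() slow path built on sk_wait_data(); the
 * socket is locked on entry, and sk_wait_event() drops and retakes the
 * lock around the sleep:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */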

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
			allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
	    allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
	    (allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
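
/* Example (sketch): a connectionless family fills the operations it does
 * not implement with the stubs above. "PF_MYPROTO" and the myproto_*
 * handlers are hypothetical:
 *
 *	static const struct proto_ops myproto_dgram_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.release	= myproto_release,
 *		.bind		= myproto_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */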
rcu_dereference(sk->sk_wq); 2163 if (wq_has_sleeper(wq)) 2164 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | 2165 POLLRDNORM | POLLRDBAND); 2166 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2167 rcu_read_unlock(); 2168} 2169 2170static void sock_def_write_space(struct sock *sk) 2171{ 2172 struct socket_wq *wq; 2173 2174 rcu_read_lock(); 2175 2176 /* Do not wake up a writer until he can make "significant" 2177 * progress. --DaveM 2178 */ 2179 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2180 wq = rcu_dereference(sk->sk_wq); 2181 if (wq_has_sleeper(wq)) 2182 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | 2183 POLLWRNORM | POLLWRBAND); 2184 2185 /* Should agree with poll, otherwise some programs break */ 2186 if (sock_writeable(sk)) 2187 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2188 } 2189 2190 rcu_read_unlock(); 2191} 2192 2193static void sock_def_destruct(struct sock *sk) 2194{ 2195 kfree(sk->sk_protinfo); 2196} 2197 2198void sk_send_sigurg(struct sock *sk) 2199{ 2200 if (sk->sk_socket && sk->sk_socket->file) 2201 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2202 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2203} 2204EXPORT_SYMBOL(sk_send_sigurg); 2205 2206void sk_reset_timer(struct sock *sk, struct timer_list *timer, 2207 unsigned long expires) 2208{ 2209 if (!mod_timer(timer, expires)) 2210 sock_hold(sk); 2211} 2212EXPORT_SYMBOL(sk_reset_timer); 2213 2214void sk_stop_timer(struct sock *sk, struct timer_list *timer) 2215{ 2216 if (del_timer(timer)) 2217 __sock_put(sk); 2218} 2219EXPORT_SYMBOL(sk_stop_timer); 2220 2221void sock_init_data(struct socket *sock, struct sock *sk) 2222{ 2223 skb_queue_head_init(&sk->sk_receive_queue); 2224 skb_queue_head_init(&sk->sk_write_queue); 2225 skb_queue_head_init(&sk->sk_error_queue); 2226#ifdef CONFIG_NET_DMA 2227 skb_queue_head_init(&sk->sk_async_wait_queue); 2228#endif 2229 2230 sk->sk_send_head = NULL; 2231 2232 init_timer(&sk->sk_timer); 2233 2234 sk->sk_allocation = GFP_KERNEL; 2235 sk->sk_rcvbuf = sysctl_rmem_default; 2236 sk->sk_sndbuf = sysctl_wmem_default; 2237 sk->sk_state = TCP_CLOSE; 2238 sk_set_socket(sk, sock); 2239 2240 sock_set_flag(sk, SOCK_ZAPPED); 2241 2242 if (sock) { 2243 sk->sk_type = sock->type; 2244 sk->sk_wq = sock->wq; 2245 sock->sk = sk; 2246 } else 2247 sk->sk_wq = NULL; 2248 2249 spin_lock_init(&sk->sk_dst_lock); 2250 rwlock_init(&sk->sk_callback_lock); 2251 lockdep_set_class_and_name(&sk->sk_callback_lock, 2252 af_callback_keys + sk->sk_family, 2253 af_family_clock_key_strings[sk->sk_family]); 2254 2255 sk->sk_state_change = sock_def_wakeup; 2256 sk->sk_data_ready = sock_def_readable; 2257 sk->sk_write_space = sock_def_write_space; 2258 sk->sk_error_report = sock_def_error_report; 2259 sk->sk_destruct = sock_def_destruct; 2260 2261 sk->sk_frag.page = NULL; 2262 sk->sk_frag.offset = 0; 2263 sk->sk_peek_off = -1; 2264 2265 sk->sk_peer_pid = NULL; 2266 sk->sk_peer_cred = NULL; 2267 sk->sk_write_pending = 0; 2268 sk->sk_rcvlowat = 1; 2269 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 2270 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 2271 2272 sk->sk_stamp = ktime_set(-1L, 0); 2273 2274 /* 2275 * Before updating sk_refcnt, we must commit prior changes to memory 2276 * (Documentation/RCU/rculist_nulls.txt for details) 2277 */ 2278 smp_wmb(); 2279 atomic_set(&sk->sk_refcnt, 1); 2280 atomic_set(&sk->sk_drops, 0); 2281} 2282EXPORT_SYMBOL(sock_init_data); 2283
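/*
 * The locking primitives below implement the "owned" socket lock. A
 * minimal usage sketch (lock_sock() is the lock_sock_nested(sk, 0)
 * wrapper from net/sock.h; the pattern is illustrative only):
 *
 *	lock_sock(sk);
 *	... update socket state; this section may sleep ...
 *	release_sock(sk);
 *
 * For short sections that never block, lock_sock_fast() pairs with
 * unlock_sock_fast() from net/sock.h and skips taking ownership when
 * the lock is uncontended:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... brief, non-sleeping work on sk ...
 *	unlock_sock_fast(sk, slow);
 */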
2284void lock_sock_nested(struct sock *sk, int subclass) 2285{ 2286 might_sleep(); 2287 spin_lock_bh(&sk->sk_lock.slock); 2288 if (sk->sk_lock.owned) 2289 __lock_sock(sk); 2290 sk->sk_lock.owned = 1; 2291 spin_unlock(&sk->sk_lock.slock); 2292 /* 2293 * The sk_lock has mutex_lock() semantics here: 2294 */ 2295 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 2296 local_bh_enable(); 2297} 2298EXPORT_SYMBOL(lock_sock_nested); 2299 2300void release_sock(struct sock *sk) 2301{ 2302 /* 2303 * The sk_lock has mutex_unlock() semantics: 2304 */ 2305 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); 2306 2307 spin_lock_bh(&sk->sk_lock.slock); 2308 if (sk->sk_backlog.tail) 2309 __release_sock(sk); 2310 2311 if (sk->sk_prot->release_cb) 2312 sk->sk_prot->release_cb(sk); 2313 2314 sk->sk_lock.owned = 0; 2315 if (waitqueue_active(&sk->sk_lock.wq)) 2316 wake_up(&sk->sk_lock.wq); 2317 spin_unlock_bh(&sk->sk_lock.slock); 2318} 2319EXPORT_SYMBOL(release_sock); 2320 2321/** 2322 * lock_sock_fast - fast version of lock_sock 2323 * @sk: socket 2324 * 2325 * This version should be used for very small sections, where the process won't block. 2326 * Returns false if the fast path is taken: 2327 * sk_lock.slock locked, owned = 0, BH disabled. 2328 * Returns true if the slow path is taken: 2329 * sk_lock.slock unlocked, owned = 1, BH enabled. 2330 */ 2331bool lock_sock_fast(struct sock *sk) 2332{ 2333 might_sleep(); 2334 spin_lock_bh(&sk->sk_lock.slock); 2335 2336 if (!sk->sk_lock.owned) 2337 /* 2338 * Note: the fast path returns with BH disabled and slock held 2339 */ 2340 return false; 2341 2342 __lock_sock(sk); 2343 sk->sk_lock.owned = 1; 2344 spin_unlock(&sk->sk_lock.slock); 2345 /* 2346 * The sk_lock has mutex_lock() semantics here: 2347 */ 2348 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 2349 local_bh_enable(); 2350 return true; 2351} 2352EXPORT_SYMBOL(lock_sock_fast); 2353 2354int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 2355{ 2356 struct timeval tv; 2357 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2358 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2359 tv = ktime_to_timeval(sk->sk_stamp); 2360 if (tv.tv_sec == -1) 2361 return -ENOENT; 2362 if (tv.tv_sec == 0) { 2363 sk->sk_stamp = ktime_get_real(); 2364 tv = ktime_to_timeval(sk->sk_stamp); 2365 } 2366 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; 2367} 2368EXPORT_SYMBOL(sock_get_timestamp); 2369 2370int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 2371{ 2372 struct timespec ts; 2373 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2374 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2375 ts = ktime_to_timespec(sk->sk_stamp); 2376 if (ts.tv_sec == -1) 2377 return -ENOENT; 2378 if (ts.tv_sec == 0) { 2379 sk->sk_stamp = ktime_get_real(); 2380 ts = ktime_to_timespec(sk->sk_stamp); 2381 } 2382 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; 2383} 2384EXPORT_SYMBOL(sock_get_timestampns); 2385 2386void sock_enable_timestamp(struct sock *sk, int flag) 2387{ 2388 if (!sock_flag(sk, flag)) { 2389 unsigned long previous_flags = sk->sk_flags; 2390 2391 sock_set_flag(sk, flag); 2392 /* 2393 * We just set one of the two flags that require net 2394 * time stamping, but time stamping might already have 2395 * been on because of the other one 2396 */ 2397 if (!(previous_flags & SK_FLAGS_TIMESTAMP)) 2398 net_enable_timestamp(); 2399 } 2400} 2401 2402/* 2403 * Get a socket option on a socket. 2404 * 2405 * FIX: POSIX 1003.1g is very ambiguous here. It states that 2406 * asynchronous errors should be reported by getsockopt. We assume 2407 * this means if you specify SO_ERROR (otherwise what's the point of it). 2408 */
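/*
 * The sock_common_*() wrappers below simply forward to the underlying
 * struct proto. As an illustrative sketch (the "foo" name is
 * hypothetical), a protocol whose proto implements setsockopt,
 * getsockopt and recvmsg can typically reuse them verbatim in its
 * proto_ops:
 *
 *	static const struct proto_ops foo_stream_ops = {
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *	};
 */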
2409int sock_common_getsockopt(struct socket *sock, int level, int optname, 2410 char __user *optval, int __user *optlen) 2411{ 2412 struct sock *sk = sock->sk; 2413 2414 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2415} 2416EXPORT_SYMBOL(sock_common_getsockopt); 2417 2418#ifdef CONFIG_COMPAT 2419int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 2420 char __user *optval, int __user *optlen) 2421{ 2422 struct sock *sk = sock->sk; 2423 2424 if (sk->sk_prot->compat_getsockopt != NULL) 2425 return sk->sk_prot->compat_getsockopt(sk, level, optname, 2426 optval, optlen); 2427 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2428} 2429EXPORT_SYMBOL(compat_sock_common_getsockopt); 2430#endif 2431 2432int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, 2433 struct msghdr *msg, size_t size, int flags) 2434{ 2435 struct sock *sk = sock->sk; 2436 int addr_len = 0; 2437 int err; 2438 2439 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, 2440 flags & ~MSG_DONTWAIT, &addr_len); 2441 if (err >= 0) 2442 msg->msg_namelen = addr_len; 2443 return err; 2444} 2445EXPORT_SYMBOL(sock_common_recvmsg); 2446 2447/* 2448 * Set socket options on a socket. 2449 */ 2450int sock_common_setsockopt(struct socket *sock, int level, int optname, 2451 char __user *optval, unsigned int optlen) 2452{ 2453 struct sock *sk = sock->sk; 2454 2455 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2456} 2457EXPORT_SYMBOL(sock_common_setsockopt); 2458 2459#ifdef CONFIG_COMPAT 2460int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 2461 char __user *optval, unsigned int optlen) 2462{ 2463 struct sock *sk = sock->sk; 2464 2465 if (sk->sk_prot->compat_setsockopt != NULL) 2466 return sk->sk_prot->compat_setsockopt(sk, level, optname, 2467 optval, optlen); 2468 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2469} 2470EXPORT_SYMBOL(compat_sock_common_setsockopt); 2471#endif 2472 2473void sk_common_release(struct sock *sk) 2474{ 2475 if (sk->sk_prot->destroy) 2476 sk->sk_prot->destroy(sk); 2477 2478 /* 2479 * Observation: when sk_common_release is called, processes have 2480 * no access to the socket, but the network stack still does. 2481 * Step one, detach it from networking: 2482 * 2483 * A. Remove from hash tables. 2484 */ 2485 2486 sk->sk_prot->unhash(sk); 2487 2488 /* 2489 * At this point the socket cannot receive new packets, but it is 2490 * possible that some packets are in flight, because some CPU runs the 2491 * receiver and did the hash table lookup before we unhashed the socket. 2492 * They will reach the receive queue and be purged by the socket destructor. 2493 * 2494 * Also, we still have packets pending on the receive queue and probably 2495 * our own packets waiting in device queues. sock_destroy will drain the 2496 * receive queue, but transmitted packets will delay socket destruction 2497 * until the last reference is released.
2498 */ 2499 2500 sock_orphan(sk); 2501 2502 xfrm_sk_free_policy(sk); 2503 2504 sk_refcnt_debug_release(sk); 2505 2506 if (sk->sk_frag.page) { 2507 put_page(sk->sk_frag.page); 2508 sk->sk_frag.page = NULL; 2509 } 2510 2511 sock_put(sk); 2512} 2513EXPORT_SYMBOL(sk_common_release); 2514 2515#ifdef CONFIG_PROC_FS 2516#define PROTO_INUSE_NR 64 /* should be enough for the first time */ 2517struct prot_inuse { 2518 int val[PROTO_INUSE_NR]; 2519}; 2520 2521static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 2522 2523#ifdef CONFIG_NET_NS 2524void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2525{ 2526 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); 2527} 2528EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2529 2530int sock_prot_inuse_get(struct net *net, struct proto *prot) 2531{ 2532 int cpu, idx = prot->inuse_idx; 2533 int res = 0; 2534 2535 for_each_possible_cpu(cpu) 2536 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; 2537 2538 return res >= 0 ? res : 0; 2539} 2540EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2541 2542static int __net_init sock_inuse_init_net(struct net *net) 2543{ 2544 net->core.inuse = alloc_percpu(struct prot_inuse); 2545 return net->core.inuse ? 0 : -ENOMEM; 2546} 2547 2548static void __net_exit sock_inuse_exit_net(struct net *net) 2549{ 2550 free_percpu(net->core.inuse); 2551} 2552 2553static struct pernet_operations net_inuse_ops = { 2554 .init = sock_inuse_init_net, 2555 .exit = sock_inuse_exit_net, 2556}; 2557 2558static __init int net_inuse_init(void) 2559{ 2560 if (register_pernet_subsys(&net_inuse_ops)) 2561 panic("Cannot initialize net inuse counters"); 2562 2563 return 0; 2564} 2565 2566core_initcall(net_inuse_init); 2567#else 2568static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); 2569 2570void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2571{ 2572 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); 2573} 2574EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2575 2576int sock_prot_inuse_get(struct net *net, struct proto *prot) 2577{ 2578 int cpu, idx = prot->inuse_idx; 2579 int res = 0; 2580 2581 for_each_possible_cpu(cpu) 2582 res += per_cpu(prot_inuse, cpu).val[idx]; 2583 2584 return res >= 0 ? 
res : 0; 2585} 2586EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2587#endif 2588 2589static void assign_proto_idx(struct proto *prot) 2590{ 2591 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 2592 2593 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 2594 pr_err("PROTO_INUSE_NR exhausted\n"); 2595 return; 2596 } 2597 2598 set_bit(prot->inuse_idx, proto_inuse_idx); 2599} 2600 2601static void release_proto_idx(struct proto *prot) 2602{ 2603 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 2604 clear_bit(prot->inuse_idx, proto_inuse_idx); 2605} 2606#else 2607static inline void assign_proto_idx(struct proto *prot) 2608{ 2609} 2610 2611static inline void release_proto_idx(struct proto *prot) 2612{ 2613} 2614#endif 2615 2616int proto_register(struct proto *prot, int alloc_slab) 2617{ 2618 if (alloc_slab) { 2619 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, 2620 SLAB_HWCACHE_ALIGN | prot->slab_flags, 2621 NULL); 2622 2623 if (prot->slab == NULL) { 2624 pr_crit("%s: Can't create sock SLAB cache!\n", 2625 prot->name); 2626 goto out; 2627 } 2628 2629 if (prot->rsk_prot != NULL) { 2630 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); 2631 if (prot->rsk_prot->slab_name == NULL) 2632 goto out_free_sock_slab; 2633 2634 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, 2635 prot->rsk_prot->obj_size, 0, 2636 SLAB_HWCACHE_ALIGN, NULL); 2637 2638 if (prot->rsk_prot->slab == NULL) { 2639 pr_crit("%s: Can't create request sock SLAB cache!\n", 2640 prot->name); 2641 goto out_free_request_sock_slab_name; 2642 } 2643 } 2644 2645 if (prot->twsk_prot != NULL) { 2646 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 2647 2648 if (prot->twsk_prot->twsk_slab_name == NULL) 2649 goto out_free_request_sock_slab; 2650 2651 prot->twsk_prot->twsk_slab = 2652 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 2653 prot->twsk_prot->twsk_obj_size, 2654 0, 2655 SLAB_HWCACHE_ALIGN | 2656 prot->slab_flags, 2657 NULL); 2658 if (prot->twsk_prot->twsk_slab == NULL) 2659 goto out_free_timewait_sock_slab_name; 2660 } 2661 } 2662 2663 mutex_lock(&proto_list_mutex); 2664 list_add(&prot->node, &proto_list); 2665 assign_proto_idx(prot); 2666 mutex_unlock(&proto_list_mutex); 2667 return 0; 2668 2669out_free_timewait_sock_slab_name: 2670 kfree(prot->twsk_prot->twsk_slab_name); 2671out_free_request_sock_slab: 2672 if (prot->rsk_prot && prot->rsk_prot->slab) { 2673 kmem_cache_destroy(prot->rsk_prot->slab); 2674 prot->rsk_prot->slab = NULL; 2675 } 2676out_free_request_sock_slab_name: 2677 if (prot->rsk_prot) 2678 kfree(prot->rsk_prot->slab_name); 2679out_free_sock_slab: 2680 kmem_cache_destroy(prot->slab); 2681 prot->slab = NULL; 2682out: 2683 return -ENOBUFS; 2684} 2685EXPORT_SYMBOL(proto_register); 2686 2687void proto_unregister(struct proto *prot) 2688{ 2689 mutex_lock(&proto_list_mutex); 2690 release_proto_idx(prot); 2691 list_del(&prot->node); 2692 mutex_unlock(&proto_list_mutex); 2693 2694 if (prot->slab != NULL) { 2695 kmem_cache_destroy(prot->slab); 2696 prot->slab = NULL; 2697 } 2698 2699 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { 2700 kmem_cache_destroy(prot->rsk_prot->slab); 2701 kfree(prot->rsk_prot->slab_name); 2702 prot->rsk_prot->slab = NULL; 2703 } 2704 2705 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 2706 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 2707 kfree(prot->twsk_prot->twsk_slab_name); 2708 prot->twsk_prot->twsk_slab = NULL; 2709 } 2710} 
2711EXPORT_SYMBOL(proto_unregister); 2712 2713#ifdef CONFIG_PROC_FS 2714static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 2715 __acquires(proto_list_mutex) 2716{ 2717 mutex_lock(&proto_list_mutex); 2718 return seq_list_start_head(&proto_list, *pos); 2719} 2720 2721static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2722{ 2723 return seq_list_next(v, &proto_list, pos); 2724} 2725 2726static void proto_seq_stop(struct seq_file *seq, void *v) 2727 __releases(proto_list_mutex) 2728{ 2729 mutex_unlock(&proto_list_mutex); 2730} 2731 2732static char proto_method_implemented(const void *method) 2733{ 2734 return method == NULL ? 'n' : 'y'; 2735} 2736static long sock_prot_memory_allocated(struct proto *proto) 2737{ 2738 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 2739} 2740 2741static char *sock_prot_memory_pressure(struct proto *proto) 2742{ 2743 return proto->memory_pressure != NULL ? 2744 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 2745} 2746 2747static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 2748{ 2749 2750 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 2751 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 2752 proto->name, 2753 proto->obj_size, 2754 sock_prot_inuse_get(seq_file_net(seq), proto), 2755 sock_prot_memory_allocated(proto), 2756 sock_prot_memory_pressure(proto), 2757 proto->max_header, 2758 proto->slab == NULL ? "no" : "yes", 2759 module_name(proto->owner), 2760 proto_method_implemented(proto->close), 2761 proto_method_implemented(proto->connect), 2762 proto_method_implemented(proto->disconnect), 2763 proto_method_implemented(proto->accept), 2764 proto_method_implemented(proto->ioctl), 2765 proto_method_implemented(proto->init), 2766 proto_method_implemented(proto->destroy), 2767 proto_method_implemented(proto->shutdown), 2768 proto_method_implemented(proto->setsockopt), 2769 proto_method_implemented(proto->getsockopt), 2770 proto_method_implemented(proto->sendmsg), 2771 proto_method_implemented(proto->recvmsg), 2772 proto_method_implemented(proto->sendpage), 2773 proto_method_implemented(proto->bind), 2774 proto_method_implemented(proto->backlog_rcv), 2775 proto_method_implemented(proto->hash), 2776 proto_method_implemented(proto->unhash), 2777 proto_method_implemented(proto->get_port), 2778 proto_method_implemented(proto->enter_memory_pressure)); 2779} 2780 2781static int proto_seq_show(struct seq_file *seq, void *v) 2782{ 2783 if (v == &proto_list) 2784 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 2785 "protocol", 2786 "size", 2787 "sockets", 2788 "memory", 2789 "press", 2790 "maxhdr", 2791 "slab", 2792 "module", 2793 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 2794 else 2795 proto_seq_printf(seq, list_entry(v, struct proto, node)); 2796 return 0; 2797} 2798 2799static const struct seq_operations proto_seq_ops = { 2800 .start = proto_seq_start, 2801 .next = proto_seq_next, 2802 .stop = proto_seq_stop, 2803 .show = proto_seq_show, 2804}; 2805 2806static int proto_seq_open(struct inode *inode, struct file *file) 2807{ 2808 return seq_open_net(inode, file, &proto_seq_ops, 2809 sizeof(struct seq_net_private)); 2810} 2811 2812static const struct file_operations proto_seq_fops = { 2813 .owner = THIS_MODULE, 2814 .open = proto_seq_open, 2815 .read = seq_read, 2816 .llseek = seq_lseek, 2817 .release = seq_release_net, 2818}; 2819 2820static __net_init int proto_init_net(struct net *net) 2821{ 2822 if 
(!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) 2823 return -ENOMEM; 2824 2825 return 0; 2826} 2827 2828static __net_exit void proto_exit_net(struct net *net) 2829{ 2830 remove_proc_entry("protocols", net->proc_net); 2831} 2832 2833 2834static __net_initdata struct pernet_operations proto_net_ops = { 2835 .init = proto_init_net, 2836 .exit = proto_exit_net, 2837}; 2838 2839static int __init proto_init(void) 2840{ 2841 return register_pernet_subsys(&proto_net_ops); 2842} 2843 2844subsys_initcall(proto_init); 2845 2846#endif /* PROC_FS */ 2847