sock.c revision d59577b6ffd313d0ab3be39cb1ab47e29bdc9182
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
	"slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
	"clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
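
/*
 * Illustrative arithmetic (not from this file): SKB_TRUESIZE(256) folds
 * the aligned struct sk_buff and struct skb_shared_info overhead into
 * the 256 data bytes, so the default above amounts to "room for 256
 * queued 256-byte packets, overhead included" - a few hundred kilobytes
 * on typical builds, with the exact figure varying by architecture and
 * config. That variability is why the limit is derived from
 * SKB_TRUESIZE() instead of being hard-coded.
 */
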
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations there is a risk that the user of the
	 * socket cannot make forward progress due to exceeding the rmem
	 * limits. By rights, sk_clear_memalloc() should only be called
	 * on sockets being torn down, but warn and reset the accounting if
	 * that assumption breaks.
	 */
	if (WARN_ON(sk->sk_forward_alloc))
		sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
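
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * network storage protocol that must keep making progress while the
 * machine swaps over it would flag its kernel socket on setup and
 * unflag it on teardown:
 *
 *	sk_set_memalloc(sock->sk);	(setup: may dip into reserves)
 *	...
 *	sk_clear_memalloc(sock->sk);	(teardown: rebalances the
 *					 memalloc_socks static key)
 *
 * Pairing set/clear over the socket's lifetime is what keeps the
 * static key counter balanced.
 */
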
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
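
/*
 * Worked example (illustrative): with HZ == 1000, a user timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } passes the validation above and
 * sock_set_timeout() stores
 *
 *	*timeo_p = 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies
 *
 * i.e. 2.5 seconds, with the microseconds rounded up to the next tick.
 * A zero timeval leaves MAX_SCHEDULE_TIMEOUT in place, meaning "block
 * forever".
 */
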
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* We escape from the RCU-protected region; make sure we don't
	 * leak a non-refcounted dst.
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
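
/*
 * Illustrative caller (hypothetical, not from this file): a datagram
 * protocol's delivery path typically ends in sock_queue_rcv_skb() and
 * must free the skb itself on failure, since the callee does not:
 *
 *	static int example_deliver(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int rc = sock_queue_rcv_skb(sk, skb);
 *
 *		if (rc < 0) {
 *			kfree_skb(skb);
 *			return rc;
 *		}
 *		return 0;
 *	}
 *
 * -ENOMEM means the receive queue was full, -ENOBUFS means rmem could
 * not be scheduled, and any other value came from the socket filter.
 */
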
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
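
/*
 * Reference-counting note (illustrative): sk_receive_skb() ends with
 * sock_put(sk) on every path, i.e. it consumes one reference to the
 * socket. A hypothetical demux path is therefore expected to look like:
 *
 *	sk = example_lookup_sock(skb);		(returns a held socket)
 *	if (sk)
 *		return sk_receive_skb(sk, skb, 0);	(drops the hold)
 *	kfree_skb(skb);
 *
 * where example_lookup_sock() is a stand-in for a real protocol's
 * hash-table lookup that takes a reference.
 */
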
void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	struct net_device *dev;
	char devname[IFNAMSIZ];
	unsigned seq;

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

retry:
	seq = read_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
	ret = -ENODEV;
	if (!dev) {
		rcu_read_unlock();
		goto out;
	}

	strcpy(devname, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq))
		goto retry;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
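
/*
 * Illustrative user-space usage (hypothetical program, not kernel
 * code): binding a socket to "eth0" and unbinding it again. The caller
 * needs CAP_NET_RAW, per the ns_capable() check above:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5);
 *	...
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 *
 * An empty name (or a zero option length) clears sk_bound_dev_if, as
 * the comment in sock_setbindtodevice() describes.
 */
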
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this - BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this - BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
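
/*
 * Illustrative user-space consequence of the doubling above
 * (hypothetical values): an application that sets a 64 KB receive
 * buffer reads back 128 KB, because sk_rcvbuf stores twice the
 * requested value to cover struct sk_buff overhead:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	(val is now 131072, assuming sysctl_rmem_max allowed 65536)
 *
 * SO_SNDBUF behaves the same way via the set_sndbuf label.
 */
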
void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
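
/*
 * Illustrative use of SO_ERROR above (hypothetical user-space
 * sequence): after a non-blocking connect() signals completion via
 * poll(), the pending error is both returned and cleared (sock_error()
 * and the sk_err_soft fallback above each use xchg()):
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	(err is now 0 on success, or e.g. ECONNREFUSED on failure)
 *
 * A second call returns 0 unless a new error was recorded since.
 */
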
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to
 * zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
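
/*
 * Illustrative wiring (modelled on how UDP-style protocols use this
 * helper; "example" names are stand-ins): a proto whose sockets sit on
 * both an address hash and a port hash plugs the function above into
 * its struct proto so that __GFP_ZERO allocations skip both
 * nulls-marked .next pointers:
 *
 *	struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.obj_size	= sizeof(struct example_sock),
 *		.clear_sk	= sk_prot_clear_portaddr_nulls,
 *		...
 *	};
 *
 * Leaving those pointers intact is what keeps concurrent RCU lookups on
 * a SLAB_DESTROY_BY_RCU cache from chasing a wiped nulls marker.
 */
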
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
void sock_update_classid(struct sock *sk, struct task_struct *task)
{
	u32 classid;

	classid = task_cls_classid(task);
	if (classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
#endif

#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(task);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk, current);
		sock_update_netprioidx(sk, current);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
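
/*
 * Illustrative create path (hypothetical address family; PF_EXAMPLE and
 * example_proto are stand-ins): the usual pairing of sk_alloc() with
 * sock_init_data(), as most af_*.c create routines do:
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */
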
static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later.
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop a reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to a stopping namespace
 * is not an option.
 * Take a reference to a socket to remove it from the hash _alive_ and after
 * that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
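
/*
 * Illustrative caller contract (see the kernel-doc above; the names
 * here are hypothetical): the clone comes back locked, so even a caller
 * that immediately gives up must unlock it:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		example_init_child(newsk);	(protocol-specific setup)
 *		bh_unlock_sock(newsk);
 *	}
 *
 * This mirrors the pattern used when accepting connection children.
 */
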
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

void sock_edemux(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

#ifdef CONFIG_INET
	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_put(inet_twsk(sk));
	else
#endif
		sock_put(sk);
}
EXPORT_SYMBOL(sock_edemux);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
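
/*
 * Illustrative pairing (hypothetical option storage): allocation and
 * free must quote the same size so sk_omem_alloc balances:
 *
 *	struct example_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * This is the pattern used for per-socket option blobs charged against
 * sysctl_optmem_max.
 */
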
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;
	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;

	err = -EMSGSIZE;
	if (npages > MAX_SKB_FRAGS)
		goto failure;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
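
/*
 * Illustrative sendmsg path (hypothetical datagram protocol;
 * example_header_room stands in for real protocol headroom): the
 * typical blocking allocation of a linear skb sized for one datagram:
 *
 *	skb = sock_alloc_send_skb(sk, len + example_header_room,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *
 * On failure, err already holds -EAGAIN, -EPIPE, a pending socket
 * error, or the signal-dependent value from sock_intr_errno().
 */
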
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	int order;

	if (pfrag->page) {
		if (atomic_read(&pfrag->page->_count) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset < pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	/* We restrict high order allocations to users that can afford to wait */
	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;

	do {
		gfp_t gfp = sk->sk_allocation;

		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN;
		pfrag->page = alloc_pages(gfp, order);
		if (likely(pfrag->page)) {
			pfrag->offset = 0;
			pfrag->size = PAGE_SIZE << order;
			return true;
		}
	} while (--order >= 0);

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
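
/*
 * Illustrative consumer (hypothetical copy loop): protocols keep a
 * struct page_frag per socket and refill it on demand, carving chunks
 * off pfrag->page at pfrag->offset:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *	int copy;
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOBUFS;	(or wait for memory)
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy "copy" bytes to page_address(pfrag->page) + pfrag->offset)
 *	pfrag->offset += copy;
 *
 * The single-reference test above lets a fully consumed page be reused
 * in place instead of being reallocated.
 */
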
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
			allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
			allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
			(allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
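
/*
 * Worked example (illustrative, assuming SK_MEM_QUANTUM == PAGE_SIZE ==
 * 4096): scheduling size = 3000 bytes gives amt = sk_mem_pages(3000) =
 * 1, so sk_forward_alloc grows by 4096 and memory_allocated by one
 * page. Once the caller charges the actual 3000 bytes, the 1096-byte
 * remainder stays in sk_forward_alloc as prepaid credit for later
 * charges; __sk_mem_reclaim() above returns only whole quanta to
 * memory_allocated and keeps the sub-quantum remainder.
 */
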
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
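/*
 * Illustrative sketch, not part of this file: a protocol family that
 * supports only a subset of operations can plug the holes in its
 * proto_ops with the stubs above.  example_proto_ops is a hypothetical
 * name; a real family must also supply at least .release and usually
 * its own .sendmsg/.recvmsg.
 */
#if 0	/* example only, not compiled */
static const struct proto_ops example_proto_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
#endif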
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
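/*
 * Illustrative sketch, not part of this file: an address family's
 * ->create() handler typically calls sock_init_data() right after
 * sk_alloc(), then overrides whichever default callbacks it needs.
 * example_create() and example_proto are hypothetical names.
 */
#if 0	/* example only, not compiled */
static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* queues, timers, default callbacks */
	sk->sk_protocol = protocol;
	/* e.g. override sk->sk_data_ready or sk->sk_destruct here */
	return 0;
}
#endif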
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 * Returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: BH must stay disabled on this fast path.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
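/*
 * Illustrative sketch, not part of this file: the intended pairing for
 * lock_sock_fast() above.  The bool it returns must be handed back to
 * unlock_sock_fast(), which either drops the spinlock (fast path) or
 * performs a full release_sock() (slow path).  example_peek_len() is a
 * hypothetical name.
 */
#if 0	/* example only, not compiled */
static int example_peek_len(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int len = skb_queue_len(&sk->sk_receive_queue);

	unlock_sock_fast(sk, slow);
	return len;
}
#endif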
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still does.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
2508 */ 2509 2510 sock_orphan(sk); 2511 2512 xfrm_sk_free_policy(sk); 2513 2514 sk_refcnt_debug_release(sk); 2515 2516 if (sk->sk_frag.page) { 2517 put_page(sk->sk_frag.page); 2518 sk->sk_frag.page = NULL; 2519 } 2520 2521 sock_put(sk); 2522} 2523EXPORT_SYMBOL(sk_common_release); 2524 2525#ifdef CONFIG_PROC_FS 2526#define PROTO_INUSE_NR 64 /* should be enough for the first time */ 2527struct prot_inuse { 2528 int val[PROTO_INUSE_NR]; 2529}; 2530 2531static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 2532 2533#ifdef CONFIG_NET_NS 2534void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2535{ 2536 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); 2537} 2538EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2539 2540int sock_prot_inuse_get(struct net *net, struct proto *prot) 2541{ 2542 int cpu, idx = prot->inuse_idx; 2543 int res = 0; 2544 2545 for_each_possible_cpu(cpu) 2546 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; 2547 2548 return res >= 0 ? res : 0; 2549} 2550EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2551 2552static int __net_init sock_inuse_init_net(struct net *net) 2553{ 2554 net->core.inuse = alloc_percpu(struct prot_inuse); 2555 return net->core.inuse ? 0 : -ENOMEM; 2556} 2557 2558static void __net_exit sock_inuse_exit_net(struct net *net) 2559{ 2560 free_percpu(net->core.inuse); 2561} 2562 2563static struct pernet_operations net_inuse_ops = { 2564 .init = sock_inuse_init_net, 2565 .exit = sock_inuse_exit_net, 2566}; 2567 2568static __init int net_inuse_init(void) 2569{ 2570 if (register_pernet_subsys(&net_inuse_ops)) 2571 panic("Cannot initialize net inuse counters"); 2572 2573 return 0; 2574} 2575 2576core_initcall(net_inuse_init); 2577#else 2578static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); 2579 2580void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2581{ 2582 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); 2583} 2584EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2585 2586int sock_prot_inuse_get(struct net *net, struct proto *prot) 2587{ 2588 int cpu, idx = prot->inuse_idx; 2589 int res = 0; 2590 2591 for_each_possible_cpu(cpu) 2592 res += per_cpu(prot_inuse, cpu).val[idx]; 2593 2594 return res >= 0 ? 
res : 0; 2595} 2596EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2597#endif 2598 2599static void assign_proto_idx(struct proto *prot) 2600{ 2601 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 2602 2603 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 2604 pr_err("PROTO_INUSE_NR exhausted\n"); 2605 return; 2606 } 2607 2608 set_bit(prot->inuse_idx, proto_inuse_idx); 2609} 2610 2611static void release_proto_idx(struct proto *prot) 2612{ 2613 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 2614 clear_bit(prot->inuse_idx, proto_inuse_idx); 2615} 2616#else 2617static inline void assign_proto_idx(struct proto *prot) 2618{ 2619} 2620 2621static inline void release_proto_idx(struct proto *prot) 2622{ 2623} 2624#endif 2625 2626int proto_register(struct proto *prot, int alloc_slab) 2627{ 2628 if (alloc_slab) { 2629 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, 2630 SLAB_HWCACHE_ALIGN | prot->slab_flags, 2631 NULL); 2632 2633 if (prot->slab == NULL) { 2634 pr_crit("%s: Can't create sock SLAB cache!\n", 2635 prot->name); 2636 goto out; 2637 } 2638 2639 if (prot->rsk_prot != NULL) { 2640 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); 2641 if (prot->rsk_prot->slab_name == NULL) 2642 goto out_free_sock_slab; 2643 2644 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, 2645 prot->rsk_prot->obj_size, 0, 2646 SLAB_HWCACHE_ALIGN, NULL); 2647 2648 if (prot->rsk_prot->slab == NULL) { 2649 pr_crit("%s: Can't create request sock SLAB cache!\n", 2650 prot->name); 2651 goto out_free_request_sock_slab_name; 2652 } 2653 } 2654 2655 if (prot->twsk_prot != NULL) { 2656 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 2657 2658 if (prot->twsk_prot->twsk_slab_name == NULL) 2659 goto out_free_request_sock_slab; 2660 2661 prot->twsk_prot->twsk_slab = 2662 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 2663 prot->twsk_prot->twsk_obj_size, 2664 0, 2665 SLAB_HWCACHE_ALIGN | 2666 prot->slab_flags, 2667 NULL); 2668 if (prot->twsk_prot->twsk_slab == NULL) 2669 goto out_free_timewait_sock_slab_name; 2670 } 2671 } 2672 2673 mutex_lock(&proto_list_mutex); 2674 list_add(&prot->node, &proto_list); 2675 assign_proto_idx(prot); 2676 mutex_unlock(&proto_list_mutex); 2677 return 0; 2678 2679out_free_timewait_sock_slab_name: 2680 kfree(prot->twsk_prot->twsk_slab_name); 2681out_free_request_sock_slab: 2682 if (prot->rsk_prot && prot->rsk_prot->slab) { 2683 kmem_cache_destroy(prot->rsk_prot->slab); 2684 prot->rsk_prot->slab = NULL; 2685 } 2686out_free_request_sock_slab_name: 2687 if (prot->rsk_prot) 2688 kfree(prot->rsk_prot->slab_name); 2689out_free_sock_slab: 2690 kmem_cache_destroy(prot->slab); 2691 prot->slab = NULL; 2692out: 2693 return -ENOBUFS; 2694} 2695EXPORT_SYMBOL(proto_register); 2696 2697void proto_unregister(struct proto *prot) 2698{ 2699 mutex_lock(&proto_list_mutex); 2700 release_proto_idx(prot); 2701 list_del(&prot->node); 2702 mutex_unlock(&proto_list_mutex); 2703 2704 if (prot->slab != NULL) { 2705 kmem_cache_destroy(prot->slab); 2706 prot->slab = NULL; 2707 } 2708 2709 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { 2710 kmem_cache_destroy(prot->rsk_prot->slab); 2711 kfree(prot->rsk_prot->slab_name); 2712 prot->rsk_prot->slab = NULL; 2713 } 2714 2715 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 2716 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 2717 kfree(prot->twsk_prot->twsk_slab_name); 2718 prot->twsk_prot->twsk_slab = NULL; 2719 } 2720} 
2721EXPORT_SYMBOL(proto_unregister); 2722 2723#ifdef CONFIG_PROC_FS 2724static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 2725 __acquires(proto_list_mutex) 2726{ 2727 mutex_lock(&proto_list_mutex); 2728 return seq_list_start_head(&proto_list, *pos); 2729} 2730 2731static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2732{ 2733 return seq_list_next(v, &proto_list, pos); 2734} 2735 2736static void proto_seq_stop(struct seq_file *seq, void *v) 2737 __releases(proto_list_mutex) 2738{ 2739 mutex_unlock(&proto_list_mutex); 2740} 2741 2742static char proto_method_implemented(const void *method) 2743{ 2744 return method == NULL ? 'n' : 'y'; 2745} 2746static long sock_prot_memory_allocated(struct proto *proto) 2747{ 2748 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 2749} 2750 2751static char *sock_prot_memory_pressure(struct proto *proto) 2752{ 2753 return proto->memory_pressure != NULL ? 2754 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 2755} 2756 2757static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 2758{ 2759 2760 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 2761 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 2762 proto->name, 2763 proto->obj_size, 2764 sock_prot_inuse_get(seq_file_net(seq), proto), 2765 sock_prot_memory_allocated(proto), 2766 sock_prot_memory_pressure(proto), 2767 proto->max_header, 2768 proto->slab == NULL ? "no" : "yes", 2769 module_name(proto->owner), 2770 proto_method_implemented(proto->close), 2771 proto_method_implemented(proto->connect), 2772 proto_method_implemented(proto->disconnect), 2773 proto_method_implemented(proto->accept), 2774 proto_method_implemented(proto->ioctl), 2775 proto_method_implemented(proto->init), 2776 proto_method_implemented(proto->destroy), 2777 proto_method_implemented(proto->shutdown), 2778 proto_method_implemented(proto->setsockopt), 2779 proto_method_implemented(proto->getsockopt), 2780 proto_method_implemented(proto->sendmsg), 2781 proto_method_implemented(proto->recvmsg), 2782 proto_method_implemented(proto->sendpage), 2783 proto_method_implemented(proto->bind), 2784 proto_method_implemented(proto->backlog_rcv), 2785 proto_method_implemented(proto->hash), 2786 proto_method_implemented(proto->unhash), 2787 proto_method_implemented(proto->get_port), 2788 proto_method_implemented(proto->enter_memory_pressure)); 2789} 2790 2791static int proto_seq_show(struct seq_file *seq, void *v) 2792{ 2793 if (v == &proto_list) 2794 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 2795 "protocol", 2796 "size", 2797 "sockets", 2798 "memory", 2799 "press", 2800 "maxhdr", 2801 "slab", 2802 "module", 2803 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 2804 else 2805 proto_seq_printf(seq, list_entry(v, struct proto, node)); 2806 return 0; 2807} 2808 2809static const struct seq_operations proto_seq_ops = { 2810 .start = proto_seq_start, 2811 .next = proto_seq_next, 2812 .stop = proto_seq_stop, 2813 .show = proto_seq_show, 2814}; 2815 2816static int proto_seq_open(struct inode *inode, struct file *file) 2817{ 2818 return seq_open_net(inode, file, &proto_seq_ops, 2819 sizeof(struct seq_net_private)); 2820} 2821 2822static const struct file_operations proto_seq_fops = { 2823 .owner = THIS_MODULE, 2824 .open = proto_seq_open, 2825 .read = seq_read, 2826 .llseek = seq_lseek, 2827 .release = seq_release_net, 2828}; 2829 2830static __net_init int proto_init_net(struct net *net) 2831{ 2832 if 
(!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) 2833 return -ENOMEM; 2834 2835 return 0; 2836} 2837 2838static __net_exit void proto_exit_net(struct net *net) 2839{ 2840 proc_net_remove(net, "protocols"); 2841} 2842 2843 2844static __net_initdata struct pernet_operations proto_net_ops = { 2845 .init = proto_init_net, 2846 .exit = proto_exit_net, 2847}; 2848 2849static int __init proto_init(void) 2850{ 2851 return register_pernet_subsys(&proto_net_ops); 2852} 2853 2854subsys_initcall(proto_init); 2855 2856#endif /* PROC_FS */ 2857
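/*
 * Illustrative note, not part of this file: once proto_init() has run,
 * the registration above makes the protocol table readable from
 * userspace.  The column layout comes from proto_seq_show() and
 * proto_seq_printf(); the sample values below are made up:
 *
 *   $ cat /proc/net/protocols
 *   protocol  size sockets  memory press maxhdr  slab module     cl co di ...
 *   TCP       1648      4       1 no      320    yes  kernel      y  y  y ...
 */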