sock.c revision 5bc1421e34ecfe0bd4b26dc3232b7d5e25179144
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
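
/*
 * For illustration: the string tables above are consumed when a socket's
 * locks are registered with lockdep.  For an AF_INET socket, sock_lock_init()
 * below effectively does
 *
 *	sock_lock_init_class_and_name(sk,
 *			"slock-AF_INET", &af_family_slock_keys[AF_INET],
 *			"sk_lock-AF_INET", &af_family_keys[AF_INET]);
 *
 * so the lock validator reports each address family as its own lock class
 * instead of collapsing every socket lock into one.
 */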

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
	if (sock_flag(sk, flag)) {
		sock_reset_flag(sk, flag);
		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
			net_disable_timestamp();
		}
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
	 * number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);
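
	/*
	 * Note: skb_set_owner_r() charges skb->truesize to sk_rmem_alloc
	 * and sets skb->destructor to sock_rfree(), which reverses the
	 * charge when the skb is freed (see sock_rfree() below).
	 */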
	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from the rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
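
/*
 * For illustration: a typical caller revalidates its cached route before
 * use and falls back to a fresh lookup when the cookie check fails,
 * roughly
 *
 *	dst = __sk_dst_check(sk, cookie);
 *	if (dst == NULL)
 *		dst = <protocol-specific route lookup, then __sk_dst_set()>;
 */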
static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't and, if you think
		   about it, this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;
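
	/*
	 * Example (userspace, illustrative): because the value is doubled
	 * on the way in, a later getsockopt() reports twice the requested
	 * size (assuming 4096 is within sysctl_wmem_max and above the
	 * minimum):
	 *
	 *	int val = 4096;
	 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
	 *	socklen_t len = sizeof(val);
	 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
	 *	// val == 8192
	 */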

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't and, if you think
		   about it, this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       SOCK_TIMESTAMPING_RX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;
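
	/*
	 * For illustration: with SO_TIMESTAMP enabled, userspace reads the
	 * stamp from the ancillary data returned by recvmsg(): a cmsg with
	 * cmsg_level == SOL_SOCKET and cmsg_type == SCM_TIMESTAMP carrying
	 * a struct timeval (SO_TIMESTAMPNS delivers SCM_TIMESTAMPNS with a
	 * struct timespec instead).
	 */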

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk)
{
	struct cgroup_netprio_state *state;
	if (in_interrupt())
		return;
	rcu_read_lock();
	state = task_netprio_state(current);
	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
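
/*
 * Note on the scheme above: sk_alloc() initializes sk_wmem_alloc to 1, so
 * the socket itself holds one unit of write allocation.  sk_free() only
 * drops that unit; if transmitted packets are still in flight, the last
 * sock_wfree() (below) sees sk_wmem_alloc reach zero and performs the
 * actual __sk_free().
 */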

/*
 * Last sock_put should drop a reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to the stopping
 * namespace is not an option.
 * Take a reference to the socket to remove it from the hash _alive_ and
 * after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			percpu_counter_inc(newsk->sk_prot->sockets_allocated);

		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
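
/*
 * Note: @force bypasses the sndbuf check above, e.g. for packets that
 * must not fail just because the send buffer happens to be full; the skb
 * is still charged to sk_wmem_alloc via skb_set_owner_w().
 */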

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
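
/*
 * Typical use, for illustration: a blocking receive path loops along the
 * lines of
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * with @timeo updated in place to the time remaining.
 */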

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_long_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (prot->memory_pressure) {
		int alloc;

		if (!*prot->memory_pressure)
			return 1;
		alloc = percpu_counter_read_positive(prot->sockets_allocated);
		if (prot->sysctl_mem[2] > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_long_sub(amt, prot->memory_allocated);
	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	struct proto *prot = sk->sk_prot;

	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
		   prot->memory_allocated);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (prot->memory_pressure && *prot->memory_pressure &&
	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
		*prot->memory_pressure = 0;
}
EXPORT_SYMBOL(__sk_mem_reclaim);
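
/*
 * Accounting granularity, for illustration: charges are made in whole
 * SK_MEM_QUANTUM (one page) units.  With 4 KiB pages, scheduling 6000
 * bytes reserves sk_mem_pages(6000) = 2 quanta, i.e. 8192 bytes of
 * sk_forward_alloc; __sk_mem_reclaim() later returns the whole quanta to
 * memory_allocated and keeps only the sub-quantum remainder.
 */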
1807 */ 1808 1809int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 1810{ 1811 return -EOPNOTSUPP; 1812} 1813EXPORT_SYMBOL(sock_no_bind); 1814 1815int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 1816 int len, int flags) 1817{ 1818 return -EOPNOTSUPP; 1819} 1820EXPORT_SYMBOL(sock_no_connect); 1821 1822int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 1823{ 1824 return -EOPNOTSUPP; 1825} 1826EXPORT_SYMBOL(sock_no_socketpair); 1827 1828int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) 1829{ 1830 return -EOPNOTSUPP; 1831} 1832EXPORT_SYMBOL(sock_no_accept); 1833 1834int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 1835 int *len, int peer) 1836{ 1837 return -EOPNOTSUPP; 1838} 1839EXPORT_SYMBOL(sock_no_getname); 1840 1841unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) 1842{ 1843 return 0; 1844} 1845EXPORT_SYMBOL(sock_no_poll); 1846 1847int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 1848{ 1849 return -EOPNOTSUPP; 1850} 1851EXPORT_SYMBOL(sock_no_ioctl); 1852 1853int sock_no_listen(struct socket *sock, int backlog) 1854{ 1855 return -EOPNOTSUPP; 1856} 1857EXPORT_SYMBOL(sock_no_listen); 1858 1859int sock_no_shutdown(struct socket *sock, int how) 1860{ 1861 return -EOPNOTSUPP; 1862} 1863EXPORT_SYMBOL(sock_no_shutdown); 1864 1865int sock_no_setsockopt(struct socket *sock, int level, int optname, 1866 char __user *optval, unsigned int optlen) 1867{ 1868 return -EOPNOTSUPP; 1869} 1870EXPORT_SYMBOL(sock_no_setsockopt); 1871 1872int sock_no_getsockopt(struct socket *sock, int level, int optname, 1873 char __user *optval, int __user *optlen) 1874{ 1875 return -EOPNOTSUPP; 1876} 1877EXPORT_SYMBOL(sock_no_getsockopt); 1878 1879int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, 1880 size_t len) 1881{ 1882 return -EOPNOTSUPP; 1883} 1884EXPORT_SYMBOL(sock_no_sendmsg); 1885 1886int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, 1887 size_t len, int flags) 1888{ 1889 return -EOPNOTSUPP; 1890} 1891EXPORT_SYMBOL(sock_no_recvmsg); 1892 1893int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 1894{ 1895 /* Mirror missing mmap method error code */ 1896 return -ENODEV; 1897} 1898EXPORT_SYMBOL(sock_no_mmap); 1899 1900ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 1901{ 1902 ssize_t res; 1903 struct msghdr msg = {.msg_flags = flags}; 1904 struct kvec iov; 1905 char *kaddr = kmap(page); 1906 iov.iov_base = kaddr + offset; 1907 iov.iov_len = size; 1908 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 1909 kunmap(page); 1910 return res; 1911} 1912EXPORT_SYMBOL(sock_no_sendpage); 1913 1914/* 1915 * Default Socket Callbacks 1916 */ 1917 1918static void sock_def_wakeup(struct sock *sk) 1919{ 1920 struct socket_wq *wq; 1921 1922 rcu_read_lock(); 1923 wq = rcu_dereference(sk->sk_wq); 1924 if (wq_has_sleeper(wq)) 1925 wake_up_interruptible_all(&wq->wait); 1926 rcu_read_unlock(); 1927} 1928 1929static void sock_def_error_report(struct sock *sk) 1930{ 1931 struct socket_wq *wq; 1932 1933 rcu_read_lock(); 1934 wq = rcu_dereference(sk->sk_wq); 1935 if (wq_has_sleeper(wq)) 1936 wake_up_interruptible_poll(&wq->wait, POLLERR); 1937 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 1938 rcu_read_unlock(); 1939} 1940 1941static void sock_def_readable(struct sock *sk, int len) 1942{ 1943 struct socket_wq *wq; 1944 1945 rcu_read_lock(); 1946 wq = 
rcu_dereference(sk->sk_wq); 1947 if (wq_has_sleeper(wq)) 1948 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | 1949 POLLRDNORM | POLLRDBAND); 1950 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 1951 rcu_read_unlock(); 1952} 1953 1954static void sock_def_write_space(struct sock *sk) 1955{ 1956 struct socket_wq *wq; 1957 1958 rcu_read_lock(); 1959 1960 /* Do not wake up a writer until he can make "significant" 1961 * progress. --DaveM 1962 */ 1963 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 1964 wq = rcu_dereference(sk->sk_wq); 1965 if (wq_has_sleeper(wq)) 1966 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | 1967 POLLWRNORM | POLLWRBAND); 1968 1969 /* Should agree with poll, otherwise some programs break */ 1970 if (sock_writeable(sk)) 1971 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 1972 } 1973 1974 rcu_read_unlock(); 1975} 1976 1977static void sock_def_destruct(struct sock *sk) 1978{ 1979 kfree(sk->sk_protinfo); 1980} 1981 1982void sk_send_sigurg(struct sock *sk) 1983{ 1984 if (sk->sk_socket && sk->sk_socket->file) 1985 if (send_sigurg(&sk->sk_socket->file->f_owner)) 1986 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 1987} 1988EXPORT_SYMBOL(sk_send_sigurg); 1989 1990void sk_reset_timer(struct sock *sk, struct timer_list* timer, 1991 unsigned long expires) 1992{ 1993 if (!mod_timer(timer, expires)) 1994 sock_hold(sk); 1995} 1996EXPORT_SYMBOL(sk_reset_timer); 1997 1998void sk_stop_timer(struct sock *sk, struct timer_list* timer) 1999{ 2000 if (timer_pending(timer) && del_timer(timer)) 2001 __sock_put(sk); 2002} 2003EXPORT_SYMBOL(sk_stop_timer); 2004 2005void sock_init_data(struct socket *sock, struct sock *sk) 2006{ 2007 skb_queue_head_init(&sk->sk_receive_queue); 2008 skb_queue_head_init(&sk->sk_write_queue); 2009 skb_queue_head_init(&sk->sk_error_queue); 2010#ifdef CONFIG_NET_DMA 2011 skb_queue_head_init(&sk->sk_async_wait_queue); 2012#endif 2013 2014 sk->sk_send_head = NULL; 2015 2016 init_timer(&sk->sk_timer); 2017 2018 sk->sk_allocation = GFP_KERNEL; 2019 sk->sk_rcvbuf = sysctl_rmem_default; 2020 sk->sk_sndbuf = sysctl_wmem_default; 2021 sk->sk_state = TCP_CLOSE; 2022 sk_set_socket(sk, sock); 2023 2024 sock_set_flag(sk, SOCK_ZAPPED); 2025 2026 if (sock) { 2027 sk->sk_type = sock->type; 2028 sk->sk_wq = sock->wq; 2029 sock->sk = sk; 2030 } else 2031 sk->sk_wq = NULL; 2032 2033 spin_lock_init(&sk->sk_dst_lock); 2034 rwlock_init(&sk->sk_callback_lock); 2035 lockdep_set_class_and_name(&sk->sk_callback_lock, 2036 af_callback_keys + sk->sk_family, 2037 af_family_clock_key_strings[sk->sk_family]); 2038 2039 sk->sk_state_change = sock_def_wakeup; 2040 sk->sk_data_ready = sock_def_readable; 2041 sk->sk_write_space = sock_def_write_space; 2042 sk->sk_error_report = sock_def_error_report; 2043 sk->sk_destruct = sock_def_destruct; 2044 2045 sk->sk_sndmsg_page = NULL; 2046 sk->sk_sndmsg_off = 0; 2047 2048 sk->sk_peer_pid = NULL; 2049 sk->sk_peer_cred = NULL; 2050 sk->sk_write_pending = 0; 2051 sk->sk_rcvlowat = 1; 2052 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 2053 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 2054 2055 sk->sk_stamp = ktime_set(-1L, 0); 2056 2057 /* 2058 * Before updating sk_refcnt, we must commit prior changes to memory 2059 * (Documentation/RCU/rculist_nulls.txt for details) 2060 */ 2061 smp_wmb(); 2062 atomic_set(&sk->sk_refcnt, 1); 2063 atomic_set(&sk->sk_drops, 0); 2064} 2065EXPORT_SYMBOL(sock_init_data); 2066 2067void lock_sock_nested(struct sock *sk, int subclass) 2068{ 2069 might_sleep(); 2070 spin_lock_bh(&sk->sk_lock.slock); 2071 if 

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path was taken: sk_lock.slock is locked,
 * owned = 0, BHs are disabled.
 *
 * Returns true if the slow path was taken: sk_lock.slock is unlocked,
 * owned = 1, BHs are enabled.
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Fast path: return with the spinlock held and BHs
		 * still disabled; unlock_sock_fast() undoes both.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
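
/*
 * Example (illustrative sketch; hypo_peek_queue_len() is hypothetical):
 * the return value of lock_sock_fast() must be fed back to
 * unlock_sock_fast() so the matching unlock path is taken.
 */
static int __maybe_unused hypo_peek_queue_len(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int len = skb_queue_len(&sk->sk_receive_queue);

	unlock_sock_fast(sk, slow);	/* spin_unlock_bh() or release_sock() */
	return len;
}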

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!sock_flag(sk,
				flag == SOCK_TIMESTAMP ?
				SOCK_TIMESTAMPING_RX_SOFTWARE :
				SOCK_TIMESTAMP))
			net_enable_timestamp();
	}
}
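
/*
 * Example (illustrative sketch; hypo_ioctl() is hypothetical):
 * sock_get_timestamp() and sock_get_timestampns() above typically back
 * the SIOCGSTAMP/SIOCGSTAMPNS ioctls from a protocol's ioctl handler.
 */
static int __maybe_unused hypo_ioctl(struct socket *sock, unsigned int cmd,
				     unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}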

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on a socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 *
	 * Step one: detach it from networking.
	 *
	 * A. Remove it from the hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
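
/*
 * Example (illustrative sketch; hypo_close() is hypothetical):
 * sk_common_release() is normally the tail of a protocol's close routine,
 * once any protocol-private teardown is done (compare raw_close() in
 * net/ipv4/raw.c).
 */
static void __maybe_unused hypo_close(struct sock *sk, long timeout)
{
	/* ... protocol-private teardown would go here ... */
	sk_common_release(sk);	/* destroy, unhash, orphan, final sock_put */
}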

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
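
/*
 * Example (illustrative sketch; the hypo_* names are hypothetical):
 * protocols pair these counters with their hash/unhash operations so the
 * per-protocol socket counts show up in /proc/net/protocols.
 */
static void __maybe_unused hypo_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void __maybe_unused hypo_unhash(struct sock *sk)
{
	/* ... remove sk from the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}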

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	write_unlock(&proto_list_lock);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
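
/*
 * Example (illustrative sketch; the hypo_* names are hypothetical): the
 * usual registration pattern from a protocol module's init/exit paths.
 * obj_size must cover the protocol's private socket structure, and
 * alloc_slab=1 asks proto_register() to create the slab cache.
 */
static struct proto hypo_proto __maybe_unused = {
	.name		= "HYPO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* normally sizeof(struct hypo_sock) */
};

static int __init __maybe_unused hypo_proto_init(void)
{
	return proto_register(&hypo_proto, 1);
}

static void __exit __maybe_unused hypo_proto_exit(void)
{
	proto_unregister(&hypo_proto);
}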

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */