sock.c revision 3b885787ea4112eaa80945999ea0901bf742707f
1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Generic socket support routines. Memory allocators, socket lock/release 7 * handler for protocols to use and generic option handler. 8 * 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 * 85 * 86 * This program is free software; you can redistribute it and/or 87 * modify it under the terms of the GNU General Public License 88 * as published by the Free Software Foundation; either version 89 * 2 of the License, or (at your option) any later version. 90 */ 91 92#include <linux/capability.h> 93#include <linux/errno.h> 94#include <linux/types.h> 95#include <linux/socket.h> 96#include <linux/in.h> 97#include <linux/kernel.h> 98#include <linux/module.h> 99#include <linux/proc_fs.h> 100#include <linux/seq_file.h> 101#include <linux/sched.h> 102#include <linux/timer.h> 103#include <linux/string.h> 104#include <linux/sockios.h> 105#include <linux/net.h> 106#include <linux/mm.h> 107#include <linux/slab.h> 108#include <linux/interrupt.h> 109#include <linux/poll.h> 110#include <linux/tcp.h> 111#include <linux/init.h> 112#include <linux/highmem.h> 113 114#include <asm/uaccess.h> 115#include <asm/system.h> 116 117#include <linux/netdevice.h> 118#include <net/protocol.h> 119#include <linux/skbuff.h> 120#include <net/net_namespace.h> 121#include <net/request_sock.h> 122#include <net/sock.h> 123#include <linux/net_tstamp.h> 124#include <net/xfrm.h> 125#include <linux/ipsec.h> 126 127#include <linux/filter.h> 128 129#ifdef CONFIG_INET 130#include <net/tcp.h> 131#endif 132 133/* 134 * Each address family might have different locking rules, so we have 135 * one slock key per address family: 136 */ 137static struct lock_class_key af_family_keys[AF_MAX]; 138static struct lock_class_key af_family_slock_keys[AF_MAX]; 139 140/* 141 * Make lock validator output more readable. (we pre-construct these 142 * strings build-time, so that runtime initialization of socket 143 * locks is fast): 144 */ 145static const char *const af_family_key_strings[AF_MAX+1] = { 146 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , 147 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", 148 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , 149 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , 150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , 151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , 152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , 153 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , 154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , 155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , 156 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , 157 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , 158 "sk_lock-AF_IEEE802154", 159 "sk_lock-AF_MAX" 160}; 161static const char *const af_family_slock_key_strings[AF_MAX+1] = { 162 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , 163 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", 164 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , 165 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , 166 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , 167 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , 168 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , 169 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , 170 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , 171 "slock-27" , "slock-28" , "slock-AF_CAN" , 172 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , 173 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , 174 "slock-AF_IEEE802154", 175 "slock-AF_MAX" 176}; 177static const char *const af_family_clock_key_strings[AF_MAX+1] = { 178 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , 179 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", 180 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , 181 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , 182 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , 183 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , 184 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , 185 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , 186 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , 187 "clock-27" , "clock-28" , "clock-AF_CAN" , 188 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 189 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , 190 "clock-AF_IEEE802154", 191 "clock-AF_MAX" 192}; 193 194/* 195 * sk_callback_lock locking rules are per-address-family, 196 * so split the lock classes by using a per-AF key: 197 */ 198static struct lock_class_key af_callback_keys[AF_MAX]; 199 200/* Take into consideration the size of the struct sk_buff overhead in the 201 * determination of these values, since that is non-constant across 202 * platforms. This makes socket queueing behavior and performance 203 * not depend upon such differences. 204 */ 205#define _SK_MEM_PACKETS 256 206#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) 207#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 208#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 209 210/* Run time adjustable parameters. */ 211__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 212__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 213__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 214__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 215 216/* Maximal space eaten by iovec or ancilliary data plus some space */ 217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 218EXPORT_SYMBOL(sysctl_optmem_max); 219 220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 221{ 222 struct timeval tv; 223 224 if (optlen < sizeof(tv)) 225 return -EINVAL; 226 if (copy_from_user(&tv, optval, sizeof(tv))) 227 return -EFAULT; 228 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 229 return -EDOM; 230 231 if (tv.tv_sec < 0) { 232 static int warned __read_mostly; 233 234 *timeo_p = 0; 235 if (warned < 10 && net_ratelimit()) { 236 warned++; 237 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " 238 "tries to set negative timeout\n", 239 current->comm, task_pid_nr(current)); 240 } 241 return 0; 242 } 243 *timeo_p = MAX_SCHEDULE_TIMEOUT; 244 if (tv.tv_sec == 0 && tv.tv_usec == 0) 245 return 0; 246 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 247 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); 248 return 0; 249} 250 251static void sock_warn_obsolete_bsdism(const char *name) 252{ 253 static int warned; 254 static char warncomm[TASK_COMM_LEN]; 255 if (strcmp(warncomm, current->comm) && warned < 5) { 256 strcpy(warncomm, current->comm); 257 printk(KERN_WARNING "process `%s' is using obsolete " 258 "%s SO_BSDCOMPAT\n", warncomm, name); 259 warned++; 260 } 261} 262 263static void sock_disable_timestamp(struct sock *sk, int flag) 264{ 265 if (sock_flag(sk, flag)) { 266 sock_reset_flag(sk, flag); 267 if (!sock_flag(sk, SOCK_TIMESTAMP) && 268 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { 269 net_disable_timestamp(); 270 } 271 } 272} 273 274 275int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 276{ 277 int err = 0; 278 int skb_len; 279 unsigned long flags; 280 struct sk_buff_head *list = &sk->sk_receive_queue; 281 282 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces 283 number of warnings when compiling with -W --ANK 284 */ 285 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 286 (unsigned)sk->sk_rcvbuf) { 287 err = -ENOMEM; 288 goto out; 289 } 290 291 err = sk_filter(sk, skb); 292 if (err) 293 goto out; 294 295 if (!sk_rmem_schedule(sk, skb->truesize)) { 296 err = -ENOBUFS; 297 goto out; 298 } 299 300 skb->dev = NULL; 301 skb_set_owner_r(skb, sk); 302 303 /* Cache the SKB length before we tack it onto the receive 304 * queue. Once it is added it no longer belongs to us and 305 * may be freed by other threads of control pulling packets 306 * from the queue. 307 */ 308 skb_len = skb->len; 309 310 spin_lock_irqsave(&list->lock, flags); 311 skb->dropcount = atomic_read(&sk->sk_drops); 312 __skb_queue_tail(list, skb); 313 spin_unlock_irqrestore(&list->lock, flags); 314 315 if (!sock_flag(sk, SOCK_DEAD)) 316 sk->sk_data_ready(sk, skb_len); 317out: 318 return err; 319} 320EXPORT_SYMBOL(sock_queue_rcv_skb); 321 322int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) 323{ 324 int rc = NET_RX_SUCCESS; 325 326 if (sk_filter(sk, skb)) 327 goto discard_and_relse; 328 329 skb->dev = NULL; 330 331 if (nested) 332 bh_lock_sock_nested(sk); 333 else 334 bh_lock_sock(sk); 335 if (!sock_owned_by_user(sk)) { 336 /* 337 * trylock + unlock semantics: 338 */ 339 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 340 341 rc = sk_backlog_rcv(sk, skb); 342 343 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); 344 } else 345 sk_add_backlog(sk, skb); 346 bh_unlock_sock(sk); 347out: 348 sock_put(sk); 349 return rc; 350discard_and_relse: 351 kfree_skb(skb); 352 goto out; 353} 354EXPORT_SYMBOL(sk_receive_skb); 355 356struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 357{ 358 struct dst_entry *dst = sk->sk_dst_cache; 359 360 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 361 sk->sk_dst_cache = NULL; 362 dst_release(dst); 363 return NULL; 364 } 365 366 return dst; 367} 368EXPORT_SYMBOL(__sk_dst_check); 369 370struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 371{ 372 struct dst_entry *dst = sk_dst_get(sk); 373 374 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 375 sk_dst_reset(sk); 376 dst_release(dst); 377 return NULL; 378 } 379 380 return dst; 381} 382EXPORT_SYMBOL(sk_dst_check); 383 384static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) 385{ 386 int ret = -ENOPROTOOPT; 387#ifdef CONFIG_NETDEVICES 388 struct net *net = sock_net(sk); 389 char devname[IFNAMSIZ]; 390 int index; 391 392 /* Sorry... */ 393 ret = -EPERM; 394 if (!capable(CAP_NET_RAW)) 395 goto out; 396 397 ret = -EINVAL; 398 if (optlen < 0) 399 goto out; 400 401 /* Bind this socket to a particular device like "eth0", 402 * as specified in the passed interface name. If the 403 * name is "" or the option length is zero the socket 404 * is not bound. 405 */ 406 if (optlen > IFNAMSIZ - 1) 407 optlen = IFNAMSIZ - 1; 408 memset(devname, 0, sizeof(devname)); 409 410 ret = -EFAULT; 411 if (copy_from_user(devname, optval, optlen)) 412 goto out; 413 414 if (devname[0] == '\0') { 415 index = 0; 416 } else { 417 struct net_device *dev = dev_get_by_name(net, devname); 418 419 ret = -ENODEV; 420 if (!dev) 421 goto out; 422 423 index = dev->ifindex; 424 dev_put(dev); 425 } 426 427 lock_sock(sk); 428 sk->sk_bound_dev_if = index; 429 sk_dst_reset(sk); 430 release_sock(sk); 431 432 ret = 0; 433 434out: 435#endif 436 437 return ret; 438} 439 440static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 441{ 442 if (valbool) 443 sock_set_flag(sk, bit); 444 else 445 sock_reset_flag(sk, bit); 446} 447 448/* 449 * This is meant for all protocols to use and covers goings on 450 * at the socket level. Everything here is generic. 451 */ 452 453int sock_setsockopt(struct socket *sock, int level, int optname, 454 char __user *optval, unsigned int optlen) 455{ 456 struct sock *sk = sock->sk; 457 int val; 458 int valbool; 459 struct linger ling; 460 int ret = 0; 461 462 /* 463 * Options without arguments 464 */ 465 466 if (optname == SO_BINDTODEVICE) 467 return sock_bindtodevice(sk, optval, optlen); 468 469 if (optlen < sizeof(int)) 470 return -EINVAL; 471 472 if (get_user(val, (int __user *)optval)) 473 return -EFAULT; 474 475 valbool = val ? 1 : 0; 476 477 lock_sock(sk); 478 479 switch (optname) { 480 case SO_DEBUG: 481 if (val && !capable(CAP_NET_ADMIN)) 482 ret = -EACCES; 483 else 484 sock_valbool_flag(sk, SOCK_DBG, valbool); 485 break; 486 case SO_REUSEADDR: 487 sk->sk_reuse = valbool; 488 break; 489 case SO_TYPE: 490 case SO_PROTOCOL: 491 case SO_DOMAIN: 492 case SO_ERROR: 493 ret = -ENOPROTOOPT; 494 break; 495 case SO_DONTROUTE: 496 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 497 break; 498 case SO_BROADCAST: 499 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 500 break; 501 case SO_SNDBUF: 502 /* Don't error on this BSD doesn't and if you think 503 about it this is right. Otherwise apps have to 504 play 'guess the biggest size' games. RCVBUF/SNDBUF 505 are treated in BSD as hints */ 506 507 if (val > sysctl_wmem_max) 508 val = sysctl_wmem_max; 509set_sndbuf: 510 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 511 if ((val * 2) < SOCK_MIN_SNDBUF) 512 sk->sk_sndbuf = SOCK_MIN_SNDBUF; 513 else 514 sk->sk_sndbuf = val * 2; 515 516 /* 517 * Wake up sending tasks if we 518 * upped the value. 519 */ 520 sk->sk_write_space(sk); 521 break; 522 523 case SO_SNDBUFFORCE: 524 if (!capable(CAP_NET_ADMIN)) { 525 ret = -EPERM; 526 break; 527 } 528 goto set_sndbuf; 529 530 case SO_RCVBUF: 531 /* Don't error on this BSD doesn't and if you think 532 about it this is right. Otherwise apps have to 533 play 'guess the biggest size' games. RCVBUF/SNDBUF 534 are treated in BSD as hints */ 535 536 if (val > sysctl_rmem_max) 537 val = sysctl_rmem_max; 538set_rcvbuf: 539 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 540 /* 541 * We double it on the way in to account for 542 * "struct sk_buff" etc. overhead. Applications 543 * assume that the SO_RCVBUF setting they make will 544 * allow that much actual data to be received on that 545 * socket. 546 * 547 * Applications are unaware that "struct sk_buff" and 548 * other overheads allocate from the receive buffer 549 * during socket buffer allocation. 550 * 551 * And after considering the possible alternatives, 552 * returning the value we actually used in getsockopt 553 * is the most desirable behavior. 554 */ 555 if ((val * 2) < SOCK_MIN_RCVBUF) 556 sk->sk_rcvbuf = SOCK_MIN_RCVBUF; 557 else 558 sk->sk_rcvbuf = val * 2; 559 break; 560 561 case SO_RCVBUFFORCE: 562 if (!capable(CAP_NET_ADMIN)) { 563 ret = -EPERM; 564 break; 565 } 566 goto set_rcvbuf; 567 568 case SO_KEEPALIVE: 569#ifdef CONFIG_INET 570 if (sk->sk_protocol == IPPROTO_TCP) 571 tcp_set_keepalive(sk, valbool); 572#endif 573 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 574 break; 575 576 case SO_OOBINLINE: 577 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 578 break; 579 580 case SO_NO_CHECK: 581 sk->sk_no_check = valbool; 582 break; 583 584 case SO_PRIORITY: 585 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 586 sk->sk_priority = val; 587 else 588 ret = -EPERM; 589 break; 590 591 case SO_LINGER: 592 if (optlen < sizeof(ling)) { 593 ret = -EINVAL; /* 1003.1g */ 594 break; 595 } 596 if (copy_from_user(&ling, optval, sizeof(ling))) { 597 ret = -EFAULT; 598 break; 599 } 600 if (!ling.l_onoff) 601 sock_reset_flag(sk, SOCK_LINGER); 602 else { 603#if (BITS_PER_LONG == 32) 604 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 605 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 606 else 607#endif 608 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 609 sock_set_flag(sk, SOCK_LINGER); 610 } 611 break; 612 613 case SO_BSDCOMPAT: 614 sock_warn_obsolete_bsdism("setsockopt"); 615 break; 616 617 case SO_PASSCRED: 618 if (valbool) 619 set_bit(SOCK_PASSCRED, &sock->flags); 620 else 621 clear_bit(SOCK_PASSCRED, &sock->flags); 622 break; 623 624 case SO_TIMESTAMP: 625 case SO_TIMESTAMPNS: 626 if (valbool) { 627 if (optname == SO_TIMESTAMP) 628 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 629 else 630 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 631 sock_set_flag(sk, SOCK_RCVTSTAMP); 632 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 633 } else { 634 sock_reset_flag(sk, SOCK_RCVTSTAMP); 635 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 636 } 637 break; 638 639 case SO_TIMESTAMPING: 640 if (val & ~SOF_TIMESTAMPING_MASK) { 641 ret = -EINVAL; 642 break; 643 } 644 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, 645 val & SOF_TIMESTAMPING_TX_HARDWARE); 646 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, 647 val & SOF_TIMESTAMPING_TX_SOFTWARE); 648 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, 649 val & SOF_TIMESTAMPING_RX_HARDWARE); 650 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 651 sock_enable_timestamp(sk, 652 SOCK_TIMESTAMPING_RX_SOFTWARE); 653 else 654 sock_disable_timestamp(sk, 655 SOCK_TIMESTAMPING_RX_SOFTWARE); 656 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, 657 val & SOF_TIMESTAMPING_SOFTWARE); 658 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, 659 val & SOF_TIMESTAMPING_SYS_HARDWARE); 660 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, 661 val & SOF_TIMESTAMPING_RAW_HARDWARE); 662 break; 663 664 case SO_RCVLOWAT: 665 if (val < 0) 666 val = INT_MAX; 667 sk->sk_rcvlowat = val ? : 1; 668 break; 669 670 case SO_RCVTIMEO: 671 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 672 break; 673 674 case SO_SNDTIMEO: 675 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 676 break; 677 678 case SO_ATTACH_FILTER: 679 ret = -EINVAL; 680 if (optlen == sizeof(struct sock_fprog)) { 681 struct sock_fprog fprog; 682 683 ret = -EFAULT; 684 if (copy_from_user(&fprog, optval, sizeof(fprog))) 685 break; 686 687 ret = sk_attach_filter(&fprog, sk); 688 } 689 break; 690 691 case SO_DETACH_FILTER: 692 ret = sk_detach_filter(sk); 693 break; 694 695 case SO_PASSSEC: 696 if (valbool) 697 set_bit(SOCK_PASSSEC, &sock->flags); 698 else 699 clear_bit(SOCK_PASSSEC, &sock->flags); 700 break; 701 case SO_MARK: 702 if (!capable(CAP_NET_ADMIN)) 703 ret = -EPERM; 704 else 705 sk->sk_mark = val; 706 break; 707 708 /* We implement the SO_SNDLOWAT etc to 709 not be settable (1003.1g 5.3) */ 710 case SO_RXQ_OVFL: 711 if (valbool) 712 sock_set_flag(sk, SOCK_RXQ_OVFL); 713 else 714 sock_reset_flag(sk, SOCK_RXQ_OVFL); 715 break; 716 default: 717 ret = -ENOPROTOOPT; 718 break; 719 } 720 release_sock(sk); 721 return ret; 722} 723EXPORT_SYMBOL(sock_setsockopt); 724 725 726int sock_getsockopt(struct socket *sock, int level, int optname, 727 char __user *optval, int __user *optlen) 728{ 729 struct sock *sk = sock->sk; 730 731 union { 732 int val; 733 struct linger ling; 734 struct timeval tm; 735 } v; 736 737 unsigned int lv = sizeof(int); 738 int len; 739 740 if (get_user(len, optlen)) 741 return -EFAULT; 742 if (len < 0) 743 return -EINVAL; 744 745 memset(&v, 0, sizeof(v)); 746 747 switch (optname) { 748 case SO_DEBUG: 749 v.val = sock_flag(sk, SOCK_DBG); 750 break; 751 752 case SO_DONTROUTE: 753 v.val = sock_flag(sk, SOCK_LOCALROUTE); 754 break; 755 756 case SO_BROADCAST: 757 v.val = !!sock_flag(sk, SOCK_BROADCAST); 758 break; 759 760 case SO_SNDBUF: 761 v.val = sk->sk_sndbuf; 762 break; 763 764 case SO_RCVBUF: 765 v.val = sk->sk_rcvbuf; 766 break; 767 768 case SO_REUSEADDR: 769 v.val = sk->sk_reuse; 770 break; 771 772 case SO_KEEPALIVE: 773 v.val = !!sock_flag(sk, SOCK_KEEPOPEN); 774 break; 775 776 case SO_TYPE: 777 v.val = sk->sk_type; 778 break; 779 780 case SO_PROTOCOL: 781 v.val = sk->sk_protocol; 782 break; 783 784 case SO_DOMAIN: 785 v.val = sk->sk_family; 786 break; 787 788 case SO_ERROR: 789 v.val = -sock_error(sk); 790 if (v.val == 0) 791 v.val = xchg(&sk->sk_err_soft, 0); 792 break; 793 794 case SO_OOBINLINE: 795 v.val = !!sock_flag(sk, SOCK_URGINLINE); 796 break; 797 798 case SO_NO_CHECK: 799 v.val = sk->sk_no_check; 800 break; 801 802 case SO_PRIORITY: 803 v.val = sk->sk_priority; 804 break; 805 806 case SO_LINGER: 807 lv = sizeof(v.ling); 808 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); 809 v.ling.l_linger = sk->sk_lingertime / HZ; 810 break; 811 812 case SO_BSDCOMPAT: 813 sock_warn_obsolete_bsdism("getsockopt"); 814 break; 815 816 case SO_TIMESTAMP: 817 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 818 !sock_flag(sk, SOCK_RCVTSTAMPNS); 819 break; 820 821 case SO_TIMESTAMPNS: 822 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 823 break; 824 825 case SO_TIMESTAMPING: 826 v.val = 0; 827 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) 828 v.val |= SOF_TIMESTAMPING_TX_HARDWARE; 829 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) 830 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; 831 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) 832 v.val |= SOF_TIMESTAMPING_RX_HARDWARE; 833 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) 834 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; 835 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) 836 v.val |= SOF_TIMESTAMPING_SOFTWARE; 837 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) 838 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; 839 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) 840 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; 841 break; 842 843 case SO_RCVTIMEO: 844 lv = sizeof(struct timeval); 845 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 846 v.tm.tv_sec = 0; 847 v.tm.tv_usec = 0; 848 } else { 849 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 850 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; 851 } 852 break; 853 854 case SO_SNDTIMEO: 855 lv = sizeof(struct timeval); 856 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 857 v.tm.tv_sec = 0; 858 v.tm.tv_usec = 0; 859 } else { 860 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 861 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; 862 } 863 break; 864 865 case SO_RCVLOWAT: 866 v.val = sk->sk_rcvlowat; 867 break; 868 869 case SO_SNDLOWAT: 870 v.val = 1; 871 break; 872 873 case SO_PASSCRED: 874 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; 875 break; 876 877 case SO_PEERCRED: 878 if (len > sizeof(sk->sk_peercred)) 879 len = sizeof(sk->sk_peercred); 880 if (copy_to_user(optval, &sk->sk_peercred, len)) 881 return -EFAULT; 882 goto lenout; 883 884 case SO_PEERNAME: 885 { 886 char address[128]; 887 888 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) 889 return -ENOTCONN; 890 if (lv < len) 891 return -EINVAL; 892 if (copy_to_user(optval, address, len)) 893 return -EFAULT; 894 goto lenout; 895 } 896 897 /* Dubious BSD thing... Probably nobody even uses it, but 898 * the UNIX standard wants it for whatever reason... -DaveM 899 */ 900 case SO_ACCEPTCONN: 901 v.val = sk->sk_state == TCP_LISTEN; 902 break; 903 904 case SO_PASSSEC: 905 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0; 906 break; 907 908 case SO_PEERSEC: 909 return security_socket_getpeersec_stream(sock, optval, optlen, len); 910 911 case SO_MARK: 912 v.val = sk->sk_mark; 913 break; 914 915 case SO_RXQ_OVFL: 916 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); 917 break; 918 919 default: 920 return -ENOPROTOOPT; 921 } 922 923 if (len > lv) 924 len = lv; 925 if (copy_to_user(optval, &v, len)) 926 return -EFAULT; 927lenout: 928 if (put_user(len, optlen)) 929 return -EFAULT; 930 return 0; 931} 932 933/* 934 * Initialize an sk_lock. 935 * 936 * (We also register the sk_lock with the lock validator.) 937 */ 938static inline void sock_lock_init(struct sock *sk) 939{ 940 sock_lock_init_class_and_name(sk, 941 af_family_slock_key_strings[sk->sk_family], 942 af_family_slock_keys + sk->sk_family, 943 af_family_key_strings[sk->sk_family], 944 af_family_keys + sk->sk_family); 945} 946 947/* 948 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 949 * even temporarly, because of RCU lookups. sk_node should also be left as is. 950 */ 951static void sock_copy(struct sock *nsk, const struct sock *osk) 952{ 953#ifdef CONFIG_SECURITY_NETWORK 954 void *sptr = nsk->sk_security; 955#endif 956 BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != 957 sizeof(osk->sk_node) + sizeof(osk->sk_refcnt)); 958 memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, 959 osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); 960#ifdef CONFIG_SECURITY_NETWORK 961 nsk->sk_security = sptr; 962 security_sk_clone(osk, nsk); 963#endif 964} 965 966static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 967 int family) 968{ 969 struct sock *sk; 970 struct kmem_cache *slab; 971 972 slab = prot->slab; 973 if (slab != NULL) { 974 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 975 if (!sk) 976 return sk; 977 if (priority & __GFP_ZERO) { 978 /* 979 * caches using SLAB_DESTROY_BY_RCU should let 980 * sk_node.next un-modified. Special care is taken 981 * when initializing object to zero. 982 */ 983 if (offsetof(struct sock, sk_node.next) != 0) 984 memset(sk, 0, offsetof(struct sock, sk_node.next)); 985 memset(&sk->sk_node.pprev, 0, 986 prot->obj_size - offsetof(struct sock, 987 sk_node.pprev)); 988 } 989 } 990 else 991 sk = kmalloc(prot->obj_size, priority); 992 993 if (sk != NULL) { 994 kmemcheck_annotate_bitfield(sk, flags); 995 996 if (security_sk_alloc(sk, family, priority)) 997 goto out_free; 998 999 if (!try_module_get(prot->owner)) 1000 goto out_free_sec; 1001 } 1002 1003 return sk; 1004 1005out_free_sec: 1006 security_sk_free(sk); 1007out_free: 1008 if (slab != NULL) 1009 kmem_cache_free(slab, sk); 1010 else 1011 kfree(sk); 1012 return NULL; 1013} 1014 1015static void sk_prot_free(struct proto *prot, struct sock *sk) 1016{ 1017 struct kmem_cache *slab; 1018 struct module *owner; 1019 1020 owner = prot->owner; 1021 slab = prot->slab; 1022 1023 security_sk_free(sk); 1024 if (slab != NULL) 1025 kmem_cache_free(slab, sk); 1026 else 1027 kfree(sk); 1028 module_put(owner); 1029} 1030 1031/** 1032 * sk_alloc - All socket objects are allocated here 1033 * @net: the applicable net namespace 1034 * @family: protocol family 1035 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1036 * @prot: struct proto associated with this new sock instance 1037 */ 1038struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1039 struct proto *prot) 1040{ 1041 struct sock *sk; 1042 1043 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1044 if (sk) { 1045 sk->sk_family = family; 1046 /* 1047 * See comment in struct sock definition to understand 1048 * why we need sk_prot_creator -acme 1049 */ 1050 sk->sk_prot = sk->sk_prot_creator = prot; 1051 sock_lock_init(sk); 1052 sock_net_set(sk, get_net(net)); 1053 atomic_set(&sk->sk_wmem_alloc, 1); 1054 } 1055 1056 return sk; 1057} 1058EXPORT_SYMBOL(sk_alloc); 1059 1060static void __sk_free(struct sock *sk) 1061{ 1062 struct sk_filter *filter; 1063 1064 if (sk->sk_destruct) 1065 sk->sk_destruct(sk); 1066 1067 filter = rcu_dereference(sk->sk_filter); 1068 if (filter) { 1069 sk_filter_uncharge(sk, filter); 1070 rcu_assign_pointer(sk->sk_filter, NULL); 1071 } 1072 1073 sock_disable_timestamp(sk, SOCK_TIMESTAMP); 1074 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); 1075 1076 if (atomic_read(&sk->sk_omem_alloc)) 1077 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", 1078 __func__, atomic_read(&sk->sk_omem_alloc)); 1079 1080 put_net(sock_net(sk)); 1081 sk_prot_free(sk->sk_prot_creator, sk); 1082} 1083 1084void sk_free(struct sock *sk) 1085{ 1086 /* 1087 * We substract one from sk_wmem_alloc and can know if 1088 * some packets are still in some tx queue. 1089 * If not null, sock_wfree() will call __sk_free(sk) later 1090 */ 1091 if (atomic_dec_and_test(&sk->sk_wmem_alloc)) 1092 __sk_free(sk); 1093} 1094EXPORT_SYMBOL(sk_free); 1095 1096/* 1097 * Last sock_put should drop referrence to sk->sk_net. It has already 1098 * been dropped in sk_change_net. Taking referrence to stopping namespace 1099 * is not an option. 1100 * Take referrence to a socket to remove it from hash _alive_ and after that 1101 * destroy it in the context of init_net. 1102 */ 1103void sk_release_kernel(struct sock *sk) 1104{ 1105 if (sk == NULL || sk->sk_socket == NULL) 1106 return; 1107 1108 sock_hold(sk); 1109 sock_release(sk->sk_socket); 1110 release_net(sock_net(sk)); 1111 sock_net_set(sk, get_net(&init_net)); 1112 sock_put(sk); 1113} 1114EXPORT_SYMBOL(sk_release_kernel); 1115 1116struct sock *sk_clone(const struct sock *sk, const gfp_t priority) 1117{ 1118 struct sock *newsk; 1119 1120 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1121 if (newsk != NULL) { 1122 struct sk_filter *filter; 1123 1124 sock_copy(newsk, sk); 1125 1126 /* SANITY */ 1127 get_net(sock_net(newsk)); 1128 sk_node_init(&newsk->sk_node); 1129 sock_lock_init(newsk); 1130 bh_lock_sock(newsk); 1131 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1132 1133 atomic_set(&newsk->sk_rmem_alloc, 0); 1134 /* 1135 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1136 */ 1137 atomic_set(&newsk->sk_wmem_alloc, 1); 1138 atomic_set(&newsk->sk_omem_alloc, 0); 1139 skb_queue_head_init(&newsk->sk_receive_queue); 1140 skb_queue_head_init(&newsk->sk_write_queue); 1141#ifdef CONFIG_NET_DMA 1142 skb_queue_head_init(&newsk->sk_async_wait_queue); 1143#endif 1144 1145 rwlock_init(&newsk->sk_dst_lock); 1146 rwlock_init(&newsk->sk_callback_lock); 1147 lockdep_set_class_and_name(&newsk->sk_callback_lock, 1148 af_callback_keys + newsk->sk_family, 1149 af_family_clock_key_strings[newsk->sk_family]); 1150 1151 newsk->sk_dst_cache = NULL; 1152 newsk->sk_wmem_queued = 0; 1153 newsk->sk_forward_alloc = 0; 1154 newsk->sk_send_head = NULL; 1155 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1156 1157 sock_reset_flag(newsk, SOCK_DONE); 1158 skb_queue_head_init(&newsk->sk_error_queue); 1159 1160 filter = newsk->sk_filter; 1161 if (filter != NULL) 1162 sk_filter_charge(newsk, filter); 1163 1164 if (unlikely(xfrm_sk_clone_policy(newsk))) { 1165 /* It is still raw copy of parent, so invalidate 1166 * destructor and make plain sk_free() */ 1167 newsk->sk_destruct = NULL; 1168 sk_free(newsk); 1169 newsk = NULL; 1170 goto out; 1171 } 1172 1173 newsk->sk_err = 0; 1174 newsk->sk_priority = 0; 1175 /* 1176 * Before updating sk_refcnt, we must commit prior changes to memory 1177 * (Documentation/RCU/rculist_nulls.txt for details) 1178 */ 1179 smp_wmb(); 1180 atomic_set(&newsk->sk_refcnt, 2); 1181 1182 /* 1183 * Increment the counter in the same struct proto as the master 1184 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1185 * is the same as sk->sk_prot->socks, as this field was copied 1186 * with memcpy). 1187 * 1188 * This _changes_ the previous behaviour, where 1189 * tcp_create_openreq_child always was incrementing the 1190 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1191 * to be taken into account in all callers. -acme 1192 */ 1193 sk_refcnt_debug_inc(newsk); 1194 sk_set_socket(newsk, NULL); 1195 newsk->sk_sleep = NULL; 1196 1197 if (newsk->sk_prot->sockets_allocated) 1198 percpu_counter_inc(newsk->sk_prot->sockets_allocated); 1199 } 1200out: 1201 return newsk; 1202} 1203EXPORT_SYMBOL_GPL(sk_clone); 1204 1205void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1206{ 1207 __sk_dst_set(sk, dst); 1208 sk->sk_route_caps = dst->dev->features; 1209 if (sk->sk_route_caps & NETIF_F_GSO) 1210 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1211 if (sk_can_gso(sk)) { 1212 if (dst->header_len) { 1213 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1214 } else { 1215 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1216 sk->sk_gso_max_size = dst->dev->gso_max_size; 1217 } 1218 } 1219} 1220EXPORT_SYMBOL_GPL(sk_setup_caps); 1221 1222void __init sk_init(void) 1223{ 1224 if (totalram_pages <= 4096) { 1225 sysctl_wmem_max = 32767; 1226 sysctl_rmem_max = 32767; 1227 sysctl_wmem_default = 32767; 1228 sysctl_rmem_default = 32767; 1229 } else if (totalram_pages >= 131072) { 1230 sysctl_wmem_max = 131071; 1231 sysctl_rmem_max = 131071; 1232 } 1233} 1234 1235/* 1236 * Simple resource managers for sockets. 1237 */ 1238 1239 1240/* 1241 * Write buffer destructor automatically called from kfree_skb. 1242 */ 1243void sock_wfree(struct sk_buff *skb) 1244{ 1245 struct sock *sk = skb->sk; 1246 unsigned int len = skb->truesize; 1247 1248 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1249 /* 1250 * Keep a reference on sk_wmem_alloc, this will be released 1251 * after sk_write_space() call 1252 */ 1253 atomic_sub(len - 1, &sk->sk_wmem_alloc); 1254 sk->sk_write_space(sk); 1255 len = 1; 1256 } 1257 /* 1258 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1259 * could not do because of in-flight packets 1260 */ 1261 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) 1262 __sk_free(sk); 1263} 1264EXPORT_SYMBOL(sock_wfree); 1265 1266/* 1267 * Read buffer destructor automatically called from kfree_skb. 1268 */ 1269void sock_rfree(struct sk_buff *skb) 1270{ 1271 struct sock *sk = skb->sk; 1272 1273 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1274 sk_mem_uncharge(skb->sk, skb->truesize); 1275} 1276EXPORT_SYMBOL(sock_rfree); 1277 1278 1279int sock_i_uid(struct sock *sk) 1280{ 1281 int uid; 1282 1283 read_lock(&sk->sk_callback_lock); 1284 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; 1285 read_unlock(&sk->sk_callback_lock); 1286 return uid; 1287} 1288EXPORT_SYMBOL(sock_i_uid); 1289 1290unsigned long sock_i_ino(struct sock *sk) 1291{ 1292 unsigned long ino; 1293 1294 read_lock(&sk->sk_callback_lock); 1295 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1296 read_unlock(&sk->sk_callback_lock); 1297 return ino; 1298} 1299EXPORT_SYMBOL(sock_i_ino); 1300 1301/* 1302 * Allocate a skb from the socket's send buffer. 1303 */ 1304struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1305 gfp_t priority) 1306{ 1307 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1308 struct sk_buff *skb = alloc_skb(size, priority); 1309 if (skb) { 1310 skb_set_owner_w(skb, sk); 1311 return skb; 1312 } 1313 } 1314 return NULL; 1315} 1316EXPORT_SYMBOL(sock_wmalloc); 1317 1318/* 1319 * Allocate a skb from the socket's receive buffer. 1320 */ 1321struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, 1322 gfp_t priority) 1323{ 1324 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 1325 struct sk_buff *skb = alloc_skb(size, priority); 1326 if (skb) { 1327 skb_set_owner_r(skb, sk); 1328 return skb; 1329 } 1330 } 1331 return NULL; 1332} 1333 1334/* 1335 * Allocate a memory block from the socket's option memory buffer. 1336 */ 1337void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 1338{ 1339 if ((unsigned)size <= sysctl_optmem_max && 1340 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 1341 void *mem; 1342 /* First do the add, to avoid the race if kmalloc 1343 * might sleep. 1344 */ 1345 atomic_add(size, &sk->sk_omem_alloc); 1346 mem = kmalloc(size, priority); 1347 if (mem) 1348 return mem; 1349 atomic_sub(size, &sk->sk_omem_alloc); 1350 } 1351 return NULL; 1352} 1353EXPORT_SYMBOL(sock_kmalloc); 1354 1355/* 1356 * Free an option memory block. 1357 */ 1358void sock_kfree_s(struct sock *sk, void *mem, int size) 1359{ 1360 kfree(mem); 1361 atomic_sub(size, &sk->sk_omem_alloc); 1362} 1363EXPORT_SYMBOL(sock_kfree_s); 1364 1365/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 1366 I think, these locks should be removed for datagram sockets. 1367 */ 1368static long sock_wait_for_wmem(struct sock *sk, long timeo) 1369{ 1370 DEFINE_WAIT(wait); 1371 1372 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 1373 for (;;) { 1374 if (!timeo) 1375 break; 1376 if (signal_pending(current)) 1377 break; 1378 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1379 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1380 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 1381 break; 1382 if (sk->sk_shutdown & SEND_SHUTDOWN) 1383 break; 1384 if (sk->sk_err) 1385 break; 1386 timeo = schedule_timeout(timeo); 1387 } 1388 finish_wait(sk->sk_sleep, &wait); 1389 return timeo; 1390} 1391 1392 1393/* 1394 * Generic send/receive buffer handlers 1395 */ 1396 1397struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 1398 unsigned long data_len, int noblock, 1399 int *errcode) 1400{ 1401 struct sk_buff *skb; 1402 gfp_t gfp_mask; 1403 long timeo; 1404 int err; 1405 1406 gfp_mask = sk->sk_allocation; 1407 if (gfp_mask & __GFP_WAIT) 1408 gfp_mask |= __GFP_REPEAT; 1409 1410 timeo = sock_sndtimeo(sk, noblock); 1411 while (1) { 1412 err = sock_error(sk); 1413 if (err != 0) 1414 goto failure; 1415 1416 err = -EPIPE; 1417 if (sk->sk_shutdown & SEND_SHUTDOWN) 1418 goto failure; 1419 1420 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1421 skb = alloc_skb(header_len, gfp_mask); 1422 if (skb) { 1423 int npages; 1424 int i; 1425 1426 /* No pages, we're done... */ 1427 if (!data_len) 1428 break; 1429 1430 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 1431 skb->truesize += data_len; 1432 skb_shinfo(skb)->nr_frags = npages; 1433 for (i = 0; i < npages; i++) { 1434 struct page *page; 1435 skb_frag_t *frag; 1436 1437 page = alloc_pages(sk->sk_allocation, 0); 1438 if (!page) { 1439 err = -ENOBUFS; 1440 skb_shinfo(skb)->nr_frags = i; 1441 kfree_skb(skb); 1442 goto failure; 1443 } 1444 1445 frag = &skb_shinfo(skb)->frags[i]; 1446 frag->page = page; 1447 frag->page_offset = 0; 1448 frag->size = (data_len >= PAGE_SIZE ? 1449 PAGE_SIZE : 1450 data_len); 1451 data_len -= PAGE_SIZE; 1452 } 1453 1454 /* Full success... */ 1455 break; 1456 } 1457 err = -ENOBUFS; 1458 goto failure; 1459 } 1460 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 1461 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1462 err = -EAGAIN; 1463 if (!timeo) 1464 goto failure; 1465 if (signal_pending(current)) 1466 goto interrupted; 1467 timeo = sock_wait_for_wmem(sk, timeo); 1468 } 1469 1470 skb_set_owner_w(skb, sk); 1471 return skb; 1472 1473interrupted: 1474 err = sock_intr_errno(timeo); 1475failure: 1476 *errcode = err; 1477 return NULL; 1478} 1479EXPORT_SYMBOL(sock_alloc_send_pskb); 1480 1481struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 1482 int noblock, int *errcode) 1483{ 1484 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); 1485} 1486EXPORT_SYMBOL(sock_alloc_send_skb); 1487 1488static void __lock_sock(struct sock *sk) 1489{ 1490 DEFINE_WAIT(wait); 1491 1492 for (;;) { 1493 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 1494 TASK_UNINTERRUPTIBLE); 1495 spin_unlock_bh(&sk->sk_lock.slock); 1496 schedule(); 1497 spin_lock_bh(&sk->sk_lock.slock); 1498 if (!sock_owned_by_user(sk)) 1499 break; 1500 } 1501 finish_wait(&sk->sk_lock.wq, &wait); 1502} 1503 1504static void __release_sock(struct sock *sk) 1505{ 1506 struct sk_buff *skb = sk->sk_backlog.head; 1507 1508 do { 1509 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 1510 bh_unlock_sock(sk); 1511 1512 do { 1513 struct sk_buff *next = skb->next; 1514 1515 skb->next = NULL; 1516 sk_backlog_rcv(sk, skb); 1517 1518 /* 1519 * We are in process context here with softirqs 1520 * disabled, use cond_resched_softirq() to preempt. 1521 * This is safe to do because we've taken the backlog 1522 * queue private: 1523 */ 1524 cond_resched_softirq(); 1525 1526 skb = next; 1527 } while (skb != NULL); 1528 1529 bh_lock_sock(sk); 1530 } while ((skb = sk->sk_backlog.head) != NULL); 1531} 1532 1533/** 1534 * sk_wait_data - wait for data to arrive at sk_receive_queue 1535 * @sk: sock to wait on 1536 * @timeo: for how long 1537 * 1538 * Now socket state including sk->sk_err is changed only under lock, 1539 * hence we may omit checks after joining wait queue. 1540 * We check receive queue before schedule() only as optimization; 1541 * it is very likely that release_sock() added new data. 1542 */ 1543int sk_wait_data(struct sock *sk, long *timeo) 1544{ 1545 int rc; 1546 DEFINE_WAIT(wait); 1547 1548 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1549 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); 1550 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); 1551 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); 1552 finish_wait(sk->sk_sleep, &wait); 1553 return rc; 1554} 1555EXPORT_SYMBOL(sk_wait_data); 1556 1557/** 1558 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 1559 * @sk: socket 1560 * @size: memory size to allocate 1561 * @kind: allocation type 1562 * 1563 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 1564 * rmem allocation. This function assumes that protocols which have 1565 * memory_pressure use sk_wmem_queued as write buffer accounting. 1566 */ 1567int __sk_mem_schedule(struct sock *sk, int size, int kind) 1568{ 1569 struct proto *prot = sk->sk_prot; 1570 int amt = sk_mem_pages(size); 1571 int allocated; 1572 1573 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; 1574 allocated = atomic_add_return(amt, prot->memory_allocated); 1575 1576 /* Under limit. */ 1577 if (allocated <= prot->sysctl_mem[0]) { 1578 if (prot->memory_pressure && *prot->memory_pressure) 1579 *prot->memory_pressure = 0; 1580 return 1; 1581 } 1582 1583 /* Under pressure. */ 1584 if (allocated > prot->sysctl_mem[1]) 1585 if (prot->enter_memory_pressure) 1586 prot->enter_memory_pressure(sk); 1587 1588 /* Over hard limit. */ 1589 if (allocated > prot->sysctl_mem[2]) 1590 goto suppress_allocation; 1591 1592 /* guarantee minimum buffer size under pressure */ 1593 if (kind == SK_MEM_RECV) { 1594 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) 1595 return 1; 1596 } else { /* SK_MEM_SEND */ 1597 if (sk->sk_type == SOCK_STREAM) { 1598 if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) 1599 return 1; 1600 } else if (atomic_read(&sk->sk_wmem_alloc) < 1601 prot->sysctl_wmem[0]) 1602 return 1; 1603 } 1604 1605 if (prot->memory_pressure) { 1606 int alloc; 1607 1608 if (!*prot->memory_pressure) 1609 return 1; 1610 alloc = percpu_counter_read_positive(prot->sockets_allocated); 1611 if (prot->sysctl_mem[2] > alloc * 1612 sk_mem_pages(sk->sk_wmem_queued + 1613 atomic_read(&sk->sk_rmem_alloc) + 1614 sk->sk_forward_alloc)) 1615 return 1; 1616 } 1617 1618suppress_allocation: 1619 1620 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 1621 sk_stream_moderate_sndbuf(sk); 1622 1623 /* Fail only if socket is _under_ its sndbuf. 1624 * In this case we cannot block, so that we have to fail. 1625 */ 1626 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 1627 return 1; 1628 } 1629 1630 /* Alas. Undo changes. */ 1631 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; 1632 atomic_sub(amt, prot->memory_allocated); 1633 return 0; 1634} 1635EXPORT_SYMBOL(__sk_mem_schedule); 1636 1637/** 1638 * __sk_reclaim - reclaim memory_allocated 1639 * @sk: socket 1640 */ 1641void __sk_mem_reclaim(struct sock *sk) 1642{ 1643 struct proto *prot = sk->sk_prot; 1644 1645 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 1646 prot->memory_allocated); 1647 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; 1648 1649 if (prot->memory_pressure && *prot->memory_pressure && 1650 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) 1651 *prot->memory_pressure = 0; 1652} 1653EXPORT_SYMBOL(__sk_mem_reclaim); 1654 1655 1656/* 1657 * Set of default routines for initialising struct proto_ops when 1658 * the protocol does not support a particular function. In certain 1659 * cases where it makes no sense for a protocol to have a "do nothing" 1660 * function, some default processing is provided. 1661 */ 1662 1663int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 1664{ 1665 return -EOPNOTSUPP; 1666} 1667EXPORT_SYMBOL(sock_no_bind); 1668 1669int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 1670 int len, int flags) 1671{ 1672 return -EOPNOTSUPP; 1673} 1674EXPORT_SYMBOL(sock_no_connect); 1675 1676int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 1677{ 1678 return -EOPNOTSUPP; 1679} 1680EXPORT_SYMBOL(sock_no_socketpair); 1681 1682int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) 1683{ 1684 return -EOPNOTSUPP; 1685} 1686EXPORT_SYMBOL(sock_no_accept); 1687 1688int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 1689 int *len, int peer) 1690{ 1691 return -EOPNOTSUPP; 1692} 1693EXPORT_SYMBOL(sock_no_getname); 1694 1695unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) 1696{ 1697 return 0; 1698} 1699EXPORT_SYMBOL(sock_no_poll); 1700 1701int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 1702{ 1703 return -EOPNOTSUPP; 1704} 1705EXPORT_SYMBOL(sock_no_ioctl); 1706 1707int sock_no_listen(struct socket *sock, int backlog) 1708{ 1709 return -EOPNOTSUPP; 1710} 1711EXPORT_SYMBOL(sock_no_listen); 1712 1713int sock_no_shutdown(struct socket *sock, int how) 1714{ 1715 return -EOPNOTSUPP; 1716} 1717EXPORT_SYMBOL(sock_no_shutdown); 1718 1719int sock_no_setsockopt(struct socket *sock, int level, int optname, 1720 char __user *optval, unsigned int optlen) 1721{ 1722 return -EOPNOTSUPP; 1723} 1724EXPORT_SYMBOL(sock_no_setsockopt); 1725 1726int sock_no_getsockopt(struct socket *sock, int level, int optname, 1727 char __user *optval, int __user *optlen) 1728{ 1729 return -EOPNOTSUPP; 1730} 1731EXPORT_SYMBOL(sock_no_getsockopt); 1732 1733int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, 1734 size_t len) 1735{ 1736 return -EOPNOTSUPP; 1737} 1738EXPORT_SYMBOL(sock_no_sendmsg); 1739 1740int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, 1741 size_t len, int flags) 1742{ 1743 return -EOPNOTSUPP; 1744} 1745EXPORT_SYMBOL(sock_no_recvmsg); 1746 1747int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 1748{ 1749 /* Mirror missing mmap method error code */ 1750 return -ENODEV; 1751} 1752EXPORT_SYMBOL(sock_no_mmap); 1753 1754ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 1755{ 1756 ssize_t res; 1757 struct msghdr msg = {.msg_flags = flags}; 1758 struct kvec iov; 1759 char *kaddr = kmap(page); 1760 iov.iov_base = kaddr + offset; 1761 iov.iov_len = size; 1762 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 1763 kunmap(page); 1764 return res; 1765} 1766EXPORT_SYMBOL(sock_no_sendpage); 1767 1768/* 1769 * Default Socket Callbacks 1770 */ 1771 1772static void sock_def_wakeup(struct sock *sk) 1773{ 1774 read_lock(&sk->sk_callback_lock); 1775 if (sk_has_sleeper(sk)) 1776 wake_up_interruptible_all(sk->sk_sleep); 1777 read_unlock(&sk->sk_callback_lock); 1778} 1779 1780static void sock_def_error_report(struct sock *sk) 1781{ 1782 read_lock(&sk->sk_callback_lock); 1783 if (sk_has_sleeper(sk)) 1784 wake_up_interruptible_poll(sk->sk_sleep, POLLERR); 1785 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 1786 read_unlock(&sk->sk_callback_lock); 1787} 1788 1789static void sock_def_readable(struct sock *sk, int len) 1790{ 1791 read_lock(&sk->sk_callback_lock); 1792 if (sk_has_sleeper(sk)) 1793 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | 1794 POLLRDNORM | POLLRDBAND); 1795 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 1796 read_unlock(&sk->sk_callback_lock); 1797} 1798 1799static void sock_def_write_space(struct sock *sk) 1800{ 1801 read_lock(&sk->sk_callback_lock); 1802 1803 /* Do not wake up a writer until he can make "significant" 1804 * progress. --DaveM 1805 */ 1806 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 1807 if (sk_has_sleeper(sk)) 1808 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | 1809 POLLWRNORM | POLLWRBAND); 1810 1811 /* Should agree with poll, otherwise some programs break */ 1812 if (sock_writeable(sk)) 1813 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 1814 } 1815 1816 read_unlock(&sk->sk_callback_lock); 1817} 1818 1819static void sock_def_destruct(struct sock *sk) 1820{ 1821 kfree(sk->sk_protinfo); 1822} 1823 1824void sk_send_sigurg(struct sock *sk) 1825{ 1826 if (sk->sk_socket && sk->sk_socket->file) 1827 if (send_sigurg(&sk->sk_socket->file->f_owner)) 1828 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 1829} 1830EXPORT_SYMBOL(sk_send_sigurg); 1831 1832void sk_reset_timer(struct sock *sk, struct timer_list* timer, 1833 unsigned long expires) 1834{ 1835 if (!mod_timer(timer, expires)) 1836 sock_hold(sk); 1837} 1838EXPORT_SYMBOL(sk_reset_timer); 1839 1840void sk_stop_timer(struct sock *sk, struct timer_list* timer) 1841{ 1842 if (timer_pending(timer) && del_timer(timer)) 1843 __sock_put(sk); 1844} 1845EXPORT_SYMBOL(sk_stop_timer); 1846 1847void sock_init_data(struct socket *sock, struct sock *sk) 1848{ 1849 skb_queue_head_init(&sk->sk_receive_queue); 1850 skb_queue_head_init(&sk->sk_write_queue); 1851 skb_queue_head_init(&sk->sk_error_queue); 1852#ifdef CONFIG_NET_DMA 1853 skb_queue_head_init(&sk->sk_async_wait_queue); 1854#endif 1855 1856 sk->sk_send_head = NULL; 1857 1858 init_timer(&sk->sk_timer); 1859 1860 sk->sk_allocation = GFP_KERNEL; 1861 sk->sk_rcvbuf = sysctl_rmem_default; 1862 sk->sk_sndbuf = sysctl_wmem_default; 1863 sk->sk_state = TCP_CLOSE; 1864 sk_set_socket(sk, sock); 1865 1866 sock_set_flag(sk, SOCK_ZAPPED); 1867 1868 if (sock) { 1869 sk->sk_type = sock->type; 1870 sk->sk_sleep = &sock->wait; 1871 sock->sk = sk; 1872 } else 1873 sk->sk_sleep = NULL; 1874 1875 rwlock_init(&sk->sk_dst_lock); 1876 rwlock_init(&sk->sk_callback_lock); 1877 lockdep_set_class_and_name(&sk->sk_callback_lock, 1878 af_callback_keys + sk->sk_family, 1879 af_family_clock_key_strings[sk->sk_family]); 1880 1881 sk->sk_state_change = sock_def_wakeup; 1882 sk->sk_data_ready = sock_def_readable; 1883 sk->sk_write_space = sock_def_write_space; 1884 sk->sk_error_report = sock_def_error_report; 1885 sk->sk_destruct = sock_def_destruct; 1886 1887 sk->sk_sndmsg_page = NULL; 1888 sk->sk_sndmsg_off = 0; 1889 1890 sk->sk_peercred.pid = 0; 1891 sk->sk_peercred.uid = -1; 1892 sk->sk_peercred.gid = -1; 1893 sk->sk_write_pending = 0; 1894 sk->sk_rcvlowat = 1; 1895 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 1896 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 1897 1898 sk->sk_stamp = ktime_set(-1L, 0); 1899 1900 /* 1901 * Before updating sk_refcnt, we must commit prior changes to memory 1902 * (Documentation/RCU/rculist_nulls.txt for details) 1903 */ 1904 smp_wmb(); 1905 atomic_set(&sk->sk_refcnt, 1); 1906 atomic_set(&sk->sk_drops, 0); 1907} 1908EXPORT_SYMBOL(sock_init_data); 1909 1910void lock_sock_nested(struct sock *sk, int subclass) 1911{ 1912 might_sleep(); 1913 spin_lock_bh(&sk->sk_lock.slock); 1914 if (sk->sk_lock.owned) 1915 __lock_sock(sk); 1916 sk->sk_lock.owned = 1; 1917 spin_unlock(&sk->sk_lock.slock); 1918 /* 1919 * The sk_lock has mutex_lock() semantics here: 1920 */ 1921 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 1922 local_bh_enable(); 1923} 1924EXPORT_SYMBOL(lock_sock_nested); 1925 1926void release_sock(struct sock *sk) 1927{ 1928 /* 1929 * The sk_lock has mutex_unlock() semantics: 1930 */ 1931 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); 1932 1933 spin_lock_bh(&sk->sk_lock.slock); 1934 if (sk->sk_backlog.tail) 1935 __release_sock(sk); 1936 sk->sk_lock.owned = 0; 1937 if (waitqueue_active(&sk->sk_lock.wq)) 1938 wake_up(&sk->sk_lock.wq); 1939 spin_unlock_bh(&sk->sk_lock.slock); 1940} 1941EXPORT_SYMBOL(release_sock); 1942 1943int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 1944{ 1945 struct timeval tv; 1946 if (!sock_flag(sk, SOCK_TIMESTAMP)) 1947 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 1948 tv = ktime_to_timeval(sk->sk_stamp); 1949 if (tv.tv_sec == -1) 1950 return -ENOENT; 1951 if (tv.tv_sec == 0) { 1952 sk->sk_stamp = ktime_get_real(); 1953 tv = ktime_to_timeval(sk->sk_stamp); 1954 } 1955 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; 1956} 1957EXPORT_SYMBOL(sock_get_timestamp); 1958 1959int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 1960{ 1961 struct timespec ts; 1962 if (!sock_flag(sk, SOCK_TIMESTAMP)) 1963 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 1964 ts = ktime_to_timespec(sk->sk_stamp); 1965 if (ts.tv_sec == -1) 1966 return -ENOENT; 1967 if (ts.tv_sec == 0) { 1968 sk->sk_stamp = ktime_get_real(); 1969 ts = ktime_to_timespec(sk->sk_stamp); 1970 } 1971 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; 1972} 1973EXPORT_SYMBOL(sock_get_timestampns); 1974 1975void sock_enable_timestamp(struct sock *sk, int flag) 1976{ 1977 if (!sock_flag(sk, flag)) { 1978 sock_set_flag(sk, flag); 1979 /* 1980 * we just set one of the two flags which require net 1981 * time stamping, but time stamping might have been on 1982 * already because of the other one 1983 */ 1984 if (!sock_flag(sk, 1985 flag == SOCK_TIMESTAMP ? 1986 SOCK_TIMESTAMPING_RX_SOFTWARE : 1987 SOCK_TIMESTAMP)) 1988 net_enable_timestamp(); 1989 } 1990} 1991 1992/* 1993 * Get a socket option on an socket. 1994 * 1995 * FIX: POSIX 1003.1g is very ambiguous here. It states that 1996 * asynchronous errors should be reported by getsockopt. We assume 1997 * this means if you specify SO_ERROR (otherwise whats the point of it). 1998 */ 1999int sock_common_getsockopt(struct socket *sock, int level, int optname, 2000 char __user *optval, int __user *optlen) 2001{ 2002 struct sock *sk = sock->sk; 2003 2004 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2005} 2006EXPORT_SYMBOL(sock_common_getsockopt); 2007 2008#ifdef CONFIG_COMPAT 2009int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 2010 char __user *optval, int __user *optlen) 2011{ 2012 struct sock *sk = sock->sk; 2013 2014 if (sk->sk_prot->compat_getsockopt != NULL) 2015 return sk->sk_prot->compat_getsockopt(sk, level, optname, 2016 optval, optlen); 2017 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2018} 2019EXPORT_SYMBOL(compat_sock_common_getsockopt); 2020#endif 2021 2022int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, 2023 struct msghdr *msg, size_t size, int flags) 2024{ 2025 struct sock *sk = sock->sk; 2026 int addr_len = 0; 2027 int err; 2028 2029 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, 2030 flags & ~MSG_DONTWAIT, &addr_len); 2031 if (err >= 0) 2032 msg->msg_namelen = addr_len; 2033 return err; 2034} 2035EXPORT_SYMBOL(sock_common_recvmsg); 2036 2037/* 2038 * Set socket options on an inet socket. 2039 */ 2040int sock_common_setsockopt(struct socket *sock, int level, int optname, 2041 char __user *optval, unsigned int optlen) 2042{ 2043 struct sock *sk = sock->sk; 2044 2045 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2046} 2047EXPORT_SYMBOL(sock_common_setsockopt); 2048 2049#ifdef CONFIG_COMPAT 2050int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 2051 char __user *optval, unsigned int optlen) 2052{ 2053 struct sock *sk = sock->sk; 2054 2055 if (sk->sk_prot->compat_setsockopt != NULL) 2056 return sk->sk_prot->compat_setsockopt(sk, level, optname, 2057 optval, optlen); 2058 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2059} 2060EXPORT_SYMBOL(compat_sock_common_setsockopt); 2061#endif 2062 2063void sk_common_release(struct sock *sk) 2064{ 2065 if (sk->sk_prot->destroy) 2066 sk->sk_prot->destroy(sk); 2067 2068 /* 2069 * Observation: when sock_common_release is called, processes have 2070 * no access to socket. But net still has. 2071 * Step one, detach it from networking: 2072 * 2073 * A. Remove from hash tables. 2074 */ 2075 2076 sk->sk_prot->unhash(sk); 2077 2078 /* 2079 * In this point socket cannot receive new packets, but it is possible 2080 * that some packets are in flight because some CPU runs receiver and 2081 * did hash table lookup before we unhashed socket. They will achieve 2082 * receive queue and will be purged by socket destructor. 2083 * 2084 * Also we still have packets pending on receive queue and probably, 2085 * our own packets waiting in device queues. sock_destroy will drain 2086 * receive queue, but transmitted packets will delay socket destruction 2087 * until the last reference will be released. 2088 */ 2089 2090 sock_orphan(sk); 2091 2092 xfrm_sk_free_policy(sk); 2093 2094 sk_refcnt_debug_release(sk); 2095 sock_put(sk); 2096} 2097EXPORT_SYMBOL(sk_common_release); 2098 2099static DEFINE_RWLOCK(proto_list_lock); 2100static LIST_HEAD(proto_list); 2101 2102#ifdef CONFIG_PROC_FS 2103#define PROTO_INUSE_NR 64 /* should be enough for the first time */ 2104struct prot_inuse { 2105 int val[PROTO_INUSE_NR]; 2106}; 2107 2108static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 2109 2110#ifdef CONFIG_NET_NS 2111void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2112{ 2113 int cpu = smp_processor_id(); 2114 per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val; 2115} 2116EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2117 2118int sock_prot_inuse_get(struct net *net, struct proto *prot) 2119{ 2120 int cpu, idx = prot->inuse_idx; 2121 int res = 0; 2122 2123 for_each_possible_cpu(cpu) 2124 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; 2125 2126 return res >= 0 ? res : 0; 2127} 2128EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2129 2130static int sock_inuse_init_net(struct net *net) 2131{ 2132 net->core.inuse = alloc_percpu(struct prot_inuse); 2133 return net->core.inuse ? 0 : -ENOMEM; 2134} 2135 2136static void sock_inuse_exit_net(struct net *net) 2137{ 2138 free_percpu(net->core.inuse); 2139} 2140 2141static struct pernet_operations net_inuse_ops = { 2142 .init = sock_inuse_init_net, 2143 .exit = sock_inuse_exit_net, 2144}; 2145 2146static __init int net_inuse_init(void) 2147{ 2148 if (register_pernet_subsys(&net_inuse_ops)) 2149 panic("Cannot initialize net inuse counters"); 2150 2151 return 0; 2152} 2153 2154core_initcall(net_inuse_init); 2155#else 2156static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); 2157 2158void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2159{ 2160 __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val; 2161} 2162EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2163 2164int sock_prot_inuse_get(struct net *net, struct proto *prot) 2165{ 2166 int cpu, idx = prot->inuse_idx; 2167 int res = 0; 2168 2169 for_each_possible_cpu(cpu) 2170 res += per_cpu(prot_inuse, cpu).val[idx]; 2171 2172 return res >= 0 ? res : 0; 2173} 2174EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2175#endif 2176 2177static void assign_proto_idx(struct proto *prot) 2178{ 2179 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 2180 2181 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 2182 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n"); 2183 return; 2184 } 2185 2186 set_bit(prot->inuse_idx, proto_inuse_idx); 2187} 2188 2189static void release_proto_idx(struct proto *prot) 2190{ 2191 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 2192 clear_bit(prot->inuse_idx, proto_inuse_idx); 2193} 2194#else 2195static inline void assign_proto_idx(struct proto *prot) 2196{ 2197} 2198 2199static inline void release_proto_idx(struct proto *prot) 2200{ 2201} 2202#endif 2203 2204int proto_register(struct proto *prot, int alloc_slab) 2205{ 2206 if (alloc_slab) { 2207 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, 2208 SLAB_HWCACHE_ALIGN | prot->slab_flags, 2209 NULL); 2210 2211 if (prot->slab == NULL) { 2212 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", 2213 prot->name); 2214 goto out; 2215 } 2216 2217 if (prot->rsk_prot != NULL) { 2218 static const char mask[] = "request_sock_%s"; 2219 2220 prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); 2221 if (prot->rsk_prot->slab_name == NULL) 2222 goto out_free_sock_slab; 2223 2224 sprintf(prot->rsk_prot->slab_name, mask, prot->name); 2225 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, 2226 prot->rsk_prot->obj_size, 0, 2227 SLAB_HWCACHE_ALIGN, NULL); 2228 2229 if (prot->rsk_prot->slab == NULL) { 2230 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", 2231 prot->name); 2232 goto out_free_request_sock_slab_name; 2233 } 2234 } 2235 2236 if (prot->twsk_prot != NULL) { 2237 static const char mask[] = "tw_sock_%s"; 2238 2239 prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); 2240 2241 if (prot->twsk_prot->twsk_slab_name == NULL) 2242 goto out_free_request_sock_slab; 2243 2244 sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name); 2245 prot->twsk_prot->twsk_slab = 2246 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 2247 prot->twsk_prot->twsk_obj_size, 2248 0, 2249 SLAB_HWCACHE_ALIGN | 2250 prot->slab_flags, 2251 NULL); 2252 if (prot->twsk_prot->twsk_slab == NULL) 2253 goto out_free_timewait_sock_slab_name; 2254 } 2255 } 2256 2257 write_lock(&proto_list_lock); 2258 list_add(&prot->node, &proto_list); 2259 assign_proto_idx(prot); 2260 write_unlock(&proto_list_lock); 2261 return 0; 2262 2263out_free_timewait_sock_slab_name: 2264 kfree(prot->twsk_prot->twsk_slab_name); 2265out_free_request_sock_slab: 2266 if (prot->rsk_prot && prot->rsk_prot->slab) { 2267 kmem_cache_destroy(prot->rsk_prot->slab); 2268 prot->rsk_prot->slab = NULL; 2269 } 2270out_free_request_sock_slab_name: 2271 kfree(prot->rsk_prot->slab_name); 2272out_free_sock_slab: 2273 kmem_cache_destroy(prot->slab); 2274 prot->slab = NULL; 2275out: 2276 return -ENOBUFS; 2277} 2278EXPORT_SYMBOL(proto_register); 2279 2280void proto_unregister(struct proto *prot) 2281{ 2282 write_lock(&proto_list_lock); 2283 release_proto_idx(prot); 2284 list_del(&prot->node); 2285 write_unlock(&proto_list_lock); 2286 2287 if (prot->slab != NULL) { 2288 kmem_cache_destroy(prot->slab); 2289 prot->slab = NULL; 2290 } 2291 2292 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { 2293 kmem_cache_destroy(prot->rsk_prot->slab); 2294 kfree(prot->rsk_prot->slab_name); 2295 prot->rsk_prot->slab = NULL; 2296 } 2297 2298 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 2299 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 2300 kfree(prot->twsk_prot->twsk_slab_name); 2301 prot->twsk_prot->twsk_slab = NULL; 2302 } 2303} 2304EXPORT_SYMBOL(proto_unregister); 2305 2306#ifdef CONFIG_PROC_FS 2307static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 2308 __acquires(proto_list_lock) 2309{ 2310 read_lock(&proto_list_lock); 2311 return seq_list_start_head(&proto_list, *pos); 2312} 2313 2314static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2315{ 2316 return seq_list_next(v, &proto_list, pos); 2317} 2318 2319static void proto_seq_stop(struct seq_file *seq, void *v) 2320 __releases(proto_list_lock) 2321{ 2322 read_unlock(&proto_list_lock); 2323} 2324 2325static char proto_method_implemented(const void *method) 2326{ 2327 return method == NULL ? 'n' : 'y'; 2328} 2329 2330static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 2331{ 2332 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " 2333 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 2334 proto->name, 2335 proto->obj_size, 2336 sock_prot_inuse_get(seq_file_net(seq), proto), 2337 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, 2338 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", 2339 proto->max_header, 2340 proto->slab == NULL ? "no" : "yes", 2341 module_name(proto->owner), 2342 proto_method_implemented(proto->close), 2343 proto_method_implemented(proto->connect), 2344 proto_method_implemented(proto->disconnect), 2345 proto_method_implemented(proto->accept), 2346 proto_method_implemented(proto->ioctl), 2347 proto_method_implemented(proto->init), 2348 proto_method_implemented(proto->destroy), 2349 proto_method_implemented(proto->shutdown), 2350 proto_method_implemented(proto->setsockopt), 2351 proto_method_implemented(proto->getsockopt), 2352 proto_method_implemented(proto->sendmsg), 2353 proto_method_implemented(proto->recvmsg), 2354 proto_method_implemented(proto->sendpage), 2355 proto_method_implemented(proto->bind), 2356 proto_method_implemented(proto->backlog_rcv), 2357 proto_method_implemented(proto->hash), 2358 proto_method_implemented(proto->unhash), 2359 proto_method_implemented(proto->get_port), 2360 proto_method_implemented(proto->enter_memory_pressure)); 2361} 2362 2363static int proto_seq_show(struct seq_file *seq, void *v) 2364{ 2365 if (v == &proto_list) 2366 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 2367 "protocol", 2368 "size", 2369 "sockets", 2370 "memory", 2371 "press", 2372 "maxhdr", 2373 "slab", 2374 "module", 2375 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 2376 else 2377 proto_seq_printf(seq, list_entry(v, struct proto, node)); 2378 return 0; 2379} 2380 2381static const struct seq_operations proto_seq_ops = { 2382 .start = proto_seq_start, 2383 .next = proto_seq_next, 2384 .stop = proto_seq_stop, 2385 .show = proto_seq_show, 2386}; 2387 2388static int proto_seq_open(struct inode *inode, struct file *file) 2389{ 2390 return seq_open_net(inode, file, &proto_seq_ops, 2391 sizeof(struct seq_net_private)); 2392} 2393 2394static const struct file_operations proto_seq_fops = { 2395 .owner = THIS_MODULE, 2396 .open = proto_seq_open, 2397 .read = seq_read, 2398 .llseek = seq_lseek, 2399 .release = seq_release_net, 2400}; 2401 2402static __net_init int proto_init_net(struct net *net) 2403{ 2404 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) 2405 return -ENOMEM; 2406 2407 return 0; 2408} 2409 2410static __net_exit void proto_exit_net(struct net *net) 2411{ 2412 proc_net_remove(net, "protocols"); 2413} 2414 2415 2416static __net_initdata struct pernet_operations proto_net_ops = { 2417 .init = proto_init_net, 2418 .exit = proto_exit_net, 2419}; 2420 2421static int __init proto_init(void) 2422{ 2423 return register_pernet_subsys(&proto_net_ops); 2424} 2425 2426subsys_initcall(proto_init); 2427 2428#endif /* PROC_FS */ 2429