sock.c revision 70355602879229c6f8bd694ec9c0814222bc4936
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly,
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *		Steve Whitehouse:	Added default destructor to free
73 *					protocol private data.
74 *		Steve Whitehouse:	Added various other default routines
75 *					common to several socket families.
76 *		Chris Evans	:	Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
120#include <net/net_namespace.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <net/xfrm.h>
124#include <linux/ipsec.h>
125
126#include <linux/filter.h>
127
128#ifdef CONFIG_INET
129#include <net/tcp.h>
130#endif
131
132/*
133 * Each address family might have different locking rules, so we have
134 * one slock key per address family:
135 */
136static struct lock_class_key af_family_keys[AF_MAX];
137static struct lock_class_key af_family_slock_keys[AF_MAX];
138
139/*
140 * Make lock validator output more readable. (we pre-construct these
141 * strings build-time, so that runtime initialization of socket
142 * locks is fast):
143 */
144static const char *af_family_key_strings[AF_MAX+1] = {
145  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
146  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
147  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
148  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
149  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
150  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
151  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
152  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
153  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
154  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
155  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
156  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
157  "sk_lock-AF_MAX"
158};
159static const char *af_family_slock_key_strings[AF_MAX+1] = {
160  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
161  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
162  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
163  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
164  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
165  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
166  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
167  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
168  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
169  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
170  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
171  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
172  "slock-AF_MAX"
173};
174static const char *af_family_clock_key_strings[AF_MAX+1] = {
175  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
176  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
177  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
178  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
179  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
180  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
181  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
182  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
183  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
184  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
185  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
186  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
187  "clock-AF_MAX"
188};
189
190/*
191 * sk_callback_lock locking rules are per-address-family,
192 * so split the lock classes by using a per-AF key:
193 */
194static struct lock_class_key af_callback_keys[AF_MAX];
195
196/* Take into consideration the size of the struct sk_buff overhead in the
197 * determination of these values, since that is non-constant across
198 * platforms.  This makes socket queueing behavior and performance
199 * not depend upon such differences.
200 */
201#define _SK_MEM_PACKETS		256
202#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
203#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
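
/*
 * Illustrative arithmetic (sizeof(struct sk_buff) is platform dependent;
 * 256 bytes is only an assumption): with a 256-byte sk_buff,
 * _SK_MEM_OVERHEAD is 256 + 256 = 512 bytes, so SK_WMEM_MAX and
 * SK_RMEM_MAX default to 512 * 256 = 131072 bytes (128 KiB) per socket.
 */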
205
206/* Run time adjustable parameters. */
207__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
208__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
209__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
210__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
211
212/* Maximal space eaten by iovec or ancillary data plus some space */
213int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
214
215static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
216{
217	struct timeval tv;
218
219	if (optlen < sizeof(tv))
220		return -EINVAL;
221	if (copy_from_user(&tv, optval, sizeof(tv)))
222		return -EFAULT;
223	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
224		return -EDOM;
225
226	if (tv.tv_sec < 0) {
227		static int warned __read_mostly;
228
229		*timeo_p = 0;
230		if (warned < 10 && net_ratelimit()) {
231			warned++;
232			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
233			       "tries to set negative timeout\n",
234				current->comm, task_pid_nr(current));
235		}
236		return 0;
237	}
238	*timeo_p = MAX_SCHEDULE_TIMEOUT;
239	if (tv.tv_sec == 0 && tv.tv_usec == 0)
240		return 0;
241	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243	return 0;
244}
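
/*
 * Illustrative sketch (not part of this file): sock_set_timeout() backs
 * SO_RCVTIMEO/SO_SNDTIMEO, converting the user's struct timeval to jiffies.
 * Assuming HZ == 1000, tv = { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + (500000 + 999) / 1000 == 2500 jiffies:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */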
245
246static void sock_warn_obsolete_bsdism(const char *name)
247{
248	static int warned;
249	static char warncomm[TASK_COMM_LEN];
250	if (strcmp(warncomm, current->comm) && warned < 5) {
251		strcpy(warncomm,  current->comm);
252		printk(KERN_WARNING "process `%s' is using obsolete "
253		       "%s SO_BSDCOMPAT\n", warncomm, name);
254		warned++;
255	}
256}
257
258static void sock_disable_timestamp(struct sock *sk)
259{
260	if (sock_flag(sk, SOCK_TIMESTAMP)) {
261		sock_reset_flag(sk, SOCK_TIMESTAMP);
262		net_disable_timestamp();
263	}
264}
265
266
267int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268{
269	int err = 0;
270	int skb_len;
271
272	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
273	   number of warnings when compiling with -W --ANK
274	 */
275	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276	    (unsigned)sk->sk_rcvbuf) {
277		err = -ENOMEM;
278		goto out;
279	}
280
281	err = sk_filter(sk, skb);
282	if (err)
283		goto out;
284
285	if (!sk_rmem_schedule(sk, skb->truesize)) {
286		err = -ENOBUFS;
287		goto out;
288	}
289
290	skb->dev = NULL;
291	skb_set_owner_r(skb, sk);
292	/*
293	 * release dst right now while it's hot
294	 */
295	dst_release(skb->dst);
296	skb->dst = NULL;
297	/* Cache the SKB length before we tack it onto the receive
298	 * queue.  Once it is added it no longer belongs to us and
299	 * may be freed by other threads of control pulling packets
300	 * from the queue.
301	 */
302	skb_len = skb->len;
303
304	skb_queue_tail(&sk->sk_receive_queue, skb);
305
306	if (!sock_flag(sk, SOCK_DEAD))
307		sk->sk_data_ready(sk, skb_len);
308out:
309	return err;
310}
311EXPORT_SYMBOL(sock_queue_rcv_skb);
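
/*
 * Illustrative sketch (hypothetical protocol, not taken from this file):
 * a protocol's delivery routine typically hands a freshly received skb to
 * the socket with sock_queue_rcv_skb() and frees it itself on failure:
 *
 *	static int myproto_deliver(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */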
312
313int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
314{
315	int rc = NET_RX_SUCCESS;
316
317	if (sk_filter(sk, skb))
318		goto discard_and_relse;
319
320	skb->dev = NULL;
321
322	if (nested)
323		bh_lock_sock_nested(sk);
324	else
325		bh_lock_sock(sk);
326	if (!sock_owned_by_user(sk)) {
327		/*
328		 * trylock + unlock semantics:
329		 */
330		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
331
332		rc = sk_backlog_rcv(sk, skb);
333
334		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
335	} else
336		sk_add_backlog(sk, skb);
337	bh_unlock_sock(sk);
338out:
339	sock_put(sk);
340	return rc;
341discard_and_relse:
342	kfree_skb(skb);
343	goto out;
344}
345EXPORT_SYMBOL(sk_receive_skb);
346
347struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
348{
349	struct dst_entry *dst = sk->sk_dst_cache;
350
351	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
352		sk->sk_dst_cache = NULL;
353		dst_release(dst);
354		return NULL;
355	}
356
357	return dst;
358}
359EXPORT_SYMBOL(__sk_dst_check);
360
361struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
362{
363	struct dst_entry *dst = sk_dst_get(sk);
364
365	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
366		sk_dst_reset(sk);
367		dst_release(dst);
368		return NULL;
369	}
370
371	return dst;
372}
373EXPORT_SYMBOL(sk_dst_check);
374
375static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
376{
377	int ret = -ENOPROTOOPT;
378#ifdef CONFIG_NETDEVICES
379	struct net *net = sock_net(sk);
380	char devname[IFNAMSIZ];
381	int index;
382
383	/* Sorry... */
384	ret = -EPERM;
385	if (!capable(CAP_NET_RAW))
386		goto out;
387
388	ret = -EINVAL;
389	if (optlen < 0)
390		goto out;
391
392	/* Bind this socket to a particular device like "eth0",
393	 * as specified in the passed interface name. If the
394	 * name is "" or the option length is zero the socket
395	 * is not bound.
396	 */
397	if (optlen > IFNAMSIZ - 1)
398		optlen = IFNAMSIZ - 1;
399	memset(devname, 0, sizeof(devname));
400
401	ret = -EFAULT;
402	if (copy_from_user(devname, optval, optlen))
403		goto out;
404
405	if (devname[0] == '\0') {
406		index = 0;
407	} else {
408		struct net_device *dev = dev_get_by_name(net, devname);
409
410		ret = -ENODEV;
411		if (!dev)
412			goto out;
413
414		index = dev->ifindex;
415		dev_put(dev);
416	}
417
418	lock_sock(sk);
419	sk->sk_bound_dev_if = index;
420	sk_dst_reset(sk);
421	release_sock(sk);
422
423	ret = 0;
424
425out:
426#endif
427
428	return ret;
429}
430
431static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
432{
433	if (valbool)
434		sock_set_flag(sk, bit);
435	else
436		sock_reset_flag(sk, bit);
437}
438
439/*
440 *	This is meant for all protocols to use and covers goings on
441 *	at the socket level. Everything here is generic.
442 */
443
444int sock_setsockopt(struct socket *sock, int level, int optname,
445		    char __user *optval, int optlen)
446{
447	struct sock *sk=sock->sk;
448	int val;
449	int valbool;
450	struct linger ling;
451	int ret = 0;
452
453	/*
454	 *	Options without arguments
455	 */
456
457	if (optname == SO_BINDTODEVICE)
458		return sock_bindtodevice(sk, optval, optlen);
459
460	if (optlen < sizeof(int))
461		return -EINVAL;
462
463	if (get_user(val, (int __user *)optval))
464		return -EFAULT;
465
466	valbool = val?1:0;
467
468	lock_sock(sk);
469
470	switch(optname) {
471	case SO_DEBUG:
472		if (val && !capable(CAP_NET_ADMIN)) {
473			ret = -EACCES;
474		} else
475			sock_valbool_flag(sk, SOCK_DBG, valbool);
476		break;
477	case SO_REUSEADDR:
478		sk->sk_reuse = valbool;
479		break;
480	case SO_TYPE:
481	case SO_ERROR:
482		ret = -ENOPROTOOPT;
483		break;
484	case SO_DONTROUTE:
485		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
486		break;
487	case SO_BROADCAST:
488		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
489		break;
490	case SO_SNDBUF:
491		/* Don't error on this. BSD doesn't, and if you think
492		   about it this is right. Otherwise apps have to
493		   play 'guess the biggest size' games. RCVBUF/SNDBUF
494		   are treated in BSD as hints */
495
496		if (val > sysctl_wmem_max)
497			val = sysctl_wmem_max;
498set_sndbuf:
499		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
500		if ((val * 2) < SOCK_MIN_SNDBUF)
501			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
502		else
503			sk->sk_sndbuf = val * 2;
504
505		/*
506		 *	Wake up sending tasks if we
507		 *	upped the value.
508		 */
509		sk->sk_write_space(sk);
510		break;
511
512	case SO_SNDBUFFORCE:
513		if (!capable(CAP_NET_ADMIN)) {
514			ret = -EPERM;
515			break;
516		}
517		goto set_sndbuf;
518
519	case SO_RCVBUF:
520		/* Don't error on this. BSD doesn't, and if you think
521		   about it this is right. Otherwise apps have to
522		   play 'guess the biggest size' games. RCVBUF/SNDBUF
523		   are treated in BSD as hints */
524
525		if (val > sysctl_rmem_max)
526			val = sysctl_rmem_max;
527set_rcvbuf:
528		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
529		/*
530		 * We double it on the way in to account for
531		 * "struct sk_buff" etc. overhead.   Applications
532		 * assume that the SO_RCVBUF setting they make will
533		 * allow that much actual data to be received on that
534		 * socket.
535		 *
536		 * Applications are unaware that "struct sk_buff" and
537		 * other overheads allocate from the receive buffer
538		 * during socket buffer allocation.
539		 *
540		 * And after considering the possible alternatives,
541		 * returning the value we actually used in getsockopt
542		 * is the most desirable behavior.
543		 */
544		if ((val * 2) < SOCK_MIN_RCVBUF)
545			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
546		else
547			sk->sk_rcvbuf = val * 2;
548		break;
549
550	case SO_RCVBUFFORCE:
551		if (!capable(CAP_NET_ADMIN)) {
552			ret = -EPERM;
553			break;
554		}
555		goto set_rcvbuf;
556
557	case SO_KEEPALIVE:
558#ifdef CONFIG_INET
559		if (sk->sk_protocol == IPPROTO_TCP)
560			tcp_set_keepalive(sk, valbool);
561#endif
562		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
563		break;
564
565	case SO_OOBINLINE:
566		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
567		break;
568
569	case SO_NO_CHECK:
570		sk->sk_no_check = valbool;
571		break;
572
573	case SO_PRIORITY:
574		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
575			sk->sk_priority = val;
576		else
577			ret = -EPERM;
578		break;
579
580	case SO_LINGER:
581		if (optlen < sizeof(ling)) {
582			ret = -EINVAL;	/* 1003.1g */
583			break;
584		}
585		if (copy_from_user(&ling,optval,sizeof(ling))) {
586			ret = -EFAULT;
587			break;
588		}
589		if (!ling.l_onoff)
590			sock_reset_flag(sk, SOCK_LINGER);
591		else {
592#if (BITS_PER_LONG == 32)
593			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
594				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
595			else
596#endif
597				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
598			sock_set_flag(sk, SOCK_LINGER);
599		}
600		break;
601
602	case SO_BSDCOMPAT:
603		sock_warn_obsolete_bsdism("setsockopt");
604		break;
605
606	case SO_PASSCRED:
607		if (valbool)
608			set_bit(SOCK_PASSCRED, &sock->flags);
609		else
610			clear_bit(SOCK_PASSCRED, &sock->flags);
611		break;
612
613	case SO_TIMESTAMP:
614	case SO_TIMESTAMPNS:
615		if (valbool)  {
616			if (optname == SO_TIMESTAMP)
617				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
618			else
619				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
620			sock_set_flag(sk, SOCK_RCVTSTAMP);
621			sock_enable_timestamp(sk);
622		} else {
623			sock_reset_flag(sk, SOCK_RCVTSTAMP);
624			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
625		}
626		break;
627
628	case SO_RCVLOWAT:
629		if (val < 0)
630			val = INT_MAX;
631		sk->sk_rcvlowat = val ? : 1;
632		break;
633
634	case SO_RCVTIMEO:
635		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
636		break;
637
638	case SO_SNDTIMEO:
639		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
640		break;
641
642	case SO_ATTACH_FILTER:
643		ret = -EINVAL;
644		if (optlen == sizeof(struct sock_fprog)) {
645			struct sock_fprog fprog;
646
647			ret = -EFAULT;
648			if (copy_from_user(&fprog, optval, sizeof(fprog)))
649				break;
650
651			ret = sk_attach_filter(&fprog, sk);
652		}
653		break;
654
655	case SO_DETACH_FILTER:
656		ret = sk_detach_filter(sk);
657		break;
658
659	case SO_PASSSEC:
660		if (valbool)
661			set_bit(SOCK_PASSSEC, &sock->flags);
662		else
663			clear_bit(SOCK_PASSSEC, &sock->flags);
664		break;
665	case SO_MARK:
666		if (!capable(CAP_NET_ADMIN))
667			ret = -EPERM;
668		else {
669			sk->sk_mark = val;
670		}
671		break;
672
673		/* We implement the SO_SNDLOWAT etc to
674		   not be settable (1003.1g 5.3) */
675	default:
676		ret = -ENOPROTOOPT;
677		break;
678	}
679	release_sock(sk);
680	return ret;
681}
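
/*
 * Illustrative sketch (userspace, not part of this file): because the
 * SO_SNDBUF/SO_RCVBUF cases above double the requested value to cover
 * struct sk_buff overhead, reading the option back returns roughly twice
 * what was set (subject to the sysctl_rmem_max/sysctl_wmem_max caps):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * after which out is typically 2 * 65536 == 131072.
 */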
682
683
684int sock_getsockopt(struct socket *sock, int level, int optname,
685		    char __user *optval, int __user *optlen)
686{
687	struct sock *sk = sock->sk;
688
689	union {
690		int val;
691		struct linger ling;
692		struct timeval tm;
693	} v;
694
695	unsigned int lv = sizeof(int);
696	int len;
697
698	if (get_user(len, optlen))
699		return -EFAULT;
700	if (len < 0)
701		return -EINVAL;
702
703	switch(optname) {
704	case SO_DEBUG:
705		v.val = sock_flag(sk, SOCK_DBG);
706		break;
707
708	case SO_DONTROUTE:
709		v.val = sock_flag(sk, SOCK_LOCALROUTE);
710		break;
711
712	case SO_BROADCAST:
713		v.val = !!sock_flag(sk, SOCK_BROADCAST);
714		break;
715
716	case SO_SNDBUF:
717		v.val = sk->sk_sndbuf;
718		break;
719
720	case SO_RCVBUF:
721		v.val = sk->sk_rcvbuf;
722		break;
723
724	case SO_REUSEADDR:
725		v.val = sk->sk_reuse;
726		break;
727
728	case SO_KEEPALIVE:
729		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
730		break;
731
732	case SO_TYPE:
733		v.val = sk->sk_type;
734		break;
735
736	case SO_ERROR:
737		v.val = -sock_error(sk);
738		if (v.val==0)
739			v.val = xchg(&sk->sk_err_soft, 0);
740		break;
741
742	case SO_OOBINLINE:
743		v.val = !!sock_flag(sk, SOCK_URGINLINE);
744		break;
745
746	case SO_NO_CHECK:
747		v.val = sk->sk_no_check;
748		break;
749
750	case SO_PRIORITY:
751		v.val = sk->sk_priority;
752		break;
753
754	case SO_LINGER:
755		lv		= sizeof(v.ling);
756		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
757		v.ling.l_linger	= sk->sk_lingertime / HZ;
758		break;
759
760	case SO_BSDCOMPAT:
761		sock_warn_obsolete_bsdism("getsockopt");
762		break;
763
764	case SO_TIMESTAMP:
765		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
766				!sock_flag(sk, SOCK_RCVTSTAMPNS);
767		break;
768
769	case SO_TIMESTAMPNS:
770		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
771		break;
772
773	case SO_RCVTIMEO:
774		lv=sizeof(struct timeval);
775		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
776			v.tm.tv_sec = 0;
777			v.tm.tv_usec = 0;
778		} else {
779			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
780			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
781		}
782		break;
783
784	case SO_SNDTIMEO:
785		lv=sizeof(struct timeval);
786		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
787			v.tm.tv_sec = 0;
788			v.tm.tv_usec = 0;
789		} else {
790			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
791			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
792		}
793		break;
794
795	case SO_RCVLOWAT:
796		v.val = sk->sk_rcvlowat;
797		break;
798
799	case SO_SNDLOWAT:
800		v.val=1;
801		break;
802
803	case SO_PASSCRED:
804		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
805		break;
806
807	case SO_PEERCRED:
808		if (len > sizeof(sk->sk_peercred))
809			len = sizeof(sk->sk_peercred);
810		if (copy_to_user(optval, &sk->sk_peercred, len))
811			return -EFAULT;
812		goto lenout;
813
814	case SO_PEERNAME:
815	{
816		char address[128];
817
818		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
819			return -ENOTCONN;
820		if (lv < len)
821			return -EINVAL;
822		if (copy_to_user(optval, address, len))
823			return -EFAULT;
824		goto lenout;
825	}
826
827	/* Dubious BSD thing... Probably nobody even uses it, but
828	 * the UNIX standard wants it for whatever reason... -DaveM
829	 */
830	case SO_ACCEPTCONN:
831		v.val = sk->sk_state == TCP_LISTEN;
832		break;
833
834	case SO_PASSSEC:
835		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
836		break;
837
838	case SO_PEERSEC:
839		return security_socket_getpeersec_stream(sock, optval, optlen, len);
840
841	case SO_MARK:
842		v.val = sk->sk_mark;
843		break;
844
845	default:
846		return -ENOPROTOOPT;
847	}
848
849	if (len > lv)
850		len = lv;
851	if (copy_to_user(optval, &v, len))
852		return -EFAULT;
853lenout:
854	if (put_user(len, optlen))
855		return -EFAULT;
856	return 0;
857}
858
859/*
860 * Initialize an sk_lock.
861 *
862 * (We also register the sk_lock with the lock validator.)
863 */
864static inline void sock_lock_init(struct sock *sk)
865{
866	sock_lock_init_class_and_name(sk,
867			af_family_slock_key_strings[sk->sk_family],
868			af_family_slock_keys + sk->sk_family,
869			af_family_key_strings[sk->sk_family],
870			af_family_keys + sk->sk_family);
871}
872
873static void sock_copy(struct sock *nsk, const struct sock *osk)
874{
875#ifdef CONFIG_SECURITY_NETWORK
876	void *sptr = nsk->sk_security;
877#endif
878
879	memcpy(nsk, osk, osk->sk_prot->obj_size);
880#ifdef CONFIG_SECURITY_NETWORK
881	nsk->sk_security = sptr;
882	security_sk_clone(osk, nsk);
883#endif
884}
885
886static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
887		int family)
888{
889	struct sock *sk;
890	struct kmem_cache *slab;
891
892	slab = prot->slab;
893	if (slab != NULL)
894		sk = kmem_cache_alloc(slab, priority);
895	else
896		sk = kmalloc(prot->obj_size, priority);
897
898	if (sk != NULL) {
899		if (security_sk_alloc(sk, family, priority))
900			goto out_free;
901
902		if (!try_module_get(prot->owner))
903			goto out_free_sec;
904	}
905
906	return sk;
907
908out_free_sec:
909	security_sk_free(sk);
910out_free:
911	if (slab != NULL)
912		kmem_cache_free(slab, sk);
913	else
914		kfree(sk);
915	return NULL;
916}
917
918static void sk_prot_free(struct proto *prot, struct sock *sk)
919{
920	struct kmem_cache *slab;
921	struct module *owner;
922
923	owner = prot->owner;
924	slab = prot->slab;
925
926	security_sk_free(sk);
927	if (slab != NULL)
928		kmem_cache_free(slab, sk);
929	else
930		kfree(sk);
931	module_put(owner);
932}
933
934/**
935 *	sk_alloc - All socket objects are allocated here
936 *	@net: the applicable net namespace
937 *	@family: protocol family
938 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
939 *	@prot: struct proto associated with this new sock instance
940 */
941struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
942		      struct proto *prot)
943{
944	struct sock *sk;
945
946	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
947	if (sk) {
948		sk->sk_family = family;
949		/*
950		 * See comment in struct sock definition to understand
951		 * why we need sk_prot_creator -acme
952		 */
953		sk->sk_prot = sk->sk_prot_creator = prot;
954		sock_lock_init(sk);
955		sock_net_set(sk, get_net(net));
956	}
957
958	return sk;
959}
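
/*
 * Illustrative sketch (roughly what an address family's create routine
 * does; the details here are an assumption, not lifted from any one
 * family):
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &udp_prot);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */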
960
961void sk_free(struct sock *sk)
962{
963	struct sk_filter *filter;
964
965	if (sk->sk_destruct)
966		sk->sk_destruct(sk);
967
968	filter = rcu_dereference(sk->sk_filter);
969	if (filter) {
970		sk_filter_uncharge(sk, filter);
971		rcu_assign_pointer(sk->sk_filter, NULL);
972	}
973
974	sock_disable_timestamp(sk);
975
976	if (atomic_read(&sk->sk_omem_alloc))
977		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
978		       __func__, atomic_read(&sk->sk_omem_alloc));
979
980	put_net(sock_net(sk));
981	sk_prot_free(sk->sk_prot_creator, sk);
982}
983
984/*
985 * The last sock_put should drop the reference to sk->sk_net. It has already
986 * been dropped in sk_change_net. Taking a reference to the stopping namespace
987 * is not an option.
988 * Take a reference to the socket to remove it from the hash while it is still
989 * _alive_, and after that destroy it in the context of init_net.
990 */
991void sk_release_kernel(struct sock *sk)
992{
993	if (sk == NULL || sk->sk_socket == NULL)
994		return;
995
996	sock_hold(sk);
997	sock_release(sk->sk_socket);
998	release_net(sock_net(sk));
999	sock_net_set(sk, get_net(&init_net));
1000	sock_put(sk);
1001}
1002EXPORT_SYMBOL(sk_release_kernel);
1003
1004struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1005{
1006	struct sock *newsk;
1007
1008	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1009	if (newsk != NULL) {
1010		struct sk_filter *filter;
1011
1012		sock_copy(newsk, sk);
1013
1014		/* SANITY */
1015		get_net(sock_net(newsk));
1016		sk_node_init(&newsk->sk_node);
1017		sock_lock_init(newsk);
1018		bh_lock_sock(newsk);
1019		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1020
1021		atomic_set(&newsk->sk_rmem_alloc, 0);
1022		atomic_set(&newsk->sk_wmem_alloc, 0);
1023		atomic_set(&newsk->sk_omem_alloc, 0);
1024		skb_queue_head_init(&newsk->sk_receive_queue);
1025		skb_queue_head_init(&newsk->sk_write_queue);
1026#ifdef CONFIG_NET_DMA
1027		skb_queue_head_init(&newsk->sk_async_wait_queue);
1028#endif
1029
1030		rwlock_init(&newsk->sk_dst_lock);
1031		rwlock_init(&newsk->sk_callback_lock);
1032		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1033				af_callback_keys + newsk->sk_family,
1034				af_family_clock_key_strings[newsk->sk_family]);
1035
1036		newsk->sk_dst_cache	= NULL;
1037		newsk->sk_wmem_queued	= 0;
1038		newsk->sk_forward_alloc = 0;
1039		newsk->sk_send_head	= NULL;
1040		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1041
1042		sock_reset_flag(newsk, SOCK_DONE);
1043		skb_queue_head_init(&newsk->sk_error_queue);
1044
1045		filter = newsk->sk_filter;
1046		if (filter != NULL)
1047			sk_filter_charge(newsk, filter);
1048
1049		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1050			/* It is still a raw copy of the parent, so invalidate
1051			 * the destructor and do a plain sk_free() */
1052			newsk->sk_destruct = NULL;
1053			sk_free(newsk);
1054			newsk = NULL;
1055			goto out;
1056		}
1057
1058		newsk->sk_err	   = 0;
1059		newsk->sk_priority = 0;
1060		atomic_set(&newsk->sk_refcnt, 2);
1061
1062		/*
1063		 * Increment the counter in the same struct proto as the master
1064		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1065		 * is the same as sk->sk_prot->socks, as this field was copied
1066		 * with memcpy).
1067		 *
1068		 * This _changes_ the previous behaviour, where
1069		 * tcp_create_openreq_child was always incrementing the
1070		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1071		 * to be taken into account in all callers. -acme
1072		 */
1073		sk_refcnt_debug_inc(newsk);
1074		sk_set_socket(newsk, NULL);
1075		newsk->sk_sleep	 = NULL;
1076
1077		if (newsk->sk_prot->sockets_allocated)
1078			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1079	}
1080out:
1081	return newsk;
1082}
1083
1084EXPORT_SYMBOL_GPL(sk_clone);
1085
1086void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1087{
1088	__sk_dst_set(sk, dst);
1089	sk->sk_route_caps = dst->dev->features;
1090	if (sk->sk_route_caps & NETIF_F_GSO)
1091		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1092	if (sk_can_gso(sk)) {
1093		if (dst->header_len) {
1094			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1095		} else {
1096			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1097			sk->sk_gso_max_size = dst->dev->gso_max_size;
1098		}
1099	}
1100}
1101EXPORT_SYMBOL_GPL(sk_setup_caps);
1102
1103void __init sk_init(void)
1104{
1105	if (num_physpages <= 4096) {
1106		sysctl_wmem_max = 32767;
1107		sysctl_rmem_max = 32767;
1108		sysctl_wmem_default = 32767;
1109		sysctl_rmem_default = 32767;
1110	} else if (num_physpages >= 131072) {
1111		sysctl_wmem_max = 131071;
1112		sysctl_rmem_max = 131071;
1113	}
1114}
1115
1116/*
1117 *	Simple resource managers for sockets.
1118 */
1119
1120
1121/*
1122 * Write buffer destructor automatically called from kfree_skb.
1123 */
1124void sock_wfree(struct sk_buff *skb)
1125{
1126	struct sock *sk = skb->sk;
1127
1128	/* In case it might be waiting for more memory. */
1129	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1130	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1131		sk->sk_write_space(sk);
1132	sock_put(sk);
1133}
1134
1135/*
1136 * Read buffer destructor automatically called from kfree_skb.
1137 */
1138void sock_rfree(struct sk_buff *skb)
1139{
1140	struct sock *sk = skb->sk;
1141
1142	skb_truesize_check(skb);
1143	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1144	sk_mem_uncharge(skb->sk, skb->truesize);
1145}
1146
1147
1148int sock_i_uid(struct sock *sk)
1149{
1150	int uid;
1151
1152	read_lock(&sk->sk_callback_lock);
1153	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1154	read_unlock(&sk->sk_callback_lock);
1155	return uid;
1156}
1157
1158unsigned long sock_i_ino(struct sock *sk)
1159{
1160	unsigned long ino;
1161
1162	read_lock(&sk->sk_callback_lock);
1163	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1164	read_unlock(&sk->sk_callback_lock);
1165	return ino;
1166}
1167
1168/*
1169 * Allocate a skb from the socket's send buffer.
1170 */
1171struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1172			     gfp_t priority)
1173{
1174	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1175		struct sk_buff * skb = alloc_skb(size, priority);
1176		if (skb) {
1177			skb_set_owner_w(skb, sk);
1178			return skb;
1179		}
1180	}
1181	return NULL;
1182}
1183
1184/*
1185 * Allocate a skb from the socket's receive buffer.
1186 */
1187struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1188			     gfp_t priority)
1189{
1190	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1191		struct sk_buff *skb = alloc_skb(size, priority);
1192		if (skb) {
1193			skb_set_owner_r(skb, sk);
1194			return skb;
1195		}
1196	}
1197	return NULL;
1198}
1199
1200/*
1201 * Allocate a memory block from the socket's option memory buffer.
1202 */
1203void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1204{
1205	if ((unsigned)size <= sysctl_optmem_max &&
1206	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1207		void *mem;
1208		/* First do the add, to avoid the race if kmalloc
1209		 * might sleep.
1210		 */
1211		atomic_add(size, &sk->sk_omem_alloc);
1212		mem = kmalloc(size, priority);
1213		if (mem)
1214			return mem;
1215		atomic_sub(size, &sk->sk_omem_alloc);
1216	}
1217	return NULL;
1218}
1219
1220/*
1221 * Free an option memory block.
1222 */
1223void sock_kfree_s(struct sock *sk, void *mem, int size)
1224{
1225	kfree(mem);
1226	atomic_sub(size, &sk->sk_omem_alloc);
1227}
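
/*
 * Illustrative sketch (hypothetical caller): option memory must be
 * released with the same size it was charged with, so users pair the
 * two helpers:
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */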
1228
1229/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1230   I think these locks should be removed for datagram sockets.
1231 */
1232static long sock_wait_for_wmem(struct sock * sk, long timeo)
1233{
1234	DEFINE_WAIT(wait);
1235
1236	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1237	for (;;) {
1238		if (!timeo)
1239			break;
1240		if (signal_pending(current))
1241			break;
1242		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1243		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1244		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1245			break;
1246		if (sk->sk_shutdown & SEND_SHUTDOWN)
1247			break;
1248		if (sk->sk_err)
1249			break;
1250		timeo = schedule_timeout(timeo);
1251	}
1252	finish_wait(sk->sk_sleep, &wait);
1253	return timeo;
1254}
1255
1256
1257/*
1258 *	Generic send/receive buffer handlers
1259 */
1260
1261static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1262					    unsigned long header_len,
1263					    unsigned long data_len,
1264					    int noblock, int *errcode)
1265{
1266	struct sk_buff *skb;
1267	gfp_t gfp_mask;
1268	long timeo;
1269	int err;
1270
1271	gfp_mask = sk->sk_allocation;
1272	if (gfp_mask & __GFP_WAIT)
1273		gfp_mask |= __GFP_REPEAT;
1274
1275	timeo = sock_sndtimeo(sk, noblock);
1276	while (1) {
1277		err = sock_error(sk);
1278		if (err != 0)
1279			goto failure;
1280
1281		err = -EPIPE;
1282		if (sk->sk_shutdown & SEND_SHUTDOWN)
1283			goto failure;
1284
1285		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1286			skb = alloc_skb(header_len, gfp_mask);
1287			if (skb) {
1288				int npages;
1289				int i;
1290
1291				/* No pages, we're done... */
1292				if (!data_len)
1293					break;
1294
1295				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1296				skb->truesize += data_len;
1297				skb_shinfo(skb)->nr_frags = npages;
1298				for (i = 0; i < npages; i++) {
1299					struct page *page;
1300					skb_frag_t *frag;
1301
1302					page = alloc_pages(sk->sk_allocation, 0);
1303					if (!page) {
1304						err = -ENOBUFS;
1305						skb_shinfo(skb)->nr_frags = i;
1306						kfree_skb(skb);
1307						goto failure;
1308					}
1309
1310					frag = &skb_shinfo(skb)->frags[i];
1311					frag->page = page;
1312					frag->page_offset = 0;
1313					frag->size = (data_len >= PAGE_SIZE ?
1314						      PAGE_SIZE :
1315						      data_len);
1316					data_len -= PAGE_SIZE;
1317				}
1318
1319				/* Full success... */
1320				break;
1321			}
1322			err = -ENOBUFS;
1323			goto failure;
1324		}
1325		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1326		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1327		err = -EAGAIN;
1328		if (!timeo)
1329			goto failure;
1330		if (signal_pending(current))
1331			goto interrupted;
1332		timeo = sock_wait_for_wmem(sk, timeo);
1333	}
1334
1335	skb_set_owner_w(skb, sk);
1336	return skb;
1337
1338interrupted:
1339	err = sock_intr_errno(timeo);
1340failure:
1341	*errcode = err;
1342	return NULL;
1343}
1344
1345struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1346				    int noblock, int *errcode)
1347{
1348	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1349}
1350
1351static void __lock_sock(struct sock *sk)
1352{
1353	DEFINE_WAIT(wait);
1354
1355	for (;;) {
1356		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1357					TASK_UNINTERRUPTIBLE);
1358		spin_unlock_bh(&sk->sk_lock.slock);
1359		schedule();
1360		spin_lock_bh(&sk->sk_lock.slock);
1361		if (!sock_owned_by_user(sk))
1362			break;
1363	}
1364	finish_wait(&sk->sk_lock.wq, &wait);
1365}
1366
1367static void __release_sock(struct sock *sk)
1368{
1369	struct sk_buff *skb = sk->sk_backlog.head;
1370
1371	do {
1372		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1373		bh_unlock_sock(sk);
1374
1375		do {
1376			struct sk_buff *next = skb->next;
1377
1378			skb->next = NULL;
1379			sk_backlog_rcv(sk, skb);
1380
1381			/*
1382			 * We are in process context here with softirqs
1383			 * disabled, use cond_resched_softirq() to preempt.
1384			 * This is safe to do because we've taken the backlog
1385			 * queue private:
1386			 */
1387			cond_resched_softirq();
1388
1389			skb = next;
1390		} while (skb != NULL);
1391
1392		bh_lock_sock(sk);
1393	} while ((skb = sk->sk_backlog.head) != NULL);
1394}
1395
1396/**
1397 * sk_wait_data - wait for data to arrive at sk_receive_queue
1398 * @sk:    sock to wait on
1399 * @timeo: for how long
1400 *
1401 * Now the socket state, including sk->sk_err, is changed only under the lock,
1402 * hence we may omit checks after joining the wait queue.
1403 * We check the receive queue before schedule() only as an optimization;
1404 * it is very likely that release_sock() added new data.
1405 */
1406int sk_wait_data(struct sock *sk, long *timeo)
1407{
1408	int rc;
1409	DEFINE_WAIT(wait);
1410
1411	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1412	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1413	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1414	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1415	finish_wait(sk->sk_sleep, &wait);
1416	return rc;
1417}
1418
1419EXPORT_SYMBOL(sk_wait_data);
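
/*
 * Illustrative sketch (hypothetical caller): a blocking recvmsg
 * implementation typically loops on sk_wait_data() until data arrives
 * or the timeout/a signal cuts the wait short:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */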
1420
1421/**
1422 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1423 *	@sk: socket
1424 *	@size: memory size to allocate
1425 *	@kind: allocation type
1426 *
1427 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1428 *	rmem allocation. This function assumes that protocols which have
1429 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1430 */
1431int __sk_mem_schedule(struct sock *sk, int size, int kind)
1432{
1433	struct proto *prot = sk->sk_prot;
1434	int amt = sk_mem_pages(size);
1435	int allocated;
1436
1437	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1438	allocated = atomic_add_return(amt, prot->memory_allocated);
1439
1440	/* Under limit. */
1441	if (allocated <= prot->sysctl_mem[0]) {
1442		if (prot->memory_pressure && *prot->memory_pressure)
1443			*prot->memory_pressure = 0;
1444		return 1;
1445	}
1446
1447	/* Under pressure. */
1448	if (allocated > prot->sysctl_mem[1])
1449		if (prot->enter_memory_pressure)
1450			prot->enter_memory_pressure(sk);
1451
1452	/* Over hard limit. */
1453	if (allocated > prot->sysctl_mem[2])
1454		goto suppress_allocation;
1455
1456	/* guarantee minimum buffer size under pressure */
1457	if (kind == SK_MEM_RECV) {
1458		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1459			return 1;
1460	} else { /* SK_MEM_SEND */
1461		if (sk->sk_type == SOCK_STREAM) {
1462			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1463				return 1;
1464		} else if (atomic_read(&sk->sk_wmem_alloc) <
1465			   prot->sysctl_wmem[0])
1466				return 1;
1467	}
1468
1469	if (prot->memory_pressure) {
1470		int alloc;
1471
1472		if (!*prot->memory_pressure)
1473			return 1;
1474		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1475		if (prot->sysctl_mem[2] > alloc *
1476		    sk_mem_pages(sk->sk_wmem_queued +
1477				 atomic_read(&sk->sk_rmem_alloc) +
1478				 sk->sk_forward_alloc))
1479			return 1;
1480	}
1481
1482suppress_allocation:
1483
1484	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1485		sk_stream_moderate_sndbuf(sk);
1486
1487		/* Fail only if socket is _under_ its sndbuf.
1488		 * In this case we cannot block, so we have to fail.
1489		 */
1490		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1491			return 1;
1492	}
1493
1494	/* Alas. Undo changes. */
1495	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1496	atomic_sub(amt, prot->memory_allocated);
1497	return 0;
1498}
1499
1500EXPORT_SYMBOL(__sk_mem_schedule);
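
/*
 * Illustrative arithmetic (assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096):
 * charging 6000 bytes rounds up to sk_mem_pages(6000) == 2 quanta, so
 * sk_forward_alloc grows by 8192 and memory_allocated by 2; the unused
 * 2192 bytes remain in sk_forward_alloc for subsequent allocations.
 */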
1501
1502/**
1503 *	__sk_mem_reclaim - reclaim memory_allocated
1504 *	@sk: socket
1505 */
1506void __sk_mem_reclaim(struct sock *sk)
1507{
1508	struct proto *prot = sk->sk_prot;
1509
1510	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1511		   prot->memory_allocated);
1512	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1513
1514	if (prot->memory_pressure && *prot->memory_pressure &&
1515	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1516		*prot->memory_pressure = 0;
1517}
1518
1519EXPORT_SYMBOL(__sk_mem_reclaim);
1520
1521
1522/*
1523 * Set of default routines for initialising struct proto_ops when
1524 * the protocol does not support a particular function. In certain
1525 * cases where it makes no sense for a protocol to have a "do nothing"
1526 * function, some default processing is provided.
1527 */
1528
1529int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1530{
1531	return -EOPNOTSUPP;
1532}
1533
1534int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1535		    int len, int flags)
1536{
1537	return -EOPNOTSUPP;
1538}
1539
1540int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1541{
1542	return -EOPNOTSUPP;
1543}
1544
1545int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1546{
1547	return -EOPNOTSUPP;
1548}
1549
1550int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1551		    int *len, int peer)
1552{
1553	return -EOPNOTSUPP;
1554}
1555
1556unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1557{
1558	return 0;
1559}
1560
1561int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1562{
1563	return -EOPNOTSUPP;
1564}
1565
1566int sock_no_listen(struct socket *sock, int backlog)
1567{
1568	return -EOPNOTSUPP;
1569}
1570
1571int sock_no_shutdown(struct socket *sock, int how)
1572{
1573	return -EOPNOTSUPP;
1574}
1575
1576int sock_no_setsockopt(struct socket *sock, int level, int optname,
1577		    char __user *optval, int optlen)
1578{
1579	return -EOPNOTSUPP;
1580}
1581
1582int sock_no_getsockopt(struct socket *sock, int level, int optname,
1583		    char __user *optval, int __user *optlen)
1584{
1585	return -EOPNOTSUPP;
1586}
1587
1588int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1589		    size_t len)
1590{
1591	return -EOPNOTSUPP;
1592}
1593
1594int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1595		    size_t len, int flags)
1596{
1597	return -EOPNOTSUPP;
1598}
1599
1600int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1601{
1602	/* Mirror missing mmap method error code */
1603	return -ENODEV;
1604}
1605
1606ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1607{
1608	ssize_t res;
1609	struct msghdr msg = {.msg_flags = flags};
1610	struct kvec iov;
1611	char *kaddr = kmap(page);
1612	iov.iov_base = kaddr + offset;
1613	iov.iov_len = size;
1614	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1615	kunmap(page);
1616	return res;
1617}
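
/*
 * Illustrative sketch (hypothetical family; only a subset of the fields
 * is shown): a protocol that does not support a given operation simply
 * points the corresponding proto_ops member at one of the defaults above:
 *
 *	static const struct proto_ops myproto_ops = {
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */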
1618
1619/*
1620 *	Default Socket Callbacks
1621 */
1622
1623static void sock_def_wakeup(struct sock *sk)
1624{
1625	read_lock(&sk->sk_callback_lock);
1626	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627		wake_up_interruptible_all(sk->sk_sleep);
1628	read_unlock(&sk->sk_callback_lock);
1629}
1630
1631static void sock_def_error_report(struct sock *sk)
1632{
1633	read_lock(&sk->sk_callback_lock);
1634	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1635		wake_up_interruptible(sk->sk_sleep);
1636	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1637	read_unlock(&sk->sk_callback_lock);
1638}
1639
1640static void sock_def_readable(struct sock *sk, int len)
1641{
1642	read_lock(&sk->sk_callback_lock);
1643	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1644		wake_up_interruptible_sync(sk->sk_sleep);
1645	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1646	read_unlock(&sk->sk_callback_lock);
1647}
1648
1649static void sock_def_write_space(struct sock *sk)
1650{
1651	read_lock(&sk->sk_callback_lock);
1652
1653	/* Do not wake up a writer until he can make "significant"
1654	 * progress.  --DaveM
1655	 */
1656	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1657		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1658			wake_up_interruptible_sync(sk->sk_sleep);
1659
1660		/* Should agree with poll, otherwise some programs break */
1661		if (sock_writeable(sk))
1662			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1663	}
1664
1665	read_unlock(&sk->sk_callback_lock);
1666}
1667
1668static void sock_def_destruct(struct sock *sk)
1669{
1670	kfree(sk->sk_protinfo);
1671}
1672
1673void sk_send_sigurg(struct sock *sk)
1674{
1675	if (sk->sk_socket && sk->sk_socket->file)
1676		if (send_sigurg(&sk->sk_socket->file->f_owner))
1677			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1678}
1679
1680void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1681		    unsigned long expires)
1682{
1683	if (!mod_timer(timer, expires))
1684		sock_hold(sk);
1685}
1686
1687EXPORT_SYMBOL(sk_reset_timer);
1688
1689void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1690{
1691	if (timer_pending(timer) && del_timer(timer))
1692		__sock_put(sk);
1693}
1694
1695EXPORT_SYMBOL(sk_stop_timer);
1696
1697void sock_init_data(struct socket *sock, struct sock *sk)
1698{
1699	skb_queue_head_init(&sk->sk_receive_queue);
1700	skb_queue_head_init(&sk->sk_write_queue);
1701	skb_queue_head_init(&sk->sk_error_queue);
1702#ifdef CONFIG_NET_DMA
1703	skb_queue_head_init(&sk->sk_async_wait_queue);
1704#endif
1705
1706	sk->sk_send_head	=	NULL;
1707
1708	init_timer(&sk->sk_timer);
1709
1710	sk->sk_allocation	=	GFP_KERNEL;
1711	sk->sk_rcvbuf		=	sysctl_rmem_default;
1712	sk->sk_sndbuf		=	sysctl_wmem_default;
1713	sk->sk_state		=	TCP_CLOSE;
1714	sk_set_socket(sk, sock);
1715
1716	sock_set_flag(sk, SOCK_ZAPPED);
1717
1718	if (sock) {
1719		sk->sk_type	=	sock->type;
1720		sk->sk_sleep	=	&sock->wait;
1721		sock->sk	=	sk;
1722	} else
1723		sk->sk_sleep	=	NULL;
1724
1725	rwlock_init(&sk->sk_dst_lock);
1726	rwlock_init(&sk->sk_callback_lock);
1727	lockdep_set_class_and_name(&sk->sk_callback_lock,
1728			af_callback_keys + sk->sk_family,
1729			af_family_clock_key_strings[sk->sk_family]);
1730
1731	sk->sk_state_change	=	sock_def_wakeup;
1732	sk->sk_data_ready	=	sock_def_readable;
1733	sk->sk_write_space	=	sock_def_write_space;
1734	sk->sk_error_report	=	sock_def_error_report;
1735	sk->sk_destruct		=	sock_def_destruct;
1736
1737	sk->sk_sndmsg_page	=	NULL;
1738	sk->sk_sndmsg_off	=	0;
1739
1740	sk->sk_peercred.pid 	=	0;
1741	sk->sk_peercred.uid	=	-1;
1742	sk->sk_peercred.gid	=	-1;
1743	sk->sk_write_pending	=	0;
1744	sk->sk_rcvlowat		=	1;
1745	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1746	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1747
1748	sk->sk_stamp = ktime_set(-1L, 0);
1749
1750	atomic_set(&sk->sk_refcnt, 1);
1751	atomic_set(&sk->sk_drops, 0);
1752}
1753
1754void lock_sock_nested(struct sock *sk, int subclass)
1755{
1756	might_sleep();
1757	spin_lock_bh(&sk->sk_lock.slock);
1758	if (sk->sk_lock.owned)
1759		__lock_sock(sk);
1760	sk->sk_lock.owned = 1;
1761	spin_unlock(&sk->sk_lock.slock);
1762	/*
1763	 * The sk_lock has mutex_lock() semantics here:
1764	 */
1765	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1766	local_bh_enable();
1767}
1768
1769EXPORT_SYMBOL(lock_sock_nested);
1770
1771void release_sock(struct sock *sk)
1772{
1773	/*
1774	 * The sk_lock has mutex_unlock() semantics:
1775	 */
1776	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1777
1778	spin_lock_bh(&sk->sk_lock.slock);
1779	if (sk->sk_backlog.tail)
1780		__release_sock(sk);
1781	sk->sk_lock.owned = 0;
1782	if (waitqueue_active(&sk->sk_lock.wq))
1783		wake_up(&sk->sk_lock.wq);
1784	spin_unlock_bh(&sk->sk_lock.slock);
1785}
1786EXPORT_SYMBOL(release_sock);
1787
1788int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1789{
1790	struct timeval tv;
1791	if (!sock_flag(sk, SOCK_TIMESTAMP))
1792		sock_enable_timestamp(sk);
1793	tv = ktime_to_timeval(sk->sk_stamp);
1794	if (tv.tv_sec == -1)
1795		return -ENOENT;
1796	if (tv.tv_sec == 0) {
1797		sk->sk_stamp = ktime_get_real();
1798		tv = ktime_to_timeval(sk->sk_stamp);
1799	}
1800	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1801}
1802EXPORT_SYMBOL(sock_get_timestamp);
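
/*
 * Illustrative sketch (userspace): for families that wire it up (e.g.
 * inet's SIOCGSTAMP ioctl), sock_get_timestamp() lets an application read
 * the arrival time of the last received packet:
 *
 *	struct timeval tv;
 *
 *	ioctl(fd, SIOCGSTAMP, &tv);
 */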
1803
1804int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1805{
1806	struct timespec ts;
1807	if (!sock_flag(sk, SOCK_TIMESTAMP))
1808		sock_enable_timestamp(sk);
1809	ts = ktime_to_timespec(sk->sk_stamp);
1810	if (ts.tv_sec == -1)
1811		return -ENOENT;
1812	if (ts.tv_sec == 0) {
1813		sk->sk_stamp = ktime_get_real();
1814		ts = ktime_to_timespec(sk->sk_stamp);
1815	}
1816	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1817}
1818EXPORT_SYMBOL(sock_get_timestampns);
1819
1820void sock_enable_timestamp(struct sock *sk)
1821{
1822	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1823		sock_set_flag(sk, SOCK_TIMESTAMP);
1824		net_enable_timestamp();
1825	}
1826}
1827
1828/*
1829 *	Get a socket option on a socket.
1830 *
1831 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1832 *	asynchronous errors should be reported by getsockopt. We assume
1833 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1834 */
1835int sock_common_getsockopt(struct socket *sock, int level, int optname,
1836			   char __user *optval, int __user *optlen)
1837{
1838	struct sock *sk = sock->sk;
1839
1840	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1841}
1842
1843EXPORT_SYMBOL(sock_common_getsockopt);
1844
1845#ifdef CONFIG_COMPAT
1846int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1847				  char __user *optval, int __user *optlen)
1848{
1849	struct sock *sk = sock->sk;
1850
1851	if (sk->sk_prot->compat_getsockopt != NULL)
1852		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1853						      optval, optlen);
1854	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1855}
1856EXPORT_SYMBOL(compat_sock_common_getsockopt);
1857#endif
1858
1859int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1860			struct msghdr *msg, size_t size, int flags)
1861{
1862	struct sock *sk = sock->sk;
1863	int addr_len = 0;
1864	int err;
1865
1866	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1867				   flags & ~MSG_DONTWAIT, &addr_len);
1868	if (err >= 0)
1869		msg->msg_namelen = addr_len;
1870	return err;
1871}
1872
1873EXPORT_SYMBOL(sock_common_recvmsg);
1874
1875/*
1876 *	Set socket options on an inet socket.
1877 */
1878int sock_common_setsockopt(struct socket *sock, int level, int optname,
1879			   char __user *optval, int optlen)
1880{
1881	struct sock *sk = sock->sk;
1882
1883	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1884}
1885
1886EXPORT_SYMBOL(sock_common_setsockopt);
1887
1888#ifdef CONFIG_COMPAT
1889int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1890				  char __user *optval, int optlen)
1891{
1892	struct sock *sk = sock->sk;
1893
1894	if (sk->sk_prot->compat_setsockopt != NULL)
1895		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1896						      optval, optlen);
1897	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1898}
1899EXPORT_SYMBOL(compat_sock_common_setsockopt);
1900#endif
1901
1902void sk_common_release(struct sock *sk)
1903{
1904	if (sk->sk_prot->destroy)
1905		sk->sk_prot->destroy(sk);
1906
1907	/*
1908	 * Observation: when sk_common_release is called, processes have
1909	 * no access to the socket, but the net still has.
1910	 * Step one, detach it from networking:
1911	 *
1912	 * A. Remove from hash tables.
1913	 */
1914
1915	sk->sk_prot->unhash(sk);
1916
1917	/*
1918	 * At this point the socket cannot receive new packets, but it is possible
1919	 * that some packets are in flight because some CPU is running the receiver
1920	 * and did the hash table lookup before we unhashed the socket. They will
1921	 * reach the receive queue and will be purged by the socket destructor.
1922	 *
1923	 * Also we still have packets pending on the receive queue and, probably,
1924	 * our own packets waiting in device queues. sock_destroy will drain the
1925	 * receive queue, but transmitted packets will delay socket destruction
1926	 * until the last reference is released.
1927	 */
1928
1929	sock_orphan(sk);
1930
1931	xfrm_sk_free_policy(sk);
1932
1933	sk_refcnt_debug_release(sk);
1934	sock_put(sk);
1935}
1936
1937EXPORT_SYMBOL(sk_common_release);
1938
1939static DEFINE_RWLOCK(proto_list_lock);
1940static LIST_HEAD(proto_list);
1941
1942#ifdef CONFIG_PROC_FS
1943#define PROTO_INUSE_NR	64	/* should be enough for the first time */
1944struct prot_inuse {
1945	int val[PROTO_INUSE_NR];
1946};
1947
1948static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1949
1950#ifdef CONFIG_NET_NS
1951void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1952{
1953	int cpu = smp_processor_id();
1954	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1955}
1956EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1957
1958int sock_prot_inuse_get(struct net *net, struct proto *prot)
1959{
1960	int cpu, idx = prot->inuse_idx;
1961	int res = 0;
1962
1963	for_each_possible_cpu(cpu)
1964		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1965
1966	return res >= 0 ? res : 0;
1967}
1968EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1969
1970static int sock_inuse_init_net(struct net *net)
1971{
1972	net->core.inuse = alloc_percpu(struct prot_inuse);
1973	return net->core.inuse ? 0 : -ENOMEM;
1974}
1975
1976static void sock_inuse_exit_net(struct net *net)
1977{
1978	free_percpu(net->core.inuse);
1979}
1980
1981static struct pernet_operations net_inuse_ops = {
1982	.init = sock_inuse_init_net,
1983	.exit = sock_inuse_exit_net,
1984};
1985
1986static __init int net_inuse_init(void)
1987{
1988	if (register_pernet_subsys(&net_inuse_ops))
1989		panic("Cannot initialize net inuse counters");
1990
1991	return 0;
1992}
1993
1994core_initcall(net_inuse_init);
1995#else
1996static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1997
1998void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1999{
2000	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2001}
2002EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2003
2004int sock_prot_inuse_get(struct net *net, struct proto *prot)
2005{
2006	int cpu, idx = prot->inuse_idx;
2007	int res = 0;
2008
2009	for_each_possible_cpu(cpu)
2010		res += per_cpu(prot_inuse, cpu).val[idx];
2011
2012	return res >= 0 ? res : 0;
2013}
2014EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2015#endif
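
/*
 * Illustrative sketch (hypothetical "foo" protocol): the counters above
 * are meant to be updated from a protocol's ->hash/->unhash handlers,
 * +1 when a socket enters the lookup tables and -1 when it leaves, while
 * bottom halves are disabled (here by the _bh lock) so the per-CPU
 * update cannot be preempted:
 *
 *	static void foo_hash(struct sock *sk)
 *	{
 *		write_lock_bh(&foo_hash_lock);
 *		sk_add_node(sk, &foo_hash_list);
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *		write_unlock_bh(&foo_hash_lock);
 *	}
 *
 *	static void foo_unhash(struct sock *sk)
 *	{
 *		write_lock_bh(&foo_hash_lock);
 *		if (sk_del_node_init(sk))
 *			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *		write_unlock_bh(&foo_hash_lock);
 *	}
 */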
2016
2017static void assign_proto_idx(struct proto *prot)
2018{
2019	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2020
2021	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2022		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2023		return;
2024	}
2025
2026	set_bit(prot->inuse_idx, proto_inuse_idx);
2027}
2028
2029static void release_proto_idx(struct proto *prot)
2030{
2031	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2032		clear_bit(prot->inuse_idx, proto_inuse_idx);
2033}
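
/*
 * Note: find_first_zero_bit() above can return at most PROTO_INUSE_NR - 1,
 * because that last bit is never set.  If the bitmap fills up, each
 * additional protocol is parked on that final index without claiming it,
 * and release_proto_idx() deliberately skips clearing it, so overflowing
 * protocols share one (inaccurate) counter slot rather than corrupting
 * another protocol's statistics.
 */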
2034#else
2035static inline void assign_proto_idx(struct proto *prot)
2036{
2037}
2038
2039static inline void release_proto_idx(struct proto *prot)
2040{
2041}
2042#endif
2043
2044int proto_register(struct proto *prot, int alloc_slab)
2045{
2046	char *request_sock_slab_name = NULL;
2047	char *timewait_sock_slab_name = NULL;
2048
2049	if (alloc_slab) {
2050		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2051					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2052					NULL);
2053
2054		if (prot->slab == NULL) {
2055			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2056			       prot->name);
2057			goto out;
2058		}
2059
2060		if (prot->rsk_prot != NULL) {
2061			static const char mask[] = "request_sock_%s";
2062
2063			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2064			if (request_sock_slab_name == NULL)
2065				goto out_free_sock_slab;
2066
2067			sprintf(request_sock_slab_name, mask, prot->name);
2068			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
2069								 prot->rsk_prot->obj_size, 0,
2070								 SLAB_HWCACHE_ALIGN, NULL);
2071
2072			if (prot->rsk_prot->slab == NULL) {
2073				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2074				       prot->name);
2075				goto out_free_request_sock_slab_name;
2076			}
2077		}
2078
2079		if (prot->twsk_prot != NULL) {
2080			static const char mask[] = "tw_sock_%s";
2081
2082			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2083
2084			if (timewait_sock_slab_name == NULL)
2085				goto out_free_request_sock_slab;
2086
2087			sprintf(timewait_sock_slab_name, mask, prot->name);
2088			prot->twsk_prot->twsk_slab =
2089				kmem_cache_create(timewait_sock_slab_name,
2090						  prot->twsk_prot->twsk_obj_size,
2091						  0,
2092						  SLAB_HWCACHE_ALIGN |
2093							prot->slab_flags,
2094						  NULL);
2095			if (prot->twsk_prot->twsk_slab == NULL)
2096				goto out_free_timewait_sock_slab_name;
2097		}
2098	}
2099
2100	write_lock(&proto_list_lock);
2101	list_add(&prot->node, &proto_list);
2102	assign_proto_idx(prot);
2103	write_unlock(&proto_list_lock);
2104	return 0;
2105
2106out_free_timewait_sock_slab_name:
2107	kfree(timewait_sock_slab_name);
2108out_free_request_sock_slab:
2109	if (prot->rsk_prot && prot->rsk_prot->slab) {
2110		kmem_cache_destroy(prot->rsk_prot->slab);
2111		prot->rsk_prot->slab = NULL;
2112	}
2113out_free_request_sock_slab_name:
2114	kfree(request_sock_slab_name);
2115out_free_sock_slab:
2116	kmem_cache_destroy(prot->slab);
2117	prot->slab = NULL;
2118out:
2119	return -ENOBUFS;
2120}
2121
2122EXPORT_SYMBOL(proto_register);
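
/*
 * Illustrative sketch (hypothetical "foo" protocol module): protocols
 * normally register their struct proto once from module init, asking for
 * a dedicated slab cache for their sockets, and unregister it on exit.
 * "struct foo_sock" is a placeholder; a real struct proto also supplies
 * the method pointers reported in /proc/net/protocols:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */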
2123
2124void proto_unregister(struct proto *prot)
2125{
2126	write_lock(&proto_list_lock);
2127	release_proto_idx(prot);
2128	list_del(&prot->node);
2129	write_unlock(&proto_list_lock);
2130
2131	if (prot->slab != NULL) {
2132		kmem_cache_destroy(prot->slab);
2133		prot->slab = NULL;
2134	}
2135
2136	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2137		const char *name = kmem_cache_name(prot->rsk_prot->slab);
2138
2139		kmem_cache_destroy(prot->rsk_prot->slab);
2140		kfree(name);
2141		prot->rsk_prot->slab = NULL;
2142	}
2143
2144	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2145		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
2146
2147		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2148		kfree(name);
2149		prot->twsk_prot->twsk_slab = NULL;
2150	}
2151}
2152
2153EXPORT_SYMBOL(proto_unregister);
2154
2155#ifdef CONFIG_PROC_FS
2156static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2157	__acquires(proto_list_lock)
2158{
2159	read_lock(&proto_list_lock);
2160	return seq_list_start_head(&proto_list, *pos);
2161}
2162
2163static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2164{
2165	return seq_list_next(v, &proto_list, pos);
2166}
2167
2168static void proto_seq_stop(struct seq_file *seq, void *v)
2169	__releases(proto_list_lock)
2170{
2171	read_unlock(&proto_list_lock);
2172}
2173
2174static char proto_method_implemented(const void *method)
2175{
2176	return method == NULL ? 'n' : 'y';
2177}
2178
2179static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2180{
2181	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2182			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2183		   proto->name,
2184		   proto->obj_size,
2185		   sock_prot_inuse_get(seq_file_net(seq), proto),
2186		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2187		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2188		   proto->max_header,
2189		   proto->slab == NULL ? "no" : "yes",
2190		   module_name(proto->owner),
2191		   proto_method_implemented(proto->close),
2192		   proto_method_implemented(proto->connect),
2193		   proto_method_implemented(proto->disconnect),
2194		   proto_method_implemented(proto->accept),
2195		   proto_method_implemented(proto->ioctl),
2196		   proto_method_implemented(proto->init),
2197		   proto_method_implemented(proto->destroy),
2198		   proto_method_implemented(proto->shutdown),
2199		   proto_method_implemented(proto->setsockopt),
2200		   proto_method_implemented(proto->getsockopt),
2201		   proto_method_implemented(proto->sendmsg),
2202		   proto_method_implemented(proto->recvmsg),
2203		   proto_method_implemented(proto->sendpage),
2204		   proto_method_implemented(proto->bind),
2205		   proto_method_implemented(proto->backlog_rcv),
2206		   proto_method_implemented(proto->hash),
2207		   proto_method_implemented(proto->unhash),
2208		   proto_method_implemented(proto->get_port),
2209		   proto_method_implemented(proto->enter_memory_pressure));
2210}
2211
2212static int proto_seq_show(struct seq_file *seq, void *v)
2213{
2214	if (v == &proto_list)
2215		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2216			   "protocol",
2217			   "size",
2218			   "sockets",
2219			   "memory",
2220			   "press",
2221			   "maxhdr",
2222			   "slab",
2223			   "module",
2224			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2225	else
2226		proto_seq_printf(seq, list_entry(v, struct proto, node));
2227	return 0;
2228}
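
/*
 * Each registered protocol becomes one row of /proc/net/protocols.  The
 * leading columns come straight from struct proto (name, obj_size, the
 * per-net socket count, memory accounting, max_header and slab state),
 * and the trailing nineteen 'y'/'n' columns report, in the order given
 * in the header ("cl co di ... em"), whether the corresponding struct
 * proto method is implemented; proto_method_implemented() above maps a
 * NULL pointer to 'n' and anything else to 'y'.
 */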
2229
2230static const struct seq_operations proto_seq_ops = {
2231	.start  = proto_seq_start,
2232	.next   = proto_seq_next,
2233	.stop   = proto_seq_stop,
2234	.show   = proto_seq_show,
2235};
2236
2237static int proto_seq_open(struct inode *inode, struct file *file)
2238{
2239	return seq_open_net(inode, file, &proto_seq_ops,
2240			    sizeof(struct seq_net_private));
2241}
2242
2243static const struct file_operations proto_seq_fops = {
2244	.owner		= THIS_MODULE,
2245	.open		= proto_seq_open,
2246	.read		= seq_read,
2247	.llseek		= seq_lseek,
2248	.release	= seq_release_net,
2249};
2250
2251static __net_init int proto_init_net(struct net *net)
2252{
2253	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2254		return -ENOMEM;
2255
2256	return 0;
2257}
2258
2259static __net_exit void proto_exit_net(struct net *net)
2260{
2261	proc_net_remove(net, "protocols");
2262}
2263
2264
2265static __net_initdata struct pernet_operations proto_net_ops = {
2266	.init = proto_init_net,
2267	.exit = proto_exit_net,
2268};
2269
2270static int __init proto_init(void)
2271{
2272	return register_pernet_subsys(&proto_net_ops);
2273}
2274
2275subsys_initcall(proto_init);
2276
2277#endif /* PROC_FS */
2278
2279EXPORT_SYMBOL(sk_alloc);
2280EXPORT_SYMBOL(sk_free);
2281EXPORT_SYMBOL(sk_send_sigurg);
2282EXPORT_SYMBOL(sock_alloc_send_skb);
2283EXPORT_SYMBOL(sock_init_data);
2284EXPORT_SYMBOL(sock_kfree_s);
2285EXPORT_SYMBOL(sock_kmalloc);
2286EXPORT_SYMBOL(sock_no_accept);
2287EXPORT_SYMBOL(sock_no_bind);
2288EXPORT_SYMBOL(sock_no_connect);
2289EXPORT_SYMBOL(sock_no_getname);
2290EXPORT_SYMBOL(sock_no_getsockopt);
2291EXPORT_SYMBOL(sock_no_ioctl);
2292EXPORT_SYMBOL(sock_no_listen);
2293EXPORT_SYMBOL(sock_no_mmap);
2294EXPORT_SYMBOL(sock_no_poll);
2295EXPORT_SYMBOL(sock_no_recvmsg);
2296EXPORT_SYMBOL(sock_no_sendmsg);
2297EXPORT_SYMBOL(sock_no_sendpage);
2298EXPORT_SYMBOL(sock_no_setsockopt);
2299EXPORT_SYMBOL(sock_no_shutdown);
2300EXPORT_SYMBOL(sock_no_socketpair);
2301EXPORT_SYMBOL(sock_rfree);
2302EXPORT_SYMBOL(sock_setsockopt);
2303EXPORT_SYMBOL(sock_wfree);
2304EXPORT_SYMBOL(sock_wmalloc);
2305EXPORT_SYMBOL(sock_i_uid);
2306EXPORT_SYMBOL(sock_i_ino);
2307EXPORT_SYMBOL(sysctl_optmem_max);
2308