sock.c revision 87d11ceb9deb7a3f13fdee6e89d9bb6be7d27a71
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/config.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <net/xfrm.h>
124#include <linux/ipsec.h>
125
126#include <linux/filter.h>
127
128#ifdef CONFIG_INET
129#include <net/tcp.h>
130#endif
131
132/* Take into consideration the size of the struct sk_buff overhead in the
133 * determination of these values, since that is non-constant across
134 * platforms.  This makes socket queueing behavior and performance
135 * not depend upon such differences.
136 */
137#define _SK_MEM_PACKETS		256
138#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
139#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
140#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
141
142/* Run time adjustable parameters. */
143__u32 sysctl_wmem_max = SK_WMEM_MAX;
144__u32 sysctl_rmem_max = SK_RMEM_MAX;
145__u32 sysctl_wmem_default = SK_WMEM_MAX;
146__u32 sysctl_rmem_default = SK_RMEM_MAX;
147
148/* Maximal space eaten by iovec or ancillary data plus some space */
149int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
150
151static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
152{
153	struct timeval tv;
154
155	if (optlen < sizeof(tv))
156		return -EINVAL;
157	if (copy_from_user(&tv, optval, sizeof(tv)))
158		return -EFAULT;
159
160	*timeo_p = MAX_SCHEDULE_TIMEOUT;
161	if (tv.tv_sec == 0 && tv.tv_usec == 0)
162		return 0;
163	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
164		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
165	return 0;
166}
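
/*
 * A worked example of the conversion above, assuming HZ == 100 (so one
 * jiffy is 10000 usec): tv = { .tv_sec = 2, .tv_usec = 500000 } yields
 *
 *	timeo = 2 * 100 + (500000 + 9999) / 10000 = 200 + 50 = 250 jiffies
 *
 * i.e. the usec part is rounded up to whole jiffies.  A zero timeval
 * selects MAX_SCHEDULE_TIMEOUT, meaning "wait forever".
 */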
167
168static void sock_warn_obsolete_bsdism(const char *name)
169{
170	static int warned;
171	static char warncomm[TASK_COMM_LEN];
172	if (strcmp(warncomm, current->comm) && warned < 5) {
173		strcpy(warncomm,  current->comm);
174		printk(KERN_WARNING "process `%s' is using obsolete "
175		       "%s SO_BSDCOMPAT\n", warncomm, name);
176		warned++;
177	}
178}
179
180static void sock_disable_timestamp(struct sock *sk)
181{
182	if (sock_flag(sk, SOCK_TIMESTAMP)) {
183		sock_reset_flag(sk, SOCK_TIMESTAMP);
184		net_disable_timestamp();
185	}
186}
187
188
189/*
190 *	This is meant for all protocols to use and covers goings-on
191 *	at the socket level. Everything here is generic.
192 */
193
194int sock_setsockopt(struct socket *sock, int level, int optname,
195		    char __user *optval, int optlen)
196{
197	struct sock *sk=sock->sk;
198	struct sk_filter *filter;
199	int val;
200	int valbool;
201	struct linger ling;
202	int ret = 0;
203
204	/*
205	 *	Options without arguments
206	 */
207
208#ifdef SO_DONTLINGER		/* Compatibility item... */
209	if (optname == SO_DONTLINGER) {
210		lock_sock(sk);
211		sock_reset_flag(sk, SOCK_LINGER);
212		release_sock(sk);
213		return 0;
214	}
215#endif
216
217  	if(optlen<sizeof(int))
218  		return(-EINVAL);
219
220	if (get_user(val, (int __user *)optval))
221		return -EFAULT;
222
223  	valbool = val?1:0;
224
225	lock_sock(sk);
226
227  	switch(optname)
228  	{
229		case SO_DEBUG:
230			if(val && !capable(CAP_NET_ADMIN))
231			{
232				ret = -EACCES;
233			}
234			else if (valbool)
235				sock_set_flag(sk, SOCK_DBG);
236			else
237				sock_reset_flag(sk, SOCK_DBG);
238			break;
239		case SO_REUSEADDR:
240			sk->sk_reuse = valbool;
241			break;
242		case SO_TYPE:
243		case SO_ERROR:
244			ret = -ENOPROTOOPT;
245		  	break;
246		case SO_DONTROUTE:
247			if (valbool)
248				sock_set_flag(sk, SOCK_LOCALROUTE);
249			else
250				sock_reset_flag(sk, SOCK_LOCALROUTE);
251			break;
252		case SO_BROADCAST:
253			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
254			break;
255		case SO_SNDBUF:
256			/* Don't error on this; BSD doesn't, and if you think
257			   about it this is right. Otherwise apps have to
258			   play 'guess the biggest size' games. RCVBUF/SNDBUF
259			   are treated in BSD as hints */
260
261			if (val > sysctl_wmem_max)
262				val = sysctl_wmem_max;
263set_sndbuf:
264			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
265			if ((val * 2) < SOCK_MIN_SNDBUF)
266				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
267			else
268				sk->sk_sndbuf = val * 2;
269
270			/*
271			 *	Wake up sending tasks if we
272			 *	upped the value.
273			 */
274			sk->sk_write_space(sk);
275			break;
276
277		case SO_SNDBUFFORCE:
278			if (!capable(CAP_NET_ADMIN)) {
279				ret = -EPERM;
280				break;
281			}
282			goto set_sndbuf;
283
284		case SO_RCVBUF:
285			/* Don't error on this; BSD doesn't, and if you think
286			   about it this is right. Otherwise apps have to
287			   play 'guess the biggest size' games. RCVBUF/SNDBUF
288			   are treated in BSD as hints */
289
290			if (val > sysctl_rmem_max)
291				val = sysctl_rmem_max;
292set_rcvbuf:
293			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
294			/* FIXME: is this lower bound the right one? */
295			if ((val * 2) < SOCK_MIN_RCVBUF)
296				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
297			else
298				sk->sk_rcvbuf = val * 2;
299			break;
300
301		case SO_RCVBUFFORCE:
302			if (!capable(CAP_NET_ADMIN)) {
303				ret = -EPERM;
304				break;
305			}
306			goto set_rcvbuf;
307
308		case SO_KEEPALIVE:
309#ifdef CONFIG_INET
310			if (sk->sk_protocol == IPPROTO_TCP)
311				tcp_set_keepalive(sk, valbool);
312#endif
313			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
314			break;
315
316	 	case SO_OOBINLINE:
317			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
318			break;
319
320	 	case SO_NO_CHECK:
321			sk->sk_no_check = valbool;
322			break;
323
324		case SO_PRIORITY:
325			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
326				sk->sk_priority = val;
327			else
328				ret = -EPERM;
329			break;
330
331		case SO_LINGER:
332			if(optlen<sizeof(ling)) {
333				ret = -EINVAL;	/* 1003.1g */
334				break;
335			}
336			if (copy_from_user(&ling,optval,sizeof(ling))) {
337				ret = -EFAULT;
338				break;
339			}
340			if (!ling.l_onoff)
341				sock_reset_flag(sk, SOCK_LINGER);
342			else {
343#if (BITS_PER_LONG == 32)
344				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
345					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
346				else
347#endif
348					sk->sk_lingertime = ling.l_linger * HZ;
349				sock_set_flag(sk, SOCK_LINGER);
350			}
351			break;
352
353		case SO_BSDCOMPAT:
354			sock_warn_obsolete_bsdism("setsockopt");
355			break;
356
357		case SO_PASSCRED:
358			if (valbool)
359				set_bit(SOCK_PASSCRED, &sock->flags);
360			else
361				clear_bit(SOCK_PASSCRED, &sock->flags);
362			break;
363
364		case SO_TIMESTAMP:
365			if (valbool)  {
366				sock_set_flag(sk, SOCK_RCVTSTAMP);
367				sock_enable_timestamp(sk);
368			} else
369				sock_reset_flag(sk, SOCK_RCVTSTAMP);
370			break;
371
372		case SO_RCVLOWAT:
373			if (val < 0)
374				val = INT_MAX;
375			sk->sk_rcvlowat = val ? : 1;
376			break;
377
378		case SO_RCVTIMEO:
379			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
380			break;
381
382		case SO_SNDTIMEO:
383			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
384			break;
385
386#ifdef CONFIG_NETDEVICES
387		case SO_BINDTODEVICE:
388		{
389			char devname[IFNAMSIZ];
390
391			/* Sorry... */
392			if (!capable(CAP_NET_RAW)) {
393				ret = -EPERM;
394				break;
395			}
396
397			/* Bind this socket to a particular device like "eth0",
398			 * as specified in the passed interface name. If the
399			 * name is "" or the option length is zero the socket
400			 * is not bound.
401			 */
402
403			if (!valbool) {
404				sk->sk_bound_dev_if = 0;
405			} else {
406				if (optlen > IFNAMSIZ)
407					optlen = IFNAMSIZ;
408				if (copy_from_user(devname, optval, optlen)) {
409					ret = -EFAULT;
410					break;
411				}
412
413				/* Remove any cached route for this socket. */
414				sk_dst_reset(sk);
415
416				if (devname[0] == '\0') {
417					sk->sk_bound_dev_if = 0;
418				} else {
419					struct net_device *dev = dev_get_by_name(devname);
420					if (!dev) {
421						ret = -ENODEV;
422						break;
423					}
424					sk->sk_bound_dev_if = dev->ifindex;
425					dev_put(dev);
426				}
427			}
428			break;
429		}
430#endif
431
432
433		case SO_ATTACH_FILTER:
434			ret = -EINVAL;
435			if (optlen == sizeof(struct sock_fprog)) {
436				struct sock_fprog fprog;
437
438				ret = -EFAULT;
439				if (copy_from_user(&fprog, optval, sizeof(fprog)))
440					break;
441
442				ret = sk_attach_filter(&fprog, sk);
443			}
444			break;
445
446		case SO_DETACH_FILTER:
447			spin_lock_bh(&sk->sk_lock.slock);
448			filter = sk->sk_filter;
449			if (filter) {
450				sk->sk_filter = NULL;
451				spin_unlock_bh(&sk->sk_lock.slock);
452				sk_filter_release(sk, filter);
453				break;
454			}
455			spin_unlock_bh(&sk->sk_lock.slock);
456			ret = -ENONET;
457			break;
458
459		/* We implement SO_SNDLOWAT etc. as not
460		   settable (1003.1g 5.3) */
461		default:
462		  	ret = -ENOPROTOOPT;
463			break;
464  	}
465	release_sock(sk);
466	return ret;
467}
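
/*
 * Illustrative user-space use of the handlers above (a minimal sketch;
 * error checking omitted).  Note that for SO_SNDBUF/SO_RCVBUF the kernel
 * clamps the request to sysctl_wmem_max/sysctl_rmem_max and then stores
 * twice the value:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	int val = 65536;
 *	struct linger ling = { 1, 5 };		// linger up to 5 seconds
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */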
468
469
470int sock_getsockopt(struct socket *sock, int level, int optname,
471		    char __user *optval, int __user *optlen)
472{
473	struct sock *sk = sock->sk;
474
475	union
476	{
477  		int val;
478  		struct linger ling;
479		struct timeval tm;
480	} v;
481
482	unsigned int lv = sizeof(int);
483	int len;
484
485  	if(get_user(len,optlen))
486  		return -EFAULT;
487	if(len < 0)
488		return -EINVAL;
489
490  	switch(optname)
491  	{
492		case SO_DEBUG:
493			v.val = sock_flag(sk, SOCK_DBG);
494			break;
495
496		case SO_DONTROUTE:
497			v.val = sock_flag(sk, SOCK_LOCALROUTE);
498			break;
499
500		case SO_BROADCAST:
501			v.val = !!sock_flag(sk, SOCK_BROADCAST);
502			break;
503
504		case SO_SNDBUF:
505			v.val = sk->sk_sndbuf;
506			break;
507
508		case SO_RCVBUF:
509			v.val = sk->sk_rcvbuf;
510			break;
511
512		case SO_REUSEADDR:
513			v.val = sk->sk_reuse;
514			break;
515
516		case SO_KEEPALIVE:
517			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
518			break;
519
520		case SO_TYPE:
521			v.val = sk->sk_type;
522			break;
523
524		case SO_ERROR:
525			v.val = -sock_error(sk);
526			if(v.val==0)
527				v.val = xchg(&sk->sk_err_soft, 0);
528			break;
529
530		case SO_OOBINLINE:
531			v.val = !!sock_flag(sk, SOCK_URGINLINE);
532			break;
533
534		case SO_NO_CHECK:
535			v.val = sk->sk_no_check;
536			break;
537
538		case SO_PRIORITY:
539			v.val = sk->sk_priority;
540			break;
541
542		case SO_LINGER:
543			lv		= sizeof(v.ling);
544			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
545 			v.ling.l_linger	= sk->sk_lingertime / HZ;
546			break;
547
548		case SO_BSDCOMPAT:
549			sock_warn_obsolete_bsdism("getsockopt");
550			break;
551
552		case SO_TIMESTAMP:
553			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
554			break;
555
556		case SO_RCVTIMEO:
557			lv=sizeof(struct timeval);
558			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
559				v.tm.tv_sec = 0;
560				v.tm.tv_usec = 0;
561			} else {
562				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
563				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
564			}
565			break;
566
567		case SO_SNDTIMEO:
568			lv=sizeof(struct timeval);
569			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
570				v.tm.tv_sec = 0;
571				v.tm.tv_usec = 0;
572			} else {
573				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
574				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
575			}
576			break;
577
578		case SO_RCVLOWAT:
579			v.val = sk->sk_rcvlowat;
580			break;
581
582		case SO_SNDLOWAT:
583			v.val=1;
584			break;
585
586		case SO_PASSCRED:
587			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
588			break;
589
590		case SO_PEERCRED:
591			if (len > sizeof(sk->sk_peercred))
592				len = sizeof(sk->sk_peercred);
593			if (copy_to_user(optval, &sk->sk_peercred, len))
594				return -EFAULT;
595			goto lenout;
596
597		case SO_PEERNAME:
598		{
599			char address[128];
600
601			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
602				return -ENOTCONN;
603			if (lv < len)
604				return -EINVAL;
605			if (copy_to_user(optval, address, len))
606				return -EFAULT;
607			goto lenout;
608		}
609
610		/* Dubious BSD thing... Probably nobody even uses it, but
611		 * the UNIX standard wants it for whatever reason... -DaveM
612		 */
613		case SO_ACCEPTCONN:
614			v.val = sk->sk_state == TCP_LISTEN;
615			break;
616
617		case SO_PEERSEC:
618			return security_socket_getpeersec(sock, optval, optlen, len);
619
620		default:
621			return(-ENOPROTOOPT);
622	}
623	if (len > lv)
624		len = lv;
625	if (copy_to_user(optval, &v, len))
626		return -EFAULT;
627lenout:
628  	if (put_user(len, optlen))
629  		return -EFAULT;
630  	return 0;
631}
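
/*
 * The SO_ERROR convention implemented above, seen from user space (a
 * sketch): after a non-blocking connect() completes (e.g. poll() reports
 * writability), the pending error is fetched and cleared in one call:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	// err is a positive errno such as ECONNREFUSED, or 0 on success
 */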
632
633/**
634 *	sk_alloc - All socket objects are allocated here
635 *	@family: protocol family
636 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
637 *	@prot: struct proto associated with this new sock instance
638 *	@zero_it: if we should zero the newly allocated sock
639 */
640struct sock *sk_alloc(int family, unsigned int __nocast priority,
641		      struct proto *prot, int zero_it)
642{
643	struct sock *sk = NULL;
644	kmem_cache_t *slab = prot->slab;
645
646	if (slab != NULL)
647		sk = kmem_cache_alloc(slab, priority);
648	else
649		sk = kmalloc(prot->obj_size, priority);
650
651	if (sk) {
652		if (zero_it) {
653			memset(sk, 0, prot->obj_size);
654			sk->sk_family = family;
655			/*
656			 * See comment in struct sock definition to understand
657			 * why we need sk_prot_creator -acme
658			 */
659			sk->sk_prot = sk->sk_prot_creator = prot;
660			sock_lock_init(sk);
661		}
662
663		if (security_sk_alloc(sk, family, priority)) {
664			if (slab != NULL)
665				kmem_cache_free(slab, sk);
666			else
667				kfree(sk);
668			sk = NULL;
669		} else
670			__module_get(prot->owner);
671	}
672	return sk;
673}
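
/*
 * A hypothetical allocation path for a protocol (my_proto and my_sock
 * are placeholder names, not part of this file; sock is the accompanying
 * struct socket):
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);	// wire sk up to its struct socket
 *
 * With zero_it set, the new sock is zeroed and its family, prot and
 * lock are initialized; my_proto.obj_size must cover struct my_sock.
 */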
674
675void sk_free(struct sock *sk)
676{
677	struct sk_filter *filter;
678	struct module *owner = sk->sk_prot_creator->owner;
679
680	if (sk->sk_destruct)
681		sk->sk_destruct(sk);
682
683	filter = sk->sk_filter;
684	if (filter) {
685		sk_filter_release(sk, filter);
686		sk->sk_filter = NULL;
687	}
688
689	sock_disable_timestamp(sk);
690
691	if (atomic_read(&sk->sk_omem_alloc))
692		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
693		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
694
695	security_sk_free(sk);
696	if (sk->sk_prot_creator->slab != NULL)
697		kmem_cache_free(sk->sk_prot_creator->slab, sk);
698	else
699		kfree(sk);
700	module_put(owner);
701}
702
703struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
704{
705	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
706
707	if (newsk != NULL) {
708		struct sk_filter *filter;
709
710		memcpy(newsk, sk, sk->sk_prot->obj_size);
711
712		/* SANITY */
713		sk_node_init(&newsk->sk_node);
714		sock_lock_init(newsk);
715		bh_lock_sock(newsk);
716
717		atomic_set(&newsk->sk_rmem_alloc, 0);
718		atomic_set(&newsk->sk_wmem_alloc, 0);
719		atomic_set(&newsk->sk_omem_alloc, 0);
720		skb_queue_head_init(&newsk->sk_receive_queue);
721		skb_queue_head_init(&newsk->sk_write_queue);
722
723		rwlock_init(&newsk->sk_dst_lock);
724		rwlock_init(&newsk->sk_callback_lock);
725
726		newsk->sk_dst_cache	= NULL;
727		newsk->sk_wmem_queued	= 0;
728		newsk->sk_forward_alloc = 0;
729		newsk->sk_send_head	= NULL;
730		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
731		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
732
733		sock_reset_flag(newsk, SOCK_DONE);
734		skb_queue_head_init(&newsk->sk_error_queue);
735
736		filter = newsk->sk_filter;
737		if (filter != NULL)
738			sk_filter_charge(newsk, filter);
739
740		if (unlikely(xfrm_sk_clone_policy(newsk))) {
741			/* It is still a raw copy of the parent, so invalidate
742			 * the destructor and do a plain sk_free() */
743			newsk->sk_destruct = NULL;
744			sk_free(newsk);
745			newsk = NULL;
746			goto out;
747		}
748
749		newsk->sk_err	   = 0;
750		newsk->sk_priority = 0;
751		atomic_set(&newsk->sk_refcnt, 2);
752
753		/*
754		 * Increment the counter in the same struct proto as the master
755		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
756		 * is the same as sk->sk_prot->socks, as this field was copied
757		 * with memcpy).
758		 *
759		 * This _changes_ the previous behaviour, where
760		 * tcp_create_openreq_child was always incrementing the
761		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
762		 * to be taken into account in all callers. -acme
763		 */
764		sk_refcnt_debug_inc(newsk);
765		newsk->sk_socket = NULL;
766		newsk->sk_sleep	 = NULL;
767
768		if (newsk->sk_prot->sockets_allocated)
769			atomic_inc(newsk->sk_prot->sockets_allocated);
770	}
771out:
772	return newsk;
773}
774
775EXPORT_SYMBOL_GPL(sk_clone);
776
777void __init sk_init(void)
778{
779	if (num_physpages <= 4096) {
780		sysctl_wmem_max = 32767;
781		sysctl_rmem_max = 32767;
782		sysctl_wmem_default = 32767;
783		sysctl_rmem_default = 32767;
784	} else if (num_physpages >= 131072) {
785		sysctl_wmem_max = 131071;
786		sysctl_rmem_max = 131071;
787	}
788}
789
790/*
791 *	Simple resource managers for sockets.
792 */
793
794
795/*
796 * Write buffer destructor automatically called from kfree_skb.
797 */
798void sock_wfree(struct sk_buff *skb)
799{
800	struct sock *sk = skb->sk;
801
802	/* In case it might be waiting for more memory. */
803	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
804	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
805		sk->sk_write_space(sk);
806	sock_put(sk);
807}
808
809/*
810 * Read buffer destructor automatically called from kfree_skb.
811 */
812void sock_rfree(struct sk_buff *skb)
813{
814	struct sock *sk = skb->sk;
815
816	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
817}
818
819
820int sock_i_uid(struct sock *sk)
821{
822	int uid;
823
824	read_lock(&sk->sk_callback_lock);
825	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
826	read_unlock(&sk->sk_callback_lock);
827	return uid;
828}
829
830unsigned long sock_i_ino(struct sock *sk)
831{
832	unsigned long ino;
833
834	read_lock(&sk->sk_callback_lock);
835	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
836	read_unlock(&sk->sk_callback_lock);
837	return ino;
838}
839
840/*
841 * Allocate a skb from the socket's send buffer.
842 */
843struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
844			     unsigned int __nocast priority)
845{
846	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
847		struct sk_buff * skb = alloc_skb(size, priority);
848		if (skb) {
849			skb_set_owner_w(skb, sk);
850			return skb;
851		}
852	}
853	return NULL;
854}
855
856/*
857 * Allocate a skb from the socket's receive buffer.
858 */
859struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
860			     unsigned int __nocast priority)
861{
862	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
863		struct sk_buff *skb = alloc_skb(size, priority);
864		if (skb) {
865			skb_set_owner_r(skb, sk);
866			return skb;
867		}
868	}
869	return NULL;
870}
871
872/*
873 * Allocate a memory block from the socket's option memory buffer.
874 */
875void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
876{
877	if ((unsigned)size <= sysctl_optmem_max &&
878	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
879		void *mem;
880		/* First do the add, to avoid the race if kmalloc
881 		 * might sleep.
882		 */
883		atomic_add(size, &sk->sk_omem_alloc);
884		mem = kmalloc(size, priority);
885		if (mem)
886			return mem;
887		atomic_sub(size, &sk->sk_omem_alloc);
888	}
889	return NULL;
890}
891
892/*
893 * Free an option memory block.
894 */
895void sock_kfree_s(struct sock *sk, void *mem, int size)
896{
897	kfree(mem);
898	atomic_sub(size, &sk->sk_omem_alloc);
899}
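
/*
 * sock_kmalloc()/sock_kfree_s() must be used as a pair with the same
 * size, so that sk->sk_omem_alloc balances back to zero (sk_free()
 * above reports any leak).  A minimal sketch:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	// ... use opt ...
 *	sock_kfree_s(sk, opt, optlen);
 */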
900
901/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
902   I think these locks should be removed for datagram sockets.
903 */
904static long sock_wait_for_wmem(struct sock * sk, long timeo)
905{
906	DEFINE_WAIT(wait);
907
908	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
909	for (;;) {
910		if (!timeo)
911			break;
912		if (signal_pending(current))
913			break;
914		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
915		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
916		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
917			break;
918		if (sk->sk_shutdown & SEND_SHUTDOWN)
919			break;
920		if (sk->sk_err)
921			break;
922		timeo = schedule_timeout(timeo);
923	}
924	finish_wait(sk->sk_sleep, &wait);
925	return timeo;
926}
927
928
929/*
930 *	Generic send/receive buffer handlers
931 */
932
933static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
934					    unsigned long header_len,
935					    unsigned long data_len,
936					    int noblock, int *errcode)
937{
938	struct sk_buff *skb;
939	unsigned int gfp_mask;
940	long timeo;
941	int err;
942
943	gfp_mask = sk->sk_allocation;
944	if (gfp_mask & __GFP_WAIT)
945		gfp_mask |= __GFP_REPEAT;
946
947	timeo = sock_sndtimeo(sk, noblock);
948	while (1) {
949		err = sock_error(sk);
950		if (err != 0)
951			goto failure;
952
953		err = -EPIPE;
954		if (sk->sk_shutdown & SEND_SHUTDOWN)
955			goto failure;
956
957		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
958			skb = alloc_skb(header_len, sk->sk_allocation);
959			if (skb) {
960				int npages;
961				int i;
962
963				/* No pages, we're done... */
964				if (!data_len)
965					break;
966
967				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
968				skb->truesize += data_len;
969				skb_shinfo(skb)->nr_frags = npages;
970				for (i = 0; i < npages; i++) {
971					struct page *page;
972					skb_frag_t *frag;
973
974					page = alloc_pages(sk->sk_allocation, 0);
975					if (!page) {
976						err = -ENOBUFS;
977						skb_shinfo(skb)->nr_frags = i;
978						kfree_skb(skb);
979						goto failure;
980					}
981
982					frag = &skb_shinfo(skb)->frags[i];
983					frag->page = page;
984					frag->page_offset = 0;
985					frag->size = (data_len >= PAGE_SIZE ?
986						      PAGE_SIZE :
987						      data_len);
988					data_len -= PAGE_SIZE;
989				}
990
991				/* Full success... */
992				break;
993			}
994			err = -ENOBUFS;
995			goto failure;
996		}
997		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
998		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
999		err = -EAGAIN;
1000		if (!timeo)
1001			goto failure;
1002		if (signal_pending(current))
1003			goto interrupted;
1004		timeo = sock_wait_for_wmem(sk, timeo);
1005	}
1006
1007	skb_set_owner_w(skb, sk);
1008	return skb;
1009
1010interrupted:
1011	err = sock_intr_errno(timeo);
1012failure:
1013	*errcode = err;
1014	return NULL;
1015}
1016
1017struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1018				    int noblock, int *errcode)
1019{
1020	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1021}
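
/*
 * Typical datagram-style use of sock_alloc_send_skb() (a sketch):
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT,
 *				  &err);
 *	if (skb == NULL)
 *		return err;	// -EAGAIN, -EPIPE, sk_err or -ERESTARTSYS
 *
 * On success the skb is charged to sk_wmem_alloc via skb_set_owner_w().
 */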
1022
1023static void __lock_sock(struct sock *sk)
1024{
1025	DEFINE_WAIT(wait);
1026
1027	for(;;) {
1028		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1029					TASK_UNINTERRUPTIBLE);
1030		spin_unlock_bh(&sk->sk_lock.slock);
1031		schedule();
1032		spin_lock_bh(&sk->sk_lock.slock);
1033		if(!sock_owned_by_user(sk))
1034			break;
1035	}
1036	finish_wait(&sk->sk_lock.wq, &wait);
1037}
1038
1039static void __release_sock(struct sock *sk)
1040{
1041	struct sk_buff *skb = sk->sk_backlog.head;
1042
1043	do {
1044		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1045		bh_unlock_sock(sk);
1046
1047		do {
1048			struct sk_buff *next = skb->next;
1049
1050			skb->next = NULL;
1051			sk->sk_backlog_rcv(sk, skb);
1052
1053			/*
1054			 * We are in process context here with softirqs
1055			 * disabled, use cond_resched_softirq() to preempt.
1056			 * This is safe to do because we've taken the backlog
1057			 * queue private:
1058			 */
1059			cond_resched_softirq();
1060
1061			skb = next;
1062		} while (skb != NULL);
1063
1064		bh_lock_sock(sk);
1065	} while((skb = sk->sk_backlog.head) != NULL);
1066}
1067
1068/**
1069 * sk_wait_data - wait for data to arrive at sk_receive_queue
1070 * @sk:    sock to wait on
1071 * @timeo: for how long
1072 *
1073 * Now socket state including sk->sk_err is changed only under lock,
1074 * hence we may omit checks after joining wait queue.
1075 * We check receive queue before schedule() only as optimization;
1076 * it is very likely that release_sock() added new data.
1077 */
1078int sk_wait_data(struct sock *sk, long *timeo)
1079{
1080	int rc;
1081	DEFINE_WAIT(wait);
1082
1083	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1084	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1085	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1086	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1087	finish_wait(sk->sk_sleep, &wait);
1088	return rc;
1089}
1090
1091EXPORT_SYMBOL(sk_wait_data);
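
/*
 * Simplified sketch of the receive-side pattern sk_wait_data() supports
 * (callers must hold the socket lock; sk_wait_event drops and retakes
 * it around the schedule):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 *	release_sock(sk);
 */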
1092
1093/*
1094 * Set of default routines for initialising struct proto_ops when
1095 * the protocol does not support a particular function. In certain
1096 * cases where it makes no sense for a protocol to have a "do nothing"
1097 * function, some default processing is provided.
1098 */
1099
1100int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1101{
1102	return -EOPNOTSUPP;
1103}
1104
1105int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1106		    int len, int flags)
1107{
1108	return -EOPNOTSUPP;
1109}
1110
1111int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1112{
1113	return -EOPNOTSUPP;
1114}
1115
1116int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1117{
1118	return -EOPNOTSUPP;
1119}
1120
1121int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1122		    int *len, int peer)
1123{
1124	return -EOPNOTSUPP;
1125}
1126
1127unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1128{
1129	return 0;
1130}
1131
1132int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1133{
1134	return -EOPNOTSUPP;
1135}
1136
1137int sock_no_listen(struct socket *sock, int backlog)
1138{
1139	return -EOPNOTSUPP;
1140}
1141
1142int sock_no_shutdown(struct socket *sock, int how)
1143{
1144	return -EOPNOTSUPP;
1145}
1146
1147int sock_no_setsockopt(struct socket *sock, int level, int optname,
1148		    char __user *optval, int optlen)
1149{
1150	return -EOPNOTSUPP;
1151}
1152
1153int sock_no_getsockopt(struct socket *sock, int level, int optname,
1154		    char __user *optval, int __user *optlen)
1155{
1156	return -EOPNOTSUPP;
1157}
1158
1159int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1160		    size_t len)
1161{
1162	return -EOPNOTSUPP;
1163}
1164
1165int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1166		    size_t len, int flags)
1167{
1168	return -EOPNOTSUPP;
1169}
1170
1171int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1172{
1173	/* Mirror missing mmap method error code */
1174	return -ENODEV;
1175}
1176
1177ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1178{
1179	ssize_t res;
1180	struct msghdr msg = {.msg_flags = flags};
1181	struct kvec iov;
1182	char *kaddr = kmap(page);
1183	iov.iov_base = kaddr + offset;
1184	iov.iov_len = size;
1185	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1186	kunmap(page);
1187	return res;
1188}
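
/*
 * The sock_no_*() stubs above exist so protocol families can fill the
 * struct proto_ops slots they do not support, e.g. (hypothetical):
 *
 *	static struct proto_ops my_ops = {
 *		.family	    = PF_INET,
 *		// ... real handlers ...
 *		.socketpair = sock_no_socketpair,
 *		.mmap	    = sock_no_mmap,
 *	};
 */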
1189
1190/*
1191 *	Default Socket Callbacks
1192 */
1193
1194static void sock_def_wakeup(struct sock *sk)
1195{
1196	read_lock(&sk->sk_callback_lock);
1197	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1198		wake_up_interruptible_all(sk->sk_sleep);
1199	read_unlock(&sk->sk_callback_lock);
1200}
1201
1202static void sock_def_error_report(struct sock *sk)
1203{
1204	read_lock(&sk->sk_callback_lock);
1205	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1206		wake_up_interruptible(sk->sk_sleep);
1207	sk_wake_async(sk,0,POLL_ERR);
1208	read_unlock(&sk->sk_callback_lock);
1209}
1210
1211static void sock_def_readable(struct sock *sk, int len)
1212{
1213	read_lock(&sk->sk_callback_lock);
1214	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1215		wake_up_interruptible(sk->sk_sleep);
1216	sk_wake_async(sk,1,POLL_IN);
1217	read_unlock(&sk->sk_callback_lock);
1218}
1219
1220static void sock_def_write_space(struct sock *sk)
1221{
1222	read_lock(&sk->sk_callback_lock);
1223
1224	/* Do not wake up a writer until he can make "significant"
1225	 * progress.  --DaveM
1226	 */
1227	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1228		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1229			wake_up_interruptible(sk->sk_sleep);
1230
1231		/* Should agree with poll, otherwise some programs break */
1232		if (sock_writeable(sk))
1233			sk_wake_async(sk, 2, POLL_OUT);
1234	}
1235
1236	read_unlock(&sk->sk_callback_lock);
1237}
1238
1239static void sock_def_destruct(struct sock *sk)
1240{
1241	if (sk->sk_protinfo)
1242		kfree(sk->sk_protinfo);
1243}
1244
1245void sk_send_sigurg(struct sock *sk)
1246{
1247	if (sk->sk_socket && sk->sk_socket->file)
1248		if (send_sigurg(&sk->sk_socket->file->f_owner))
1249			sk_wake_async(sk, 3, POLL_PRI);
1250}
1251
1252void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1253		    unsigned long expires)
1254{
1255	if (!mod_timer(timer, expires))
1256		sock_hold(sk);
1257}
1258
1259EXPORT_SYMBOL(sk_reset_timer);
1260
1261void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1262{
1263	if (timer_pending(timer) && del_timer(timer))
1264		__sock_put(sk);
1265}
1266
1267EXPORT_SYMBOL(sk_stop_timer);
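
/*
 * These two helpers keep the sock's refcount in step with its timer:
 * sk_reset_timer() takes a reference only when the timer was not
 * already pending (mod_timer() returned 0), and sk_stop_timer() drops
 * it only when a pending timer was actually deleted.  The handler is
 * expected to release its reference when it fires; a hypothetical
 * example:
 *
 *	static void my_timer_handler(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *		// ... timer work ...
 *		sock_put(sk);	// pairs with sk_reset_timer()
 *	}
 */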
1268
1269void sock_init_data(struct socket *sock, struct sock *sk)
1270{
1271	skb_queue_head_init(&sk->sk_receive_queue);
1272	skb_queue_head_init(&sk->sk_write_queue);
1273	skb_queue_head_init(&sk->sk_error_queue);
1274
1275	sk->sk_send_head	=	NULL;
1276
1277	init_timer(&sk->sk_timer);
1278
1279	sk->sk_allocation	=	GFP_KERNEL;
1280	sk->sk_rcvbuf		=	sysctl_rmem_default;
1281	sk->sk_sndbuf		=	sysctl_wmem_default;
1282	sk->sk_state		=	TCP_CLOSE;
1283	sk->sk_socket		=	sock;
1284
1285	sock_set_flag(sk, SOCK_ZAPPED);
1286
1287	if(sock)
1288	{
1289		sk->sk_type	=	sock->type;
1290		sk->sk_sleep	=	&sock->wait;
1291		sock->sk	=	sk;
1292	} else
1293		sk->sk_sleep	=	NULL;
1294
1295	rwlock_init(&sk->sk_dst_lock);
1296	rwlock_init(&sk->sk_callback_lock);
1297
1298	sk->sk_state_change	=	sock_def_wakeup;
1299	sk->sk_data_ready	=	sock_def_readable;
1300	sk->sk_write_space	=	sock_def_write_space;
1301	sk->sk_error_report	=	sock_def_error_report;
1302	sk->sk_destruct		=	sock_def_destruct;
1303
1304	sk->sk_sndmsg_page	=	NULL;
1305	sk->sk_sndmsg_off	=	0;
1306
1307	sk->sk_peercred.pid 	=	0;
1308	sk->sk_peercred.uid	=	-1;
1309	sk->sk_peercred.gid	=	-1;
1310	sk->sk_write_pending	=	0;
1311	sk->sk_rcvlowat		=	1;
1312	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1313	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1314
1315	sk->sk_stamp.tv_sec     = -1L;
1316	sk->sk_stamp.tv_usec    = -1L;
1317
1318	atomic_set(&sk->sk_refcnt, 1);
1319}
1320
1321void fastcall lock_sock(struct sock *sk)
1322{
1323	might_sleep();
1324	spin_lock_bh(&(sk->sk_lock.slock));
1325	if (sk->sk_lock.owner)
1326		__lock_sock(sk);
1327	sk->sk_lock.owner = (void *)1;
1328	spin_unlock_bh(&(sk->sk_lock.slock));
1329}
1330
1331EXPORT_SYMBOL(lock_sock);
1332
1333void fastcall release_sock(struct sock *sk)
1334{
1335	spin_lock_bh(&(sk->sk_lock.slock));
1336	if (sk->sk_backlog.tail)
1337		__release_sock(sk);
1338	sk->sk_lock.owner = NULL;
1339	if (waitqueue_active(&(sk->sk_lock.wq)))
1340		wake_up(&(sk->sk_lock.wq));
1341	spin_unlock_bh(&(sk->sk_lock.slock));
1342}
1343EXPORT_SYMBOL(release_sock);
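
/*
 * Together these implement the socket-lock pattern relied on throughout
 * the stack: process context brackets its work with
 *
 *	lock_sock(sk);
 *	// ... modify socket state; softirq input is diverted to
 *	// sk->sk_backlog while we own the lock ...
 *	release_sock(sk);
 *
 * and release_sock() replays the backlog through sk_backlog_rcv (see
 * __release_sock() above) before waking other lockers.
 */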
1344
1345int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1346{
1347	if (!sock_flag(sk, SOCK_TIMESTAMP))
1348		sock_enable_timestamp(sk);
1349	if (sk->sk_stamp.tv_sec == -1)
1350		return -ENOENT;
1351	if (sk->sk_stamp.tv_sec == 0)
1352		do_gettimeofday(&sk->sk_stamp);
1353	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1354		-EFAULT : 0;
1355}
1356EXPORT_SYMBOL(sock_get_timestamp);
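
/*
 * sock_get_timestamp() backs the SIOCGSTAMP ioctl; user space asks for
 * the receive timestamp of the last packet like this (a sketch):
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("%ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */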
1357
1358void sock_enable_timestamp(struct sock *sk)
1359{
1360	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1361		sock_set_flag(sk, SOCK_TIMESTAMP);
1362		net_enable_timestamp();
1363	}
1364}
1365EXPORT_SYMBOL(sock_enable_timestamp);
1366
1367/*
1368 *	Get a socket option on a socket.
1369 *
1370 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1371 *	asynchronous errors should be reported by getsockopt. We assume
1372 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1373 */
1374int sock_common_getsockopt(struct socket *sock, int level, int optname,
1375			   char __user *optval, int __user *optlen)
1376{
1377	struct sock *sk = sock->sk;
1378
1379	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1380}
1381
1382EXPORT_SYMBOL(sock_common_getsockopt);
1383
1384int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1385			struct msghdr *msg, size_t size, int flags)
1386{
1387	struct sock *sk = sock->sk;
1388	int addr_len = 0;
1389	int err;
1390
1391	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1392				   flags & ~MSG_DONTWAIT, &addr_len);
1393	if (err >= 0)
1394		msg->msg_namelen = addr_len;
1395	return err;
1396}
1397
1398EXPORT_SYMBOL(sock_common_recvmsg);
1399
1400/*
1401 *	Set socket options on an inet socket.
1402 */
1403int sock_common_setsockopt(struct socket *sock, int level, int optname,
1404			   char __user *optval, int optlen)
1405{
1406	struct sock *sk = sock->sk;
1407
1408	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1409}
1410
1411EXPORT_SYMBOL(sock_common_setsockopt);
1412
1413void sk_common_release(struct sock *sk)
1414{
1415	if (sk->sk_prot->destroy)
1416		sk->sk_prot->destroy(sk);
1417
1418	/*
1419	 * Observation: when sk_common_release is called, processes have
1420	 * no access to the socket, but the network stack still does.
1421	 * Step one, detach it from networking:
1422	 *
1423	 * A. Remove it from the hash tables.
1424	 */
1425
1426	sk->sk_prot->unhash(sk);
1427
1428	/*
1429	 * At this point the socket cannot receive new packets, but it is
1430	 * possible that some packets are in flight because some CPU ran the
1431	 * receiver and did its hash table lookup before we unhashed the socket.
1432	 * They will reach the receive queue and be purged by the socket destructor.
1433	 *
1434	 * Also, we still have packets pending on the receive queue and probably
1435	 * our own packets waiting in device queues. sock_destroy will drain the
1436	 * receive queue, but transmitted packets will delay socket destruction
1437	 * until the last reference is released.
1438	 */
1439
1440	sock_orphan(sk);
1441
1442	xfrm_sk_free_policy(sk);
1443
1444	sk_refcnt_debug_release(sk);
1445	sock_put(sk);
1446}
1447
1448EXPORT_SYMBOL(sk_common_release);
1449
1450static DEFINE_RWLOCK(proto_list_lock);
1451static LIST_HEAD(proto_list);
1452
1453int proto_register(struct proto *prot, int alloc_slab)
1454{
1455	char *request_sock_slab_name = NULL;
1456	char *timewait_sock_slab_name;
1457	int rc = -ENOBUFS;
1458
1459	if (alloc_slab) {
1460		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1461					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1462
1463		if (prot->slab == NULL) {
1464			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1465			       prot->name);
1466			goto out;
1467		}
1468
1469		if (prot->rsk_prot != NULL) {
1470			static const char mask[] = "request_sock_%s";
1471
1472			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1473			if (request_sock_slab_name == NULL)
1474				goto out_free_sock_slab;
1475
1476			sprintf(request_sock_slab_name, mask, prot->name);
1477			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1478								 prot->rsk_prot->obj_size, 0,
1479								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1480
1481			if (prot->rsk_prot->slab == NULL) {
1482				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1483				       prot->name);
1484				goto out_free_request_sock_slab_name;
1485			}
1486		}
1487
1488		if (prot->twsk_obj_size) {
1489			static const char mask[] = "tw_sock_%s";
1490
1491			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1492
1493			if (timewait_sock_slab_name == NULL)
1494				goto out_free_request_sock_slab;
1495
1496			sprintf(timewait_sock_slab_name, mask, prot->name);
1497			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
1498							    prot->twsk_obj_size,
1499							    0, SLAB_HWCACHE_ALIGN,
1500							    NULL, NULL);
1501			if (prot->twsk_slab == NULL)
1502				goto out_free_timewait_sock_slab_name;
1503		}
1504	}
1505
1506	write_lock(&proto_list_lock);
1507	list_add(&prot->node, &proto_list);
1508	write_unlock(&proto_list_lock);
1509	rc = 0;
1510out:
1511	return rc;
1512out_free_timewait_sock_slab_name:
1513	kfree(timewait_sock_slab_name);
1514out_free_request_sock_slab:
1515	if (prot->rsk_prot && prot->rsk_prot->slab) {
1516		kmem_cache_destroy(prot->rsk_prot->slab);
1517		prot->rsk_prot->slab = NULL;
1518	}
1519out_free_request_sock_slab_name:
1520	kfree(request_sock_slab_name);
1521out_free_sock_slab:
1522	kmem_cache_destroy(prot->slab);
1523	prot->slab = NULL;
1524	goto out;
1525}
1526
1527EXPORT_SYMBOL(proto_register);
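
/*
 * A hypothetical registration (my_proto and my_sock are placeholders):
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	rc = proto_register(&my_proto, 1);	// 1: create a sock slab
 *
 * With alloc_slab set, sk_alloc() will carve socks for this protocol
 * out of the dedicated "MYPROTO" kmem cache instead of kmalloc().
 */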
1528
1529void proto_unregister(struct proto *prot)
1530{
1531	write_lock(&proto_list_lock);
1532
1533	if (prot->slab != NULL) {
1534		kmem_cache_destroy(prot->slab);
1535		prot->slab = NULL;
1536	}
1537
1538	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1539		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1540
1541		kmem_cache_destroy(prot->rsk_prot->slab);
1542		kfree(name);
1543		prot->rsk_prot->slab = NULL;
1544	}
1545
1546	if (prot->twsk_slab != NULL) {
1547		const char *name = kmem_cache_name(prot->twsk_slab);
1548
1549		kmem_cache_destroy(prot->twsk_slab);
1550		kfree(name);
1551		prot->twsk_slab = NULL;
1552	}
1553
1554	list_del(&prot->node);
1555	write_unlock(&proto_list_lock);
1556}
1557
1558EXPORT_SYMBOL(proto_unregister);
1559
1560#ifdef CONFIG_PROC_FS
1561static inline struct proto *__proto_head(void)
1562{
1563	return list_entry(proto_list.next, struct proto, node);
1564}
1565
1566static inline struct proto *proto_head(void)
1567{
1568	return list_empty(&proto_list) ? NULL : __proto_head();
1569}
1570
1571static inline struct proto *proto_next(struct proto *proto)
1572{
1573	return proto->node.next == &proto_list ? NULL :
1574		list_entry(proto->node.next, struct proto, node);
1575}
1576
1577static inline struct proto *proto_get_idx(loff_t pos)
1578{
1579	struct proto *proto;
1580	loff_t i = 0;
1581
1582	list_for_each_entry(proto, &proto_list, node)
1583		if (i++ == pos)
1584			goto out;
1585
1586	proto = NULL;
1587out:
1588	return proto;
1589}
1590
1591static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1592{
1593	read_lock(&proto_list_lock);
1594	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1595}
1596
1597static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1598{
1599	++*pos;
1600	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1601}
1602
1603static void proto_seq_stop(struct seq_file *seq, void *v)
1604{
1605	read_unlock(&proto_list_lock);
1606}
1607
1608static char proto_method_implemented(const void *method)
1609{
1610	return method == NULL ? 'n' : 'y';
1611}
1612
1613static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1614{
1615	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1616			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1617		   proto->name,
1618		   proto->obj_size,
1619		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1620		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1621		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1622		   proto->max_header,
1623		   proto->slab == NULL ? "no" : "yes",
1624		   module_name(proto->owner),
1625		   proto_method_implemented(proto->close),
1626		   proto_method_implemented(proto->connect),
1627		   proto_method_implemented(proto->disconnect),
1628		   proto_method_implemented(proto->accept),
1629		   proto_method_implemented(proto->ioctl),
1630		   proto_method_implemented(proto->init),
1631		   proto_method_implemented(proto->destroy),
1632		   proto_method_implemented(proto->shutdown),
1633		   proto_method_implemented(proto->setsockopt),
1634		   proto_method_implemented(proto->getsockopt),
1635		   proto_method_implemented(proto->sendmsg),
1636		   proto_method_implemented(proto->recvmsg),
1637		   proto_method_implemented(proto->sendpage),
1638		   proto_method_implemented(proto->bind),
1639		   proto_method_implemented(proto->backlog_rcv),
1640		   proto_method_implemented(proto->hash),
1641		   proto_method_implemented(proto->unhash),
1642		   proto_method_implemented(proto->get_port),
1643		   proto_method_implemented(proto->enter_memory_pressure));
1644}
1645
1646static int proto_seq_show(struct seq_file *seq, void *v)
1647{
1648	if (v == SEQ_START_TOKEN)
1649		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1650			   "protocol",
1651			   "size",
1652			   "sockets",
1653			   "memory",
1654			   "press",
1655			   "maxhdr",
1656			   "slab",
1657			   "module",
1658			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1659	else
1660		proto_seq_printf(seq, v);
1661	return 0;
1662}
1663
1664static struct seq_operations proto_seq_ops = {
1665	.start  = proto_seq_start,
1666	.next   = proto_seq_next,
1667	.stop   = proto_seq_stop,
1668	.show   = proto_seq_show,
1669};
1670
1671static int proto_seq_open(struct inode *inode, struct file *file)
1672{
1673	return seq_open(file, &proto_seq_ops);
1674}
1675
1676static struct file_operations proto_seq_fops = {
1677	.owner		= THIS_MODULE,
1678	.open		= proto_seq_open,
1679	.read		= seq_read,
1680	.llseek		= seq_lseek,
1681	.release	= seq_release,
1682};
1683
1684static int __init proto_init(void)
1685{
1686	/* register /proc/net/protocols */
1687	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1688}
1689
1690subsys_initcall(proto_init);
1691
1692#endif /* PROC_FS */
1693
1694EXPORT_SYMBOL(sk_alloc);
1695EXPORT_SYMBOL(sk_free);
1696EXPORT_SYMBOL(sk_send_sigurg);
1697EXPORT_SYMBOL(sock_alloc_send_skb);
1698EXPORT_SYMBOL(sock_init_data);
1699EXPORT_SYMBOL(sock_kfree_s);
1700EXPORT_SYMBOL(sock_kmalloc);
1701EXPORT_SYMBOL(sock_no_accept);
1702EXPORT_SYMBOL(sock_no_bind);
1703EXPORT_SYMBOL(sock_no_connect);
1704EXPORT_SYMBOL(sock_no_getname);
1705EXPORT_SYMBOL(sock_no_getsockopt);
1706EXPORT_SYMBOL(sock_no_ioctl);
1707EXPORT_SYMBOL(sock_no_listen);
1708EXPORT_SYMBOL(sock_no_mmap);
1709EXPORT_SYMBOL(sock_no_poll);
1710EXPORT_SYMBOL(sock_no_recvmsg);
1711EXPORT_SYMBOL(sock_no_sendmsg);
1712EXPORT_SYMBOL(sock_no_sendpage);
1713EXPORT_SYMBOL(sock_no_setsockopt);
1714EXPORT_SYMBOL(sock_no_shutdown);
1715EXPORT_SYMBOL(sock_no_socketpair);
1716EXPORT_SYMBOL(sock_rfree);
1717EXPORT_SYMBOL(sock_setsockopt);
1718EXPORT_SYMBOL(sock_wfree);
1719EXPORT_SYMBOL(sock_wmalloc);
1720EXPORT_SYMBOL(sock_i_uid);
1721EXPORT_SYMBOL(sock_i_ino);
1722#ifdef CONFIG_SYSCTL
1723EXPORT_SYMBOL(sysctl_optmem_max);
1724EXPORT_SYMBOL(sysctl_rmem_max);
1725EXPORT_SYMBOL(sysctl_wmem_max);
1726#endif
1727