sock.c revision 2c7946a7bf45ae86736ab3b43d0085e43947945c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
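/*
 * Worked example: with UIO_MAXIOV == 1024 the expression above evaluates
 * to sizeof(unsigned long) * 2560, i.e. roughly 10 KB on 32-bit and
 * 20 KB on 64-bit platforms.
 */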

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
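
/*
 * User-space view of the conversion above (illustrative sketch only):
 * the timeval arrives from a plain setsockopt() call, and an all-zero
 * timeval selects MAX_SCHEDULE_TIMEOUT, i.e. "wait forever":
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */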

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm,  current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk=sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if(optlen<sizeof(int))
		return(-EINVAL);

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val?1:0;

	lock_sock(sk);

	switch(optname)
	{
		case SO_DEBUG:
			if(val && !capable(CAP_NET_ADMIN))
			{
				ret = -EACCES;
			}
			else if (valbool)
				sock_set_flag(sk, SOCK_DBG);
			else
				sock_reset_flag(sk, SOCK_DBG);
			break;
		case SO_REUSEADDR:
			sk->sk_reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			if (valbool)
				sock_set_flag(sk, SOCK_LOCALROUTE);
			else
				sock_reset_flag(sk, SOCK_LOCALROUTE);
			break;
		case SO_BROADCAST:
			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
			break;
		case SO_SNDBUF:
			/* Don't return an error here; BSD doesn't, and if
			   you think about it this is right. Otherwise apps
			   would have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated as hints in BSD. */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;
set_sndbuf:
			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sk_sndbuf = val * 2;

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->sk_write_space(sk);
			break;

		case SO_SNDBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_sndbuf;

		case SO_RCVBUF:
			/* Don't return an error here; BSD doesn't, and if
			   you think about it this is right. Otherwise apps
			   would have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated as hints in BSD. */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;
set_rcvbuf:
			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->sk_rcvbuf = val * 2;
			break;

		case SO_RCVBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_rcvbuf;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->sk_protocol == IPPROTO_TCP)
				tcp_set_keepalive(sk, valbool);
#endif
			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
			break;

		case SO_OOBINLINE:
			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
			break;

		case SO_NO_CHECK:
			sk->sk_no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->sk_priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if(optlen<sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling,optval,sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if (!ling.l_onoff)
				sock_reset_flag(sk, SOCK_LINGER);
			else {
#if (BITS_PER_LONG == 32)
				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
				sock_set_flag(sk, SOCK_LINGER);
			}
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("setsockopt");
			break;

		case SO_PASSCRED:
			if (valbool)
				set_bit(SOCK_PASSCRED, &sock->flags);
			else
				clear_bit(SOCK_PASSCRED, &sock->flags);
			break;

		case SO_TIMESTAMP:
			if (valbool)  {
				sock_set_flag(sk, SOCK_RCVTSTAMP);
				sock_enable_timestamp(sk);
			} else
				sock_reset_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->sk_rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->sk_bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ)
					optlen = IFNAMSIZ;
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->sk_bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->sk_bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif


		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->sk_lock.slock);
			filter = sk->sk_filter;
			if (filter) {
				sk->sk_filter = NULL;
				spin_unlock_bh(&sk->sk_lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->sk_lock.slock);
			ret = -ENONET;
			break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
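
/*
 * User-space view of the SO_SNDBUF handling above (illustrative sketch
 * only): the kernel stores twice the requested value to leave headroom
 * for sk_buff overhead, so a subsequent getsockopt() reports 2 * val:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &out, &len);
 *	// out == 131072, assuming val did not exceed sysctl_wmem_max
 */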


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union
	{
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if(get_user(len,optlen))
		return -EFAULT;
	if(len < 0)
		return -EINVAL;

	switch(optname)
	{
		case SO_DEBUG:
			v.val = sock_flag(sk, SOCK_DBG);
			break;

		case SO_DONTROUTE:
			v.val = sock_flag(sk, SOCK_LOCALROUTE);
			break;

		case SO_BROADCAST:
			v.val = !!sock_flag(sk, SOCK_BROADCAST);
			break;

		case SO_SNDBUF:
			v.val = sk->sk_sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->sk_rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->sk_reuse;
			break;

		case SO_KEEPALIVE:
			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
			break;

		case SO_TYPE:
			v.val = sk->sk_type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if(v.val==0)
				v.val = xchg(&sk->sk_err_soft, 0);
			break;

		case SO_OOBINLINE:
			v.val = !!sock_flag(sk, SOCK_URGINLINE);
			break;

		case SO_NO_CHECK:
			v.val = sk->sk_no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->sk_priority;
			break;

		case SO_LINGER:
			lv		= sizeof(v.ling);
			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
			v.ling.l_linger	= sk->sk_lingertime / HZ;
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("getsockopt");
			break;

		case SO_TIMESTAMP:
			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVTIMEO:
			lv=sizeof(struct timeval);
			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv=sizeof(struct timeval);
			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->sk_rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val=1;
			break;

		case SO_PASSCRED:
			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->sk_peercred))
				len = sizeof(sk->sk_peercred);
			if (copy_to_user(optval, &sk->sk_peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if (copy_to_user(optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = sk->sk_state == TCP_LISTEN;
			break;

		case SO_PEERSEC:
			return security_socket_getpeersec_stream(sock, optval, optlen, len);

		default:
			return(-ENOPROTOOPT);
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
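
/*
 * Typical use of the SO_ERROR case above (illustrative sketch only):
 * after a non-blocking connect() reports writability, user space
 * fetches and atomically clears the pending error:
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	if (err)	/* 0 means the connect succeeded */
 *		errno = err;
 */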

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	kmem_cache_t *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free;
	}
	return sk;

out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
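
/*
 * Typical caller (illustrative sketch; my_proto is hypothetical): a
 * protocol's create routine allocates the sock and then initialises
 * the generic fields:
 *
 *	sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where my_proto is a struct proto previously passed to proto_register().
 */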

void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = sk->sk_filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->sk_filter = NULL;
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		memcpy(newsk, sk, sk->sk_prot->obj_size);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff * skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
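
/*
 * Allocation and free must pass the same size so that sk_omem_alloc
 * balances (illustrative sketch; struct my_opts is hypothetical):
 *
 *	struct my_opts *opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *
 *	if (opts == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));
 */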

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, sk->sk_allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for(;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if(!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);
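
/*
 * Typical shape of a caller (illustrative sketch only, with the socket
 * lock held as the comment above requires):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */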

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
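
/*
 * How these stubs are wired up (illustrative sketch; the ops table and
 * PF_MYPROTO are hypothetical): a family that lacks a feature points the
 * corresponding proto_ops entry at the matching sock_no_* default:
 *
 *	static const struct proto_ops my_ops = {
 *		.family		= PF_MYPROTO,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */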

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk,0,POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk,1,POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if(sock)
	{
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid 	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec     = -1L;
	sk->sk_stamp.tv_usec    = -1L;

	atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&(sk->sk_lock.slock));
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
	spin_lock_bh(&(sk->sk_lock.slock));
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&(sk->sk_lock.wq)))
		wake_up(&(sk->sk_lock.wq));
	spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);
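
/*
 * Usage pattern (illustrative sketch only): process context brackets
 * socket state changes with this pair; packets delivered in softirq
 * context while the lock was owned sit on the backlog and are replayed
 * by __release_sock() on release:
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */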

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	if (sk->sk_stamp.tv_sec == -1)
		return -ENOENT;
	if (sk->sk_stamp.tv_sec == 0)
		do_gettimeofday(&sk->sk_stamp);
	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
		-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight, because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(timewait_sock_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0, SLAB_HWCACHE_ALIGN,
						  NULL, NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}

EXPORT_SYMBOL(proto_register);
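
/*
 * Typical registration (illustrative sketch): an address family registers
 * its struct proto during init and unregisters it on unload, e.g.
 *
 *	rc = proto_register(&tcp_prot, 1);
 *	if (rc)
 *		goto out;
 *	...
 *	proto_unregister(&tcp_prot);
 */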

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif