sock.c revision f67ed26f2b3e92c0450deae3ffc3fff21c878a75
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly, and is
 *					not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

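/*
 * Example (illustrative): the four tunables above appear under
 * /proc/sys/net/core, so the send-buffer ceiling can be raised at
 * run time with
 *
 *	sysctl -w net.core.wmem_max=262144
 */
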
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
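
/*
 * Example: with HZ == 1000, a timeout of { .tv_sec = 1, .tv_usec = 500000 }
 * converts to 1 * 1000 + (500000 + 999) / 1000 == 1500 jiffies.
 */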

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
		case SO_DEBUG:
			if (val && !capable(CAP_NET_ADMIN)) {
				ret = -EACCES;
			}
			else if (valbool)
				sock_set_flag(sk, SOCK_DBG);
			else
				sock_reset_flag(sk, SOCK_DBG);
			break;
		case SO_REUSEADDR:
			sk->sk_reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			if (valbool)
				sock_set_flag(sk, SOCK_LOCALROUTE);
			else
				sock_reset_flag(sk, SOCK_LOCALROUTE);
			break;
		case SO_BROADCAST:
			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
			break;
		case SO_SNDBUF:
			/* Don't error on this; BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;
set_sndbuf:
			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sk_sndbuf = val * 2;

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->sk_write_space(sk);
			break;

		case SO_SNDBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_sndbuf;

		case SO_RCVBUF:
			/* Don't error on this; BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;
set_rcvbuf:
			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->sk_rcvbuf = val * 2;
			break;

		case SO_RCVBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_rcvbuf;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->sk_protocol == IPPROTO_TCP)
				tcp_set_keepalive(sk, valbool);
#endif
			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
			break;

		case SO_OOBINLINE:
			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
			break;

		case SO_NO_CHECK:
			sk->sk_no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->sk_priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if (optlen < sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling, optval, sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if (!ling.l_onoff)
				sock_reset_flag(sk, SOCK_LINGER);
			else {
#if (BITS_PER_LONG == 32)
				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
				sock_set_flag(sk, SOCK_LINGER);
			}
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("setsockopt");
			break;

		case SO_PASSCRED:
			if (valbool)
				set_bit(SOCK_PASSCRED, &sock->flags);
			else
				clear_bit(SOCK_PASSCRED, &sock->flags);
			break;

		case SO_TIMESTAMP:
			if (valbool) {
				sock_set_flag(sk, SOCK_RCVTSTAMP);
				sock_enable_timestamp(sk);
			} else
				sock_reset_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->sk_rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->sk_bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ - 1)
					optlen = IFNAMSIZ - 1;
				memset(devname, 0, sizeof(devname));
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->sk_bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->sk_bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif

		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->sk_lock.slock);
			filter = sk->sk_filter;
			if (filter) {
				sk->sk_filter = NULL;
				spin_unlock_bh(&sk->sk_lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->sk_lock.slock);
			ret = -ENONET;
			break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
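
/*
 * Example (userspace, illustrative): since SO_SNDBUF doubles the value
 * that was passed in, reading the option back reports twice the request:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
 *
 * leaves val == 131072, provided 65536 <= sysctl_wmem_max.
 */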

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
		case SO_DEBUG:
			v.val = sock_flag(sk, SOCK_DBG);
			break;

		case SO_DONTROUTE:
			v.val = sock_flag(sk, SOCK_LOCALROUTE);
			break;

		case SO_BROADCAST:
			v.val = !!sock_flag(sk, SOCK_BROADCAST);
			break;

		case SO_SNDBUF:
			v.val = sk->sk_sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->sk_rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->sk_reuse;
			break;

		case SO_KEEPALIVE:
			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
			break;

		case SO_TYPE:
			v.val = sk->sk_type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if (v.val == 0)
				v.val = xchg(&sk->sk_err_soft, 0);
			break;

		case SO_OOBINLINE:
			v.val = !!sock_flag(sk, SOCK_URGINLINE);
			break;

		case SO_NO_CHECK:
			v.val = sk->sk_no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->sk_priority;
			break;

		case SO_LINGER:
			lv		= sizeof(v.ling);
			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
			v.ling.l_linger	= sk->sk_lingertime / HZ;
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("getsockopt");
			break;

		case SO_TIMESTAMP:
			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->sk_rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val = 1;
			break;

		case SO_PASSCRED:
			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->sk_peercred))
				len = sizeof(sk->sk_peercred);
			if (copy_to_user(optval, &sk->sk_peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if (copy_to_user(optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = sk->sk_state == TCP_LISTEN;
			break;

		case SO_PEERSEC:
			return security_socket_getpeersec_stream(sock, optval, optlen, len);

		default:
			return -ENOPROTOOPT;
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
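
/*
 * Example (userspace, illustrative): SO_ERROR above fetches and clears a
 * pending socket error, e.g. to check the outcome of a non-blocking
 * connect():
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */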

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	kmem_cache_t *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free;
	}
	return sk;

out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

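/**
 *	sk_free - free a sock previously allocated by sk_alloc
 *	@sk: the sock to free
 *
 *	Runs the protocol destructor, drops any attached filter and
 *	returns the memory to the protocol's slab cache (or to kmalloc).
 */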
void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = sk->sk_filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->sk_filter = NULL;
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

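/**
 *	sk_clone - clone a sock, as when a connection request becomes a
 *	full socket (see the tcp_create_openreq_child note below)
 *	@sk: the sock to clone
 *	@priority: allocation priority
 *
 *	On success the new sock is returned with the bh lock held and a
 *	reference count of 2; %NULL is returned on allocation failure.
 */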
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		memcpy(newsk, sk, sk->sk_prot->obj_size);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

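/*
 * With 4 KiB pages the thresholds below correspond to machines with at
 * most 16 MB and at least 512 MB of memory, respectively.
 */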
void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* Do the add first, to avoid a race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

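/*
 * Example (in-kernel, illustrative): option memory must be returned via
 * sock_kfree_s() with the same size that was allocated, so that the
 * sk_omem_alloc accounting above stays balanced:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (buf != NULL) {
 *		...
 *		sock_kfree_s(sk, buf, len);
 *	}
 */
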
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, sk->sk_allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

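/*
 * Example: sock_alloc_send_pskb() places header_len bytes in the linear
 * area and splits data_len across page fragments. With 4 KiB pages,
 * data_len == 10000 gives npages == 3 and fragment sizes of 4096, 4096
 * and 1808 bytes.
 */
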
static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}

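/*
 * Example (illustrative): a protocol family that does not support a
 * given operation points the corresponding proto_ops slot at one of
 * the stubs above, e.g.:
 *
 *	static struct proto_ops foo_ops = {
 *		.family		= PF_FOO,
 *		...
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */
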
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}

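/*
 * A pending timer holds a reference on the sock: sk_reset_timer() takes
 * one when it arms a timer that was not already pending, and
 * sk_stop_timer() drops it when it cancels a pending timer.
 */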
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

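/**
 *	sock_init_data - initialise the common fields of a sock
 *	@sock: the owning socket, if any
 *	@sk: the sock to initialise
 *
 *	Sets up the queues, default callbacks, buffer sizes and timeouts.
 *	The sock starts out with SOCK_ZAPPED set and a reference count of
 *	one.
 */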
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec	=	-1L;
	sk->sk_stamp.tv_usec	=	-1L;

	atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock_bh(&sk->sk_lock.slock);
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	if (sk->sk_stamp.tv_sec == -1)
		return -ENOENT;
	if (sk->sk_stamp.tv_sec == 0)
		do_gettimeofday(&sk->sk_stamp);
	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
		-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

1402
1403int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1404			struct msghdr *msg, size_t size, int flags)
1405{
1406	struct sock *sk = sock->sk;
1407	int addr_len = 0;
1408	int err;
1409
1410	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1411				   flags & ~MSG_DONTWAIT, &addr_len);
1412	if (err >= 0)
1413		msg->msg_namelen = addr_len;
1414	return err;
1415}
1416
1417EXPORT_SYMBOL(sock_common_recvmsg);
1418
1419/*
1420 *	Set socket options on an inet socket.
1421 */
1422int sock_common_setsockopt(struct socket *sock, int level, int optname,
1423			   char __user *optval, int optlen)
1424{
1425	struct sock *sk = sock->sk;
1426
1427	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1428}
1429
1430EXPORT_SYMBOL(sock_common_setsockopt);
1431
1432#ifdef CONFIG_COMPAT
1433int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1434				  char __user *optval, int optlen)
1435{
1436	struct sock *sk = sock->sk;
1437
1438	if (sk->sk_prot->compat_setsockopt != NULL)
1439		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1440						      optval, optlen);
1441	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1442}
1443EXPORT_SYMBOL(compat_sock_common_setsockopt);
1444#endif
1445
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(timewait_sock_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0, SLAB_HWCACHE_ALIGN,
						  NULL, NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}

EXPORT_SYMBOL(proto_register);

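/*
 * Example (illustrative): a protocol typically registers its struct
 * proto once at module init, requesting a dedicated slab cache for its
 * socks, and unregisters it at module exit:
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_prot, 1);
 *	...
 *	proto_unregister(&foo_prot);
 */
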
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
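
/*
 * Example: reading /proc/net/protocols prints one row per registered
 * protocol; the trailing y/n columns mirror proto_method_implemented()
 * for each struct proto method, in the order given by the header row.
 */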

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
