sock.c revision c08e49611a8b4e38a75bf217e1029a48faf10b82
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

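/*
 * Worked example (illustrative only; sizeof(struct sk_buff) is an assumption
 * here and varies by architecture and config): with sizeof(struct sk_buff)
 * == 240 on some 64-bit build, _SK_MEM_OVERHEAD is 240 + 256 = 496 bytes, so
 * SK_WMEM_MAX and SK_RMEM_MAX both come out to 496 * 256 = 126976 bytes,
 * i.e. roughly 124 KiB of default buffer space per socket.
 */
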
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

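/*
 * For example, on a 64-bit machine (sizeof(unsigned long) == 8) with the
 * usual UIO_MAXIOV of 1024, sysctl_optmem_max starts out as
 * 8 * (2 * 1024 + 512) = 20480 bytes of option memory per socket.
 */
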
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
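
/*
 * Example of the conversion above: with HZ == 1000, a timeval of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes 2 * 1000 + (500000 + 999) / 1000
 * = 2500 jiffies; the fractional microseconds are rounded up, so a non-zero
 * timeout can never truncate to zero jiffies.
 */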

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
	   the number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}

	/* It would deadlock if sock_queue_rcv_skb were used with the
	   socket lock held! We assume that callers of this function
	   are lock free.
	*/
	err = sk_filter(sk, skb, 1);
	if (err)
		goto out;

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

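/*
 * A minimal sketch of how a protocol's delivery path might use
 * sock_queue_rcv_skb(); "myproto_rcv" is hypothetical, and the key point
 * is that the caller still owns the skb on error:
 *
 *	static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *		if (err < 0)
 *			kfree_skb(skb);	(caller keeps ownership on error)
 *		return err;
 *	}
 */
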
int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		rc = sk->sk_backlog_rcv(sk, skb);
	else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
		case SO_DEBUG:
			if (val && !capable(CAP_NET_ADMIN))
				ret = -EACCES;
			else if (valbool)
				sock_set_flag(sk, SOCK_DBG);
			else
				sock_reset_flag(sk, SOCK_DBG);
			break;
		case SO_REUSEADDR:
			sk->sk_reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			if (valbool)
				sock_set_flag(sk, SOCK_LOCALROUTE);
			else
				sock_reset_flag(sk, SOCK_LOCALROUTE);
			break;
		case SO_BROADCAST:
			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
			break;
		case SO_SNDBUF:
			/* Don't error on this: BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;
set_sndbuf:
			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sk_sndbuf = val * 2;

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->sk_write_space(sk);
			break;

		case SO_SNDBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_sndbuf;

		case SO_RCVBUF:
			/* Don't error on this: BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;
set_rcvbuf:
			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
			/*
			 * We double it on the way in to account for
			 * "struct sk_buff" etc. overhead.   Applications
			 * assume that the SO_RCVBUF setting they make will
			 * allow that much actual data to be received on that
			 * socket.
			 *
			 * Applications are unaware that "struct sk_buff" and
			 * other overheads allocate from the receive buffer
			 * during socket buffer allocation.
			 *
			 * And after considering the possible alternatives,
			 * returning the value we actually used in getsockopt
			 * is the most desirable behavior.
			 */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->sk_rcvbuf = val * 2;
			break;

		case SO_RCVBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_rcvbuf;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->sk_protocol == IPPROTO_TCP)
				tcp_set_keepalive(sk, valbool);
#endif
			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
			break;

		case SO_OOBINLINE:
			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
			break;

		case SO_NO_CHECK:
			sk->sk_no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->sk_priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if (optlen < sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling, optval, sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if (!ling.l_onoff)
				sock_reset_flag(sk, SOCK_LINGER);
			else {
#if (BITS_PER_LONG == 32)
				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
				sock_set_flag(sk, SOCK_LINGER);
			}
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("setsockopt");
			break;

		case SO_PASSCRED:
			if (valbool)
				set_bit(SOCK_PASSCRED, &sock->flags);
			else
				clear_bit(SOCK_PASSCRED, &sock->flags);
			break;

		case SO_TIMESTAMP:
			if (valbool) {
				sock_set_flag(sk, SOCK_RCVTSTAMP);
				sock_enable_timestamp(sk);
			} else
				sock_reset_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->sk_rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->sk_bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ - 1)
					optlen = IFNAMSIZ - 1;
				memset(devname, 0, sizeof(devname));
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->sk_bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->sk_bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif


		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->sk_lock.slock);
			filter = sk->sk_filter;
			if (filter) {
				sk->sk_filter = NULL;
				spin_unlock_bh(&sk->sk_lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->sk_lock.slock);
			ret = -ENONET;
			break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
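
/*
 * Example of the buffer-doubling semantics above: a setsockopt(fd,
 * SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)) with val == 8192 (assuming it is
 * within sysctl_rmem_max) stores sk->sk_rcvbuf == 16384, and a later
 * getsockopt(SO_RCVBUF) reports 16384; the factor of two covers the
 * struct sk_buff and related per-packet overhead described in the comment.
 */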

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
		case SO_DEBUG:
			v.val = sock_flag(sk, SOCK_DBG);
			break;

		case SO_DONTROUTE:
			v.val = sock_flag(sk, SOCK_LOCALROUTE);
			break;

		case SO_BROADCAST:
			v.val = !!sock_flag(sk, SOCK_BROADCAST);
			break;

		case SO_SNDBUF:
			v.val = sk->sk_sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->sk_rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->sk_reuse;
			break;

		case SO_KEEPALIVE:
			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
			break;

		case SO_TYPE:
			v.val = sk->sk_type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if (v.val == 0)
				v.val = xchg(&sk->sk_err_soft, 0);
			break;

		case SO_OOBINLINE:
			v.val = !!sock_flag(sk, SOCK_URGINLINE);
			break;

		case SO_NO_CHECK:
			v.val = sk->sk_no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->sk_priority;
			break;

		case SO_LINGER:
			lv		= sizeof(v.ling);
			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
			v.ling.l_linger	= sk->sk_lingertime / HZ;
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("getsockopt");
			break;

		case SO_TIMESTAMP:
			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->sk_rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val = 1;
			break;

		case SO_PASSCRED:
			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->sk_peercred))
				len = sizeof(sk->sk_peercred);
			if (copy_to_user(optval, &sk->sk_peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if (copy_to_user(optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = sk->sk_state == TCP_LISTEN;
			break;

		case SO_PEERSEC:
			return security_socket_getpeersec_stream(sock, optval, optlen, len);

		default:
			return -ENOPROTOOPT;
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	kmem_cache_t *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free;
	}
	return sk;

out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

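/*
 * A hedged sketch of typical sk_alloc()/sk_free() usage in a protocol's
 * create routine ("my_proto" and "my_create" are hypothetical names):
 *
 *	static int my_create(struct socket *sock, int protocol)
 *	{
 *		struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 *
 * On failure paths after allocation, sk_free(sk) below releases the slab
 * object, the security state and the module reference taken here.
 */
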
void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = sk->sk_filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->sk_filter = NULL;
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		memcpy(newsk, sk, sk->sk_prot->obj_size);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

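/*
 * sock_kmalloc() and sock_kfree_s() must be paired with the same size so
 * that sk_omem_alloc balances out; a minimal sketch:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	... use the option block ...
 *	sock_kfree_s(sk, opt, optlen);
 */
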
/* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks could be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, sk->sk_allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
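
/*
 * Typical caller pattern for the helper above (a sketch only; "hlen" stands
 * for whatever headroom the protocol needs and is hypothetical here):
 *
 *	skb = sock_alloc_send_skb(sk, len + hlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;	(err already holds -EAGAIN, -EPIPE, ...)
 *	skb_reserve(skb, hlen);
 */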

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);

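/*
 * Sketch of the usual sk_wait_data() loop in a recvmsg implementation
 * (illustrative only; called with the socket locked so release_sock() from
 * within sk_wait_event() can deliver backlogged data):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */
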
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

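/*
 * A protocol that supports only a few operations might wire the rest to
 * these stubs; a hedged sketch with only a subset of the proto_ops fields
 * shown ("my_ops" is a hypothetical name):
 *
 *	static struct proto_ops my_ops = {
 *		.family		= PF_INET,
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */
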
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec	=	-1L;
	sk->sk_stamp.tv_usec	=	-1L;

	atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock_bh(&sk->sk_lock.slock);
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
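
/*
 * Process-context users bracket socket state changes like this; any packets
 * that landed on the backlog while the lock was held are processed from
 * release_sock() via __release_sock():
 *
 *	lock_sock(sk);
 *	... modify sk state ...
 *	release_sock(sk);
 */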

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	if (sk->sk_stamp.tv_sec == -1)
		return -ENOENT;
	if (sk->sk_stamp.tv_sec == 0)
		do_gettimeofday(&sk->sk_stamp);
	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
		-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on a socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket any more, but the network still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name = NULL;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(timewait_sock_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0, SLAB_HWCACHE_ALIGN,
						  NULL, NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}

EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}

EXPORT_SYMBOL(proto_unregister);

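/*
 * Registration sketch for a protocol module ("my_proto", "struct my_sock"
 * and the init/exit names are hypothetical):
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return proto_register(&my_proto, 1);	(1 => allocate a slab)
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */
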
#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
1867