sock.c revision b7aa0bf70c4afb9e38be25f5c0922498d0f8684c
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
/*
 * Each address family might have different locking rules, so we keep
 * separate lock classes per address family: one key for the socket
 * lock (sk_lock) and one for its spinlock (slock).  Both arrays are
 * indexed by sk->sk_family in sock_lock_init().
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
139
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * Both tables are positional: entry N is the name used for address
 * family number N, and sock_lock_init() indexes them by sk->sk_family.
 * Entries without a symbolic family name carry the bare number.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
};
/* Same layout as above, but naming the spinlock half of the socket lock. */
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-29"          ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
};
#endif
173
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.  sk_clone()
 * assigns the key selected by the new socket's sk_family.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
179
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * The compile-time maximum is room for _SK_MEM_PACKETS buffers, each
 * accounted as one struct sk_buff plus 256 bytes.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters.  sk_init() overrides these initial
 * values on machines with very little or very much physical memory.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
195
/* Maximal space eaten by iovec or ancillary data plus some space.
 * sock_kmalloc() uses this as the per-socket option memory limit.
 */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
198
199static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
200{
201	struct timeval tv;
202
203	if (optlen < sizeof(tv))
204		return -EINVAL;
205	if (copy_from_user(&tv, optval, sizeof(tv)))
206		return -EFAULT;
207
208	*timeo_p = MAX_SCHEDULE_TIMEOUT;
209	if (tv.tv_sec == 0 && tv.tv_usec == 0)
210		return 0;
211	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
212		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
213	return 0;
214}
215
216static void sock_warn_obsolete_bsdism(const char *name)
217{
218	static int warned;
219	static char warncomm[TASK_COMM_LEN];
220	if (strcmp(warncomm, current->comm) && warned < 5) {
221		strcpy(warncomm,  current->comm);
222		printk(KERN_WARNING "process `%s' is using obsolete "
223		       "%s SO_BSDCOMPAT\n", warncomm, name);
224		warned++;
225	}
226}
227
228static void sock_disable_timestamp(struct sock *sk)
229{
230	if (sock_flag(sk, SOCK_TIMESTAMP)) {
231		sock_reset_flag(sk, SOCK_TIMESTAMP);
232		net_disable_timestamp();
233	}
234}
235
236
237int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
238{
239	int err = 0;
240	int skb_len;
241
242	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
243	   number of warnings when compiling with -W --ANK
244	 */
245	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
246	    (unsigned)sk->sk_rcvbuf) {
247		err = -ENOMEM;
248		goto out;
249	}
250
251	err = sk_filter(sk, skb);
252	if (err)
253		goto out;
254
255	skb->dev = NULL;
256	skb_set_owner_r(skb, sk);
257
258	/* Cache the SKB length before we tack it onto the receive
259	 * queue.  Once it is added it no longer belongs to us and
260	 * may be freed by other threads of control pulling packets
261	 * from the queue.
262	 */
263	skb_len = skb->len;
264
265	skb_queue_tail(&sk->sk_receive_queue, skb);
266
267	if (!sock_flag(sk, SOCK_DEAD))
268		sk->sk_data_ready(sk, skb_len);
269out:
270	return err;
271}
272EXPORT_SYMBOL(sock_queue_rcv_skb);
273
274int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
275{
276	int rc = NET_RX_SUCCESS;
277
278	if (sk_filter(sk, skb))
279		goto discard_and_relse;
280
281	skb->dev = NULL;
282
283	if (nested)
284		bh_lock_sock_nested(sk);
285	else
286		bh_lock_sock(sk);
287	if (!sock_owned_by_user(sk)) {
288		/*
289		 * trylock + unlock semantics:
290		 */
291		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
292
293		rc = sk->sk_backlog_rcv(sk, skb);
294
295		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
296	} else
297		sk_add_backlog(sk, skb);
298	bh_unlock_sock(sk);
299out:
300	sock_put(sk);
301	return rc;
302discard_and_relse:
303	kfree_skb(skb);
304	goto out;
305}
306EXPORT_SYMBOL(sk_receive_skb);
307
308struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
309{
310	struct dst_entry *dst = sk->sk_dst_cache;
311
312	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
313		sk->sk_dst_cache = NULL;
314		dst_release(dst);
315		return NULL;
316	}
317
318	return dst;
319}
320EXPORT_SYMBOL(__sk_dst_check);
321
322struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
323{
324	struct dst_entry *dst = sk_dst_get(sk);
325
326	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
327		sk_dst_reset(sk);
328		dst_release(dst);
329		return NULL;
330	}
331
332	return dst;
333}
334EXPORT_SYMBOL(sk_dst_check);
335
336/*
337 *	This is meant for all protocols to use and covers goings on
338 *	at the socket level. Everything here is generic.
339 */
340
341int sock_setsockopt(struct socket *sock, int level, int optname,
342		    char __user *optval, int optlen)
343{
344	struct sock *sk=sock->sk;
345	struct sk_filter *filter;
346	int val;
347	int valbool;
348	struct linger ling;
349	int ret = 0;
350
351	/*
352	 *	Options without arguments
353	 */
354
355#ifdef SO_DONTLINGER		/* Compatibility item... */
356	if (optname == SO_DONTLINGER) {
357		lock_sock(sk);
358		sock_reset_flag(sk, SOCK_LINGER);
359		release_sock(sk);
360		return 0;
361	}
362#endif
363
364	if(optlen<sizeof(int))
365		return(-EINVAL);
366
367	if (get_user(val, (int __user *)optval))
368		return -EFAULT;
369
370	valbool = val?1:0;
371
372	lock_sock(sk);
373
374	switch(optname)
375	{
376		case SO_DEBUG:
377			if(val && !capable(CAP_NET_ADMIN))
378			{
379				ret = -EACCES;
380			}
381			else if (valbool)
382				sock_set_flag(sk, SOCK_DBG);
383			else
384				sock_reset_flag(sk, SOCK_DBG);
385			break;
386		case SO_REUSEADDR:
387			sk->sk_reuse = valbool;
388			break;
389		case SO_TYPE:
390		case SO_ERROR:
391			ret = -ENOPROTOOPT;
392			break;
393		case SO_DONTROUTE:
394			if (valbool)
395				sock_set_flag(sk, SOCK_LOCALROUTE);
396			else
397				sock_reset_flag(sk, SOCK_LOCALROUTE);
398			break;
399		case SO_BROADCAST:
400			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
401			break;
402		case SO_SNDBUF:
403			/* Don't error on this BSD doesn't and if you think
404			   about it this is right. Otherwise apps have to
405			   play 'guess the biggest size' games. RCVBUF/SNDBUF
406			   are treated in BSD as hints */
407
408			if (val > sysctl_wmem_max)
409				val = sysctl_wmem_max;
410set_sndbuf:
411			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
412			if ((val * 2) < SOCK_MIN_SNDBUF)
413				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
414			else
415				sk->sk_sndbuf = val * 2;
416
417			/*
418			 *	Wake up sending tasks if we
419			 *	upped the value.
420			 */
421			sk->sk_write_space(sk);
422			break;
423
424		case SO_SNDBUFFORCE:
425			if (!capable(CAP_NET_ADMIN)) {
426				ret = -EPERM;
427				break;
428			}
429			goto set_sndbuf;
430
431		case SO_RCVBUF:
432			/* Don't error on this BSD doesn't and if you think
433			   about it this is right. Otherwise apps have to
434			   play 'guess the biggest size' games. RCVBUF/SNDBUF
435			   are treated in BSD as hints */
436
437			if (val > sysctl_rmem_max)
438				val = sysctl_rmem_max;
439set_rcvbuf:
440			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
441			/*
442			 * We double it on the way in to account for
443			 * "struct sk_buff" etc. overhead.   Applications
444			 * assume that the SO_RCVBUF setting they make will
445			 * allow that much actual data to be received on that
446			 * socket.
447			 *
448			 * Applications are unaware that "struct sk_buff" and
449			 * other overheads allocate from the receive buffer
450			 * during socket buffer allocation.
451			 *
452			 * And after considering the possible alternatives,
453			 * returning the value we actually used in getsockopt
454			 * is the most desirable behavior.
455			 */
456			if ((val * 2) < SOCK_MIN_RCVBUF)
457				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
458			else
459				sk->sk_rcvbuf = val * 2;
460			break;
461
462		case SO_RCVBUFFORCE:
463			if (!capable(CAP_NET_ADMIN)) {
464				ret = -EPERM;
465				break;
466			}
467			goto set_rcvbuf;
468
469		case SO_KEEPALIVE:
470#ifdef CONFIG_INET
471			if (sk->sk_protocol == IPPROTO_TCP)
472				tcp_set_keepalive(sk, valbool);
473#endif
474			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
475			break;
476
477		case SO_OOBINLINE:
478			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
479			break;
480
481		case SO_NO_CHECK:
482			sk->sk_no_check = valbool;
483			break;
484
485		case SO_PRIORITY:
486			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
487				sk->sk_priority = val;
488			else
489				ret = -EPERM;
490			break;
491
492		case SO_LINGER:
493			if(optlen<sizeof(ling)) {
494				ret = -EINVAL;	/* 1003.1g */
495				break;
496			}
497			if (copy_from_user(&ling,optval,sizeof(ling))) {
498				ret = -EFAULT;
499				break;
500			}
501			if (!ling.l_onoff)
502				sock_reset_flag(sk, SOCK_LINGER);
503			else {
504#if (BITS_PER_LONG == 32)
505				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
506					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
507				else
508#endif
509					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
510				sock_set_flag(sk, SOCK_LINGER);
511			}
512			break;
513
514		case SO_BSDCOMPAT:
515			sock_warn_obsolete_bsdism("setsockopt");
516			break;
517
518		case SO_PASSCRED:
519			if (valbool)
520				set_bit(SOCK_PASSCRED, &sock->flags);
521			else
522				clear_bit(SOCK_PASSCRED, &sock->flags);
523			break;
524
525		case SO_TIMESTAMP:
526			if (valbool)  {
527				sock_set_flag(sk, SOCK_RCVTSTAMP);
528				sock_enable_timestamp(sk);
529			} else
530				sock_reset_flag(sk, SOCK_RCVTSTAMP);
531			break;
532
533		case SO_RCVLOWAT:
534			if (val < 0)
535				val = INT_MAX;
536			sk->sk_rcvlowat = val ? : 1;
537			break;
538
539		case SO_RCVTIMEO:
540			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
541			break;
542
543		case SO_SNDTIMEO:
544			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
545			break;
546
547#ifdef CONFIG_NETDEVICES
548		case SO_BINDTODEVICE:
549		{
550			char devname[IFNAMSIZ];
551
552			/* Sorry... */
553			if (!capable(CAP_NET_RAW)) {
554				ret = -EPERM;
555				break;
556			}
557
558			/* Bind this socket to a particular device like "eth0",
559			 * as specified in the passed interface name. If the
560			 * name is "" or the option length is zero the socket
561			 * is not bound.
562			 */
563
564			if (!valbool) {
565				sk->sk_bound_dev_if = 0;
566			} else {
567				if (optlen > IFNAMSIZ - 1)
568					optlen = IFNAMSIZ - 1;
569				memset(devname, 0, sizeof(devname));
570				if (copy_from_user(devname, optval, optlen)) {
571					ret = -EFAULT;
572					break;
573				}
574
575				/* Remove any cached route for this socket. */
576				sk_dst_reset(sk);
577
578				if (devname[0] == '\0') {
579					sk->sk_bound_dev_if = 0;
580				} else {
581					struct net_device *dev = dev_get_by_name(devname);
582					if (!dev) {
583						ret = -ENODEV;
584						break;
585					}
586					sk->sk_bound_dev_if = dev->ifindex;
587					dev_put(dev);
588				}
589			}
590			break;
591		}
592#endif
593
594
595		case SO_ATTACH_FILTER:
596			ret = -EINVAL;
597			if (optlen == sizeof(struct sock_fprog)) {
598				struct sock_fprog fprog;
599
600				ret = -EFAULT;
601				if (copy_from_user(&fprog, optval, sizeof(fprog)))
602					break;
603
604				ret = sk_attach_filter(&fprog, sk);
605			}
606			break;
607
608		case SO_DETACH_FILTER:
609			rcu_read_lock_bh();
610			filter = rcu_dereference(sk->sk_filter);
611			if (filter) {
612				rcu_assign_pointer(sk->sk_filter, NULL);
613				sk_filter_release(sk, filter);
614				rcu_read_unlock_bh();
615				break;
616			}
617			rcu_read_unlock_bh();
618			ret = -ENONET;
619			break;
620
621		case SO_PASSSEC:
622			if (valbool)
623				set_bit(SOCK_PASSSEC, &sock->flags);
624			else
625				clear_bit(SOCK_PASSSEC, &sock->flags);
626			break;
627
628		/* We implement the SO_SNDLOWAT etc to
629		   not be settable (1003.1g 5.3) */
630		default:
631			ret = -ENOPROTOOPT;
632			break;
633	}
634	release_sock(sk);
635	return ret;
636}
637
638
639int sock_getsockopt(struct socket *sock, int level, int optname,
640		    char __user *optval, int __user *optlen)
641{
642	struct sock *sk = sock->sk;
643
644	union
645	{
646		int val;
647		struct linger ling;
648		struct timeval tm;
649	} v;
650
651	unsigned int lv = sizeof(int);
652	int len;
653
654	if(get_user(len,optlen))
655		return -EFAULT;
656	if(len < 0)
657		return -EINVAL;
658
659	switch(optname)
660	{
661		case SO_DEBUG:
662			v.val = sock_flag(sk, SOCK_DBG);
663			break;
664
665		case SO_DONTROUTE:
666			v.val = sock_flag(sk, SOCK_LOCALROUTE);
667			break;
668
669		case SO_BROADCAST:
670			v.val = !!sock_flag(sk, SOCK_BROADCAST);
671			break;
672
673		case SO_SNDBUF:
674			v.val = sk->sk_sndbuf;
675			break;
676
677		case SO_RCVBUF:
678			v.val = sk->sk_rcvbuf;
679			break;
680
681		case SO_REUSEADDR:
682			v.val = sk->sk_reuse;
683			break;
684
685		case SO_KEEPALIVE:
686			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
687			break;
688
689		case SO_TYPE:
690			v.val = sk->sk_type;
691			break;
692
693		case SO_ERROR:
694			v.val = -sock_error(sk);
695			if(v.val==0)
696				v.val = xchg(&sk->sk_err_soft, 0);
697			break;
698
699		case SO_OOBINLINE:
700			v.val = !!sock_flag(sk, SOCK_URGINLINE);
701			break;
702
703		case SO_NO_CHECK:
704			v.val = sk->sk_no_check;
705			break;
706
707		case SO_PRIORITY:
708			v.val = sk->sk_priority;
709			break;
710
711		case SO_LINGER:
712			lv		= sizeof(v.ling);
713			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
714			v.ling.l_linger	= sk->sk_lingertime / HZ;
715			break;
716
717		case SO_BSDCOMPAT:
718			sock_warn_obsolete_bsdism("getsockopt");
719			break;
720
721		case SO_TIMESTAMP:
722			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
723			break;
724
725		case SO_RCVTIMEO:
726			lv=sizeof(struct timeval);
727			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
728				v.tm.tv_sec = 0;
729				v.tm.tv_usec = 0;
730			} else {
731				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
732				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
733			}
734			break;
735
736		case SO_SNDTIMEO:
737			lv=sizeof(struct timeval);
738			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
739				v.tm.tv_sec = 0;
740				v.tm.tv_usec = 0;
741			} else {
742				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
743				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
744			}
745			break;
746
747		case SO_RCVLOWAT:
748			v.val = sk->sk_rcvlowat;
749			break;
750
751		case SO_SNDLOWAT:
752			v.val=1;
753			break;
754
755		case SO_PASSCRED:
756			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
757			break;
758
759		case SO_PEERCRED:
760			if (len > sizeof(sk->sk_peercred))
761				len = sizeof(sk->sk_peercred);
762			if (copy_to_user(optval, &sk->sk_peercred, len))
763				return -EFAULT;
764			goto lenout;
765
766		case SO_PEERNAME:
767		{
768			char address[128];
769
770			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
771				return -ENOTCONN;
772			if (lv < len)
773				return -EINVAL;
774			if (copy_to_user(optval, address, len))
775				return -EFAULT;
776			goto lenout;
777		}
778
779		/* Dubious BSD thing... Probably nobody even uses it, but
780		 * the UNIX standard wants it for whatever reason... -DaveM
781		 */
782		case SO_ACCEPTCONN:
783			v.val = sk->sk_state == TCP_LISTEN;
784			break;
785
786		case SO_PASSSEC:
787			v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
788			break;
789
790		case SO_PEERSEC:
791			return security_socket_getpeersec_stream(sock, optval, optlen, len);
792
793		default:
794			return(-ENOPROTOOPT);
795	}
796	if (len > lv)
797		len = lv;
798	if (copy_to_user(optval, &v, len))
799		return -EFAULT;
800lenout:
801	if (put_user(len, optlen))
802		return -EFAULT;
803	return 0;
804}
805
806/*
807 * Initialize an sk_lock.
808 *
809 * (We also register the sk_lock with the lock validator.)
810 */
811static inline void sock_lock_init(struct sock *sk)
812{
813	sock_lock_init_class_and_name(sk,
814			af_family_slock_key_strings[sk->sk_family],
815			af_family_slock_keys + sk->sk_family,
816			af_family_key_strings[sk->sk_family],
817			af_family_keys + sk->sk_family);
818}
819
820/**
821 *	sk_alloc - All socket objects are allocated here
822 *	@family: protocol family
823 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
824 *	@prot: struct proto associated with this new sock instance
825 *	@zero_it: if we should zero the newly allocated sock
826 */
827struct sock *sk_alloc(int family, gfp_t priority,
828		      struct proto *prot, int zero_it)
829{
830	struct sock *sk = NULL;
831	struct kmem_cache *slab = prot->slab;
832
833	if (slab != NULL)
834		sk = kmem_cache_alloc(slab, priority);
835	else
836		sk = kmalloc(prot->obj_size, priority);
837
838	if (sk) {
839		if (zero_it) {
840			memset(sk, 0, prot->obj_size);
841			sk->sk_family = family;
842			/*
843			 * See comment in struct sock definition to understand
844			 * why we need sk_prot_creator -acme
845			 */
846			sk->sk_prot = sk->sk_prot_creator = prot;
847			sock_lock_init(sk);
848		}
849
850		if (security_sk_alloc(sk, family, priority))
851			goto out_free;
852
853		if (!try_module_get(prot->owner))
854			goto out_free;
855	}
856	return sk;
857
858out_free:
859	if (slab != NULL)
860		kmem_cache_free(slab, sk);
861	else
862		kfree(sk);
863	return NULL;
864}
865
866void sk_free(struct sock *sk)
867{
868	struct sk_filter *filter;
869	struct module *owner = sk->sk_prot_creator->owner;
870
871	if (sk->sk_destruct)
872		sk->sk_destruct(sk);
873
874	filter = rcu_dereference(sk->sk_filter);
875	if (filter) {
876		sk_filter_release(sk, filter);
877		rcu_assign_pointer(sk->sk_filter, NULL);
878	}
879
880	sock_disable_timestamp(sk);
881
882	if (atomic_read(&sk->sk_omem_alloc))
883		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
884		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
885
886	security_sk_free(sk);
887	if (sk->sk_prot_creator->slab != NULL)
888		kmem_cache_free(sk->sk_prot_creator->slab, sk);
889	else
890		kfree(sk);
891	module_put(owner);
892}
893
894struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
895{
896	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
897
898	if (newsk != NULL) {
899		struct sk_filter *filter;
900
901		sock_copy(newsk, sk);
902
903		/* SANITY */
904		sk_node_init(&newsk->sk_node);
905		sock_lock_init(newsk);
906		bh_lock_sock(newsk);
907		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
908
909		atomic_set(&newsk->sk_rmem_alloc, 0);
910		atomic_set(&newsk->sk_wmem_alloc, 0);
911		atomic_set(&newsk->sk_omem_alloc, 0);
912		skb_queue_head_init(&newsk->sk_receive_queue);
913		skb_queue_head_init(&newsk->sk_write_queue);
914#ifdef CONFIG_NET_DMA
915		skb_queue_head_init(&newsk->sk_async_wait_queue);
916#endif
917
918		rwlock_init(&newsk->sk_dst_lock);
919		rwlock_init(&newsk->sk_callback_lock);
920		lockdep_set_class(&newsk->sk_callback_lock,
921				   af_callback_keys + newsk->sk_family);
922
923		newsk->sk_dst_cache	= NULL;
924		newsk->sk_wmem_queued	= 0;
925		newsk->sk_forward_alloc = 0;
926		newsk->sk_send_head	= NULL;
927		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
928
929		sock_reset_flag(newsk, SOCK_DONE);
930		skb_queue_head_init(&newsk->sk_error_queue);
931
932		filter = newsk->sk_filter;
933		if (filter != NULL)
934			sk_filter_charge(newsk, filter);
935
936		if (unlikely(xfrm_sk_clone_policy(newsk))) {
937			/* It is still raw copy of parent, so invalidate
938			 * destructor and make plain sk_free() */
939			newsk->sk_destruct = NULL;
940			sk_free(newsk);
941			newsk = NULL;
942			goto out;
943		}
944
945		newsk->sk_err	   = 0;
946		newsk->sk_priority = 0;
947		atomic_set(&newsk->sk_refcnt, 2);
948
949		/*
950		 * Increment the counter in the same struct proto as the master
951		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
952		 * is the same as sk->sk_prot->socks, as this field was copied
953		 * with memcpy).
954		 *
955		 * This _changes_ the previous behaviour, where
956		 * tcp_create_openreq_child always was incrementing the
957		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
958		 * to be taken into account in all callers. -acme
959		 */
960		sk_refcnt_debug_inc(newsk);
961		newsk->sk_socket = NULL;
962		newsk->sk_sleep	 = NULL;
963
964		if (newsk->sk_prot->sockets_allocated)
965			atomic_inc(newsk->sk_prot->sockets_allocated);
966	}
967out:
968	return newsk;
969}
970
971EXPORT_SYMBOL_GPL(sk_clone);
972
973void __init sk_init(void)
974{
975	if (num_physpages <= 4096) {
976		sysctl_wmem_max = 32767;
977		sysctl_rmem_max = 32767;
978		sysctl_wmem_default = 32767;
979		sysctl_rmem_default = 32767;
980	} else if (num_physpages >= 131072) {
981		sysctl_wmem_max = 131071;
982		sysctl_rmem_max = 131071;
983	}
984}
985
986/*
987 *	Simple resource managers for sockets.
988 */
989
990
991/*
992 * Write buffer destructor automatically called from kfree_skb.
993 */
994void sock_wfree(struct sk_buff *skb)
995{
996	struct sock *sk = skb->sk;
997
998	/* In case it might be waiting for more memory. */
999	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1000	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1001		sk->sk_write_space(sk);
1002	sock_put(sk);
1003}
1004
1005/*
1006 * Read buffer destructor automatically called from kfree_skb.
1007 */
1008void sock_rfree(struct sk_buff *skb)
1009{
1010	struct sock *sk = skb->sk;
1011
1012	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1013}
1014
1015
1016int sock_i_uid(struct sock *sk)
1017{
1018	int uid;
1019
1020	read_lock(&sk->sk_callback_lock);
1021	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1022	read_unlock(&sk->sk_callback_lock);
1023	return uid;
1024}
1025
1026unsigned long sock_i_ino(struct sock *sk)
1027{
1028	unsigned long ino;
1029
1030	read_lock(&sk->sk_callback_lock);
1031	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1032	read_unlock(&sk->sk_callback_lock);
1033	return ino;
1034}
1035
1036/*
1037 * Allocate a skb from the socket's send buffer.
1038 */
1039struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1040			     gfp_t priority)
1041{
1042	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1043		struct sk_buff * skb = alloc_skb(size, priority);
1044		if (skb) {
1045			skb_set_owner_w(skb, sk);
1046			return skb;
1047		}
1048	}
1049	return NULL;
1050}
1051
1052/*
1053 * Allocate a skb from the socket's receive buffer.
1054 */
1055struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1056			     gfp_t priority)
1057{
1058	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1059		struct sk_buff *skb = alloc_skb(size, priority);
1060		if (skb) {
1061			skb_set_owner_r(skb, sk);
1062			return skb;
1063		}
1064	}
1065	return NULL;
1066}
1067
1068/*
1069 * Allocate a memory block from the socket's option memory buffer.
1070 */
1071void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1072{
1073	if ((unsigned)size <= sysctl_optmem_max &&
1074	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1075		void *mem;
1076		/* First do the add, to avoid the race if kmalloc
1077		 * might sleep.
1078		 */
1079		atomic_add(size, &sk->sk_omem_alloc);
1080		mem = kmalloc(size, priority);
1081		if (mem)
1082			return mem;
1083		atomic_sub(size, &sk->sk_omem_alloc);
1084	}
1085	return NULL;
1086}
1087
/*
 * Free an option memory block.
 *
 * Frees @mem and subtracts @size from the option memory accounted to
 * @sk; @size should be the size that was charged when the block was
 * obtained from sock_kmalloc().
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
1096
1097/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1098   I think, these locks should be removed for datagram sockets.
1099 */
1100static long sock_wait_for_wmem(struct sock * sk, long timeo)
1101{
1102	DEFINE_WAIT(wait);
1103
1104	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1105	for (;;) {
1106		if (!timeo)
1107			break;
1108		if (signal_pending(current))
1109			break;
1110		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1111		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1112		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1113			break;
1114		if (sk->sk_shutdown & SEND_SHUTDOWN)
1115			break;
1116		if (sk->sk_err)
1117			break;
1118		timeo = schedule_timeout(timeo);
1119	}
1120	finish_wait(sk->sk_sleep, &wait);
1121	return timeo;
1122}
1123
1124
1125/*
1126 *	Generic send/receive buffer handlers
1127 */
1128
1129static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1130					    unsigned long header_len,
1131					    unsigned long data_len,
1132					    int noblock, int *errcode)
1133{
1134	struct sk_buff *skb;
1135	gfp_t gfp_mask;
1136	long timeo;
1137	int err;
1138
1139	gfp_mask = sk->sk_allocation;
1140	if (gfp_mask & __GFP_WAIT)
1141		gfp_mask |= __GFP_REPEAT;
1142
1143	timeo = sock_sndtimeo(sk, noblock);
1144	while (1) {
1145		err = sock_error(sk);
1146		if (err != 0)
1147			goto failure;
1148
1149		err = -EPIPE;
1150		if (sk->sk_shutdown & SEND_SHUTDOWN)
1151			goto failure;
1152
1153		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1154			skb = alloc_skb(header_len, gfp_mask);
1155			if (skb) {
1156				int npages;
1157				int i;
1158
1159				/* No pages, we're done... */
1160				if (!data_len)
1161					break;
1162
1163				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1164				skb->truesize += data_len;
1165				skb_shinfo(skb)->nr_frags = npages;
1166				for (i = 0; i < npages; i++) {
1167					struct page *page;
1168					skb_frag_t *frag;
1169
1170					page = alloc_pages(sk->sk_allocation, 0);
1171					if (!page) {
1172						err = -ENOBUFS;
1173						skb_shinfo(skb)->nr_frags = i;
1174						kfree_skb(skb);
1175						goto failure;
1176					}
1177
1178					frag = &skb_shinfo(skb)->frags[i];
1179					frag->page = page;
1180					frag->page_offset = 0;
1181					frag->size = (data_len >= PAGE_SIZE ?
1182						      PAGE_SIZE :
1183						      data_len);
1184					data_len -= PAGE_SIZE;
1185				}
1186
1187				/* Full success... */
1188				break;
1189			}
1190			err = -ENOBUFS;
1191			goto failure;
1192		}
1193		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1194		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1195		err = -EAGAIN;
1196		if (!timeo)
1197			goto failure;
1198		if (signal_pending(current))
1199			goto interrupted;
1200		timeo = sock_wait_for_wmem(sk, timeo);
1201	}
1202
1203	skb_set_owner_w(skb, sk);
1204	return skb;
1205
1206interrupted:
1207	err = sock_intr_errno(timeo);
1208failure:
1209	*errcode = err;
1210	return NULL;
1211}
1212
/*
 * Allocate a send skb with @size bytes of linear space and no paged data;
 * a thin wrapper around sock_alloc_send_pskb().  Returns NULL with
 * *@errcode set on failure.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1218
1219static void __lock_sock(struct sock *sk)
1220{
1221	DEFINE_WAIT(wait);
1222
1223	for(;;) {
1224		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1225					TASK_UNINTERRUPTIBLE);
1226		spin_unlock_bh(&sk->sk_lock.slock);
1227		schedule();
1228		spin_lock_bh(&sk->sk_lock.slock);
1229		if(!sock_owned_by_user(sk))
1230			break;
1231	}
1232	finish_wait(&sk->sk_lock.wq, &wait);
1233}
1234
1235static void __release_sock(struct sock *sk)
1236{
1237	struct sk_buff *skb = sk->sk_backlog.head;
1238
1239	do {
1240		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1241		bh_unlock_sock(sk);
1242
1243		do {
1244			struct sk_buff *next = skb->next;
1245
1246			skb->next = NULL;
1247			sk->sk_backlog_rcv(sk, skb);
1248
1249			/*
1250			 * We are in process context here with softirqs
1251			 * disabled, use cond_resched_softirq() to preempt.
1252			 * This is safe to do because we've taken the backlog
1253			 * queue private:
1254			 */
1255			cond_resched_softirq();
1256
1257			skb = next;
1258		} while (skb != NULL);
1259
1260		bh_lock_sock(sk);
1261	} while((skb = sk->sk_backlog.head) != NULL);
1262}
1263
1264/**
1265 * sk_wait_data - wait for data to arrive at sk_receive_queue
1266 * @sk:    sock to wait on
1267 * @timeo: for how long
1268 *
1269 * Now socket state including sk->sk_err is changed only under lock,
1270 * hence we may omit checks after joining wait queue.
1271 * We check receive queue before schedule() only as optimization;
1272 * it is very likely that release_sock() added new data.
1273 */
1274int sk_wait_data(struct sock *sk, long *timeo)
1275{
1276	int rc;
1277	DEFINE_WAIT(wait);
1278
1279	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1280	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1281	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1282	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1283	finish_wait(sk->sk_sleep, &wait);
1284	return rc;
1285}
1286
1287EXPORT_SYMBOL(sk_wait_data);
1288
1289/*
1290 * Set of default routines for initialising struct proto_ops when
1291 * the protocol does not support a particular function. In certain
1292 * cases where it makes no sense for a protocol to have a "do nothing"
1293 * function, some default processing is provided.
1294 */
1295
/* Default bind handler for protocols without bind support. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	/* Arguments are ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1300
/* Default connect handler for protocols without connect support. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	/* Arguments are ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1306
/* Default socketpair handler for protocols without socketpair support. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	/* Both sockets are ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1311
/* Default accept handler for protocols without accept support. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	/* Arguments are ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1316
/*
 * Default getname handler for protocols without getname support.
 * Neither @saddr nor @len is written.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1322
1323unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1324{
1325	return 0;
1326}
1327
/* Default ioctl handler for protocols without ioctl support. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	/* Every command is refused, whatever @cmd and @arg are. */
	return -EOPNOTSUPP;
}
1332
/* Default listen handler for protocols without listen support. */
int sock_no_listen(struct socket *sock, int backlog)
{
	/* @backlog is ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1337
/* Default shutdown handler for protocols without shutdown support. */
int sock_no_shutdown(struct socket *sock, int how)
{
	/* @how is ignored; the operation is always refused. */
	return -EOPNOTSUPP;
}
1342
1343int sock_no_setsockopt(struct socket *sock, int level, int optname,
1344		    char __user *optval, int optlen)
1345{
1346	return -EOPNOTSUPP;
1347}
1348
1349int sock_no_getsockopt(struct socket *sock, int level, int optname,
1350		    char __user *optval, int __user *optlen)
1351{
1352	return -EOPNOTSUPP;
1353}
1354
/* Default sendmsg handler for protocols without sendmsg support. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	/* Nothing is sent; the operation is always refused. */
	return -EOPNOTSUPP;
}
1360
/* Default recvmsg handler for protocols without recvmsg support. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	/* Nothing is received; the operation is always refused. */
	return -EOPNOTSUPP;
}
1366
/* Default mmap handler for protocols without mmap support. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/*
	 * -ENODEV rather than -EOPNOTSUPP: this mirrors the error code of a
	 * missing mmap method.
	 */
	return -ENODEV;
}
1372
1373ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1374{
1375	ssize_t res;
1376	struct msghdr msg = {.msg_flags = flags};
1377	struct kvec iov;
1378	char *kaddr = kmap(page);
1379	iov.iov_base = kaddr + offset;
1380	iov.iov_len = size;
1381	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1382	kunmap(page);
1383	return res;
1384}
1385
1386/*
1387 *	Default Socket Callbacks
1388 */
1389
1390static void sock_def_wakeup(struct sock *sk)
1391{
1392	read_lock(&sk->sk_callback_lock);
1393	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1394		wake_up_interruptible_all(sk->sk_sleep);
1395	read_unlock(&sk->sk_callback_lock);
1396}
1397
1398static void sock_def_error_report(struct sock *sk)
1399{
1400	read_lock(&sk->sk_callback_lock);
1401	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1402		wake_up_interruptible(sk->sk_sleep);
1403	sk_wake_async(sk,0,POLL_ERR);
1404	read_unlock(&sk->sk_callback_lock);
1405}
1406
1407static void sock_def_readable(struct sock *sk, int len)
1408{
1409	read_lock(&sk->sk_callback_lock);
1410	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1411		wake_up_interruptible(sk->sk_sleep);
1412	sk_wake_async(sk,1,POLL_IN);
1413	read_unlock(&sk->sk_callback_lock);
1414}
1415
1416static void sock_def_write_space(struct sock *sk)
1417{
1418	read_lock(&sk->sk_callback_lock);
1419
1420	/* Do not wake up a writer until he can make "significant"
1421	 * progress.  --DaveM
1422	 */
1423	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1424		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1425			wake_up_interruptible(sk->sk_sleep);
1426
1427		/* Should agree with poll, otherwise some programs break */
1428		if (sock_writeable(sk))
1429			sk_wake_async(sk, 2, POLL_OUT);
1430	}
1431
1432	read_unlock(&sk->sk_callback_lock);
1433}
1434
1435static void sock_def_destruct(struct sock *sk)
1436{
1437	kfree(sk->sk_protinfo);
1438}
1439
1440void sk_send_sigurg(struct sock *sk)
1441{
1442	if (sk->sk_socket && sk->sk_socket->file)
1443		if (send_sigurg(&sk->sk_socket->file->f_owner))
1444			sk_wake_async(sk, 3, POLL_PRI);
1445}
1446
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1455
/*
 * Cancel @timer if it is pending.  The reference on @sk is dropped with
 * __sock_put() only when del_timer() reports that it removed the timer.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1463
1464void sock_init_data(struct socket *sock, struct sock *sk)
1465{
1466	skb_queue_head_init(&sk->sk_receive_queue);
1467	skb_queue_head_init(&sk->sk_write_queue);
1468	skb_queue_head_init(&sk->sk_error_queue);
1469#ifdef CONFIG_NET_DMA
1470	skb_queue_head_init(&sk->sk_async_wait_queue);
1471#endif
1472
1473	sk->sk_send_head	=	NULL;
1474
1475	init_timer(&sk->sk_timer);
1476
1477	sk->sk_allocation	=	GFP_KERNEL;
1478	sk->sk_rcvbuf		=	sysctl_rmem_default;
1479	sk->sk_sndbuf		=	sysctl_wmem_default;
1480	sk->sk_state		=	TCP_CLOSE;
1481	sk->sk_socket		=	sock;
1482
1483	sock_set_flag(sk, SOCK_ZAPPED);
1484
1485	if(sock)
1486	{
1487		sk->sk_type	=	sock->type;
1488		sk->sk_sleep	=	&sock->wait;
1489		sock->sk	=	sk;
1490	} else
1491		sk->sk_sleep	=	NULL;
1492
1493	rwlock_init(&sk->sk_dst_lock);
1494	rwlock_init(&sk->sk_callback_lock);
1495	lockdep_set_class(&sk->sk_callback_lock,
1496			   af_callback_keys + sk->sk_family);
1497
1498	sk->sk_state_change	=	sock_def_wakeup;
1499	sk->sk_data_ready	=	sock_def_readable;
1500	sk->sk_write_space	=	sock_def_write_space;
1501	sk->sk_error_report	=	sock_def_error_report;
1502	sk->sk_destruct		=	sock_def_destruct;
1503
1504	sk->sk_sndmsg_page	=	NULL;
1505	sk->sk_sndmsg_off	=	0;
1506
1507	sk->sk_peercred.pid 	=	0;
1508	sk->sk_peercred.uid	=	-1;
1509	sk->sk_peercred.gid	=	-1;
1510	sk->sk_write_pending	=	0;
1511	sk->sk_rcvlowat		=	1;
1512	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1513	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1514
1515	sk->sk_stamp = ktime_set(-1L, -1L);
1516
1517	atomic_set(&sk->sk_refcnt, 1);
1518}
1519
1520void fastcall lock_sock_nested(struct sock *sk, int subclass)
1521{
1522	might_sleep();
1523	spin_lock_bh(&sk->sk_lock.slock);
1524	if (sk->sk_lock.owner)
1525		__lock_sock(sk);
1526	sk->sk_lock.owner = (void *)1;
1527	spin_unlock(&sk->sk_lock.slock);
1528	/*
1529	 * The sk_lock has mutex_lock() semantics here:
1530	 */
1531	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1532	local_bh_enable();
1533}
1534
1535EXPORT_SYMBOL(lock_sock_nested);
1536
1537void fastcall release_sock(struct sock *sk)
1538{
1539	/*
1540	 * The sk_lock has mutex_unlock() semantics:
1541	 */
1542	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1543
1544	spin_lock_bh(&sk->sk_lock.slock);
1545	if (sk->sk_backlog.tail)
1546		__release_sock(sk);
1547	sk->sk_lock.owner = NULL;
1548	if (waitqueue_active(&sk->sk_lock.wq))
1549		wake_up(&sk->sk_lock.wq);
1550	spin_unlock_bh(&sk->sk_lock.slock);
1551}
1552EXPORT_SYMBOL(release_sock);
1553
1554int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1555{
1556	struct timeval tv;
1557	if (!sock_flag(sk, SOCK_TIMESTAMP))
1558		sock_enable_timestamp(sk);
1559	tv = ktime_to_timeval(sk->sk_stamp);
1560	if (tv.tv_sec == -1)
1561		return -ENOENT;
1562	if (tv.tv_sec == 0) {
1563		sk->sk_stamp = ktime_get_real();
1564		tv = ktime_to_timeval(sk->sk_stamp);
1565	}
1566	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1567}
1568EXPORT_SYMBOL(sock_get_timestamp);
1569
1570void sock_enable_timestamp(struct sock *sk)
1571{
1572	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1573		sock_set_flag(sk, SOCK_TIMESTAMP);
1574		net_enable_timestamp();
1575	}
1576}
1577EXPORT_SYMBOL(sock_enable_timestamp);
1578
1579/*
1580 *	Get a socket option on an socket.
1581 *
1582 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1583 *	asynchronous errors should be reported by getsockopt. We assume
1584 *	this means if you specify SO_ERROR (otherwise whats the point of it).
1585 */
1586int sock_common_getsockopt(struct socket *sock, int level, int optname,
1587			   char __user *optval, int __user *optlen)
1588{
1589	struct sock *sk = sock->sk;
1590
1591	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1592}
1593
1594EXPORT_SYMBOL(sock_common_getsockopt);
1595
1596#ifdef CONFIG_COMPAT
1597int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1598				  char __user *optval, int __user *optlen)
1599{
1600	struct sock *sk = sock->sk;
1601
1602	if (sk->sk_prot->compat_getsockopt != NULL)
1603		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1604						      optval, optlen);
1605	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1606}
1607EXPORT_SYMBOL(compat_sock_common_getsockopt);
1608#endif
1609
1610int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1611			struct msghdr *msg, size_t size, int flags)
1612{
1613	struct sock *sk = sock->sk;
1614	int addr_len = 0;
1615	int err;
1616
1617	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1618				   flags & ~MSG_DONTWAIT, &addr_len);
1619	if (err >= 0)
1620		msg->msg_namelen = addr_len;
1621	return err;
1622}
1623
1624EXPORT_SYMBOL(sock_common_recvmsg);
1625
1626/*
1627 *	Set socket options on an inet socket.
1628 */
1629int sock_common_setsockopt(struct socket *sock, int level, int optname,
1630			   char __user *optval, int optlen)
1631{
1632	struct sock *sk = sock->sk;
1633
1634	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1635}
1636
1637EXPORT_SYMBOL(sock_common_setsockopt);
1638
1639#ifdef CONFIG_COMPAT
1640int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1641				  char __user *optval, int optlen)
1642{
1643	struct sock *sk = sock->sk;
1644
1645	if (sk->sk_prot->compat_setsockopt != NULL)
1646		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1647						      optval, optlen);
1648	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1649}
1650EXPORT_SYMBOL(compat_sock_common_setsockopt);
1651#endif
1652
1653void sk_common_release(struct sock *sk)
1654{
1655	if (sk->sk_prot->destroy)
1656		sk->sk_prot->destroy(sk);
1657
1658	/*
1659	 * Observation: when sock_common_release is called, processes have
1660	 * no access to socket. But net still has.
1661	 * Step one, detach it from networking:
1662	 *
1663	 * A. Remove from hash tables.
1664	 */
1665
1666	sk->sk_prot->unhash(sk);
1667
1668	/*
1669	 * In this point socket cannot receive new packets, but it is possible
1670	 * that some packets are in flight because some CPU runs receiver and
1671	 * did hash table lookup before we unhashed socket. They will achieve
1672	 * receive queue and will be purged by socket destructor.
1673	 *
1674	 * Also we still have packets pending on receive queue and probably,
1675	 * our own packets waiting in device queues. sock_destroy will drain
1676	 * receive queue, but transmitted packets will delay socket destruction
1677	 * until the last reference will be released.
1678	 */
1679
1680	sock_orphan(sk);
1681
1682	xfrm_sk_free_policy(sk);
1683
1684	sk_refcnt_debug_release(sk);
1685	sock_put(sk);
1686}
1687
1688EXPORT_SYMBOL(sk_common_release);
1689
1690static DEFINE_RWLOCK(proto_list_lock);
1691static LIST_HEAD(proto_list);
1692
1693int proto_register(struct proto *prot, int alloc_slab)
1694{
1695	char *request_sock_slab_name = NULL;
1696	char *timewait_sock_slab_name;
1697	int rc = -ENOBUFS;
1698
1699	if (alloc_slab) {
1700		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1701					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1702
1703		if (prot->slab == NULL) {
1704			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1705			       prot->name);
1706			goto out;
1707		}
1708
1709		if (prot->rsk_prot != NULL) {
1710			static const char mask[] = "request_sock_%s";
1711
1712			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1713			if (request_sock_slab_name == NULL)
1714				goto out_free_sock_slab;
1715
1716			sprintf(request_sock_slab_name, mask, prot->name);
1717			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1718								 prot->rsk_prot->obj_size, 0,
1719								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1720
1721			if (prot->rsk_prot->slab == NULL) {
1722				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1723				       prot->name);
1724				goto out_free_request_sock_slab_name;
1725			}
1726		}
1727
1728		if (prot->twsk_prot != NULL) {
1729			static const char mask[] = "tw_sock_%s";
1730
1731			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1732
1733			if (timewait_sock_slab_name == NULL)
1734				goto out_free_request_sock_slab;
1735
1736			sprintf(timewait_sock_slab_name, mask, prot->name);
1737			prot->twsk_prot->twsk_slab =
1738				kmem_cache_create(timewait_sock_slab_name,
1739						  prot->twsk_prot->twsk_obj_size,
1740						  0, SLAB_HWCACHE_ALIGN,
1741						  NULL, NULL);
1742			if (prot->twsk_prot->twsk_slab == NULL)
1743				goto out_free_timewait_sock_slab_name;
1744		}
1745	}
1746
1747	write_lock(&proto_list_lock);
1748	list_add(&prot->node, &proto_list);
1749	write_unlock(&proto_list_lock);
1750	rc = 0;
1751out:
1752	return rc;
1753out_free_timewait_sock_slab_name:
1754	kfree(timewait_sock_slab_name);
1755out_free_request_sock_slab:
1756	if (prot->rsk_prot && prot->rsk_prot->slab) {
1757		kmem_cache_destroy(prot->rsk_prot->slab);
1758		prot->rsk_prot->slab = NULL;
1759	}
1760out_free_request_sock_slab_name:
1761	kfree(request_sock_slab_name);
1762out_free_sock_slab:
1763	kmem_cache_destroy(prot->slab);
1764	prot->slab = NULL;
1765	goto out;
1766}
1767
1768EXPORT_SYMBOL(proto_register);
1769
1770void proto_unregister(struct proto *prot)
1771{
1772	write_lock(&proto_list_lock);
1773	list_del(&prot->node);
1774	write_unlock(&proto_list_lock);
1775
1776	if (prot->slab != NULL) {
1777		kmem_cache_destroy(prot->slab);
1778		prot->slab = NULL;
1779	}
1780
1781	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1782		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1783
1784		kmem_cache_destroy(prot->rsk_prot->slab);
1785		kfree(name);
1786		prot->rsk_prot->slab = NULL;
1787	}
1788
1789	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1790		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1791
1792		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1793		kfree(name);
1794		prot->twsk_prot->twsk_slab = NULL;
1795	}
1796}
1797
1798EXPORT_SYMBOL(proto_unregister);
1799
1800#ifdef CONFIG_PROC_FS
1801static inline struct proto *__proto_head(void)
1802{
1803	return list_entry(proto_list.next, struct proto, node);
1804}
1805
1806static inline struct proto *proto_head(void)
1807{
1808	return list_empty(&proto_list) ? NULL : __proto_head();
1809}
1810
1811static inline struct proto *proto_next(struct proto *proto)
1812{
1813	return proto->node.next == &proto_list ? NULL :
1814		list_entry(proto->node.next, struct proto, node);
1815}
1816
1817static inline struct proto *proto_get_idx(loff_t pos)
1818{
1819	struct proto *proto;
1820	loff_t i = 0;
1821
1822	list_for_each_entry(proto, &proto_list, node)
1823		if (i++ == pos)
1824			goto out;
1825
1826	proto = NULL;
1827out:
1828	return proto;
1829}
1830
1831static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1832{
1833	read_lock(&proto_list_lock);
1834	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1835}
1836
1837static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1838{
1839	++*pos;
1840	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1841}
1842
1843static void proto_seq_stop(struct seq_file *seq, void *v)
1844{
1845	read_unlock(&proto_list_lock);
1846}
1847
/* 'y' when @method is a non-NULL pointer, 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
1852
1853static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1854{
1855	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1856			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1857		   proto->name,
1858		   proto->obj_size,
1859		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1860		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1861		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1862		   proto->max_header,
1863		   proto->slab == NULL ? "no" : "yes",
1864		   module_name(proto->owner),
1865		   proto_method_implemented(proto->close),
1866		   proto_method_implemented(proto->connect),
1867		   proto_method_implemented(proto->disconnect),
1868		   proto_method_implemented(proto->accept),
1869		   proto_method_implemented(proto->ioctl),
1870		   proto_method_implemented(proto->init),
1871		   proto_method_implemented(proto->destroy),
1872		   proto_method_implemented(proto->shutdown),
1873		   proto_method_implemented(proto->setsockopt),
1874		   proto_method_implemented(proto->getsockopt),
1875		   proto_method_implemented(proto->sendmsg),
1876		   proto_method_implemented(proto->recvmsg),
1877		   proto_method_implemented(proto->sendpage),
1878		   proto_method_implemented(proto->bind),
1879		   proto_method_implemented(proto->backlog_rcv),
1880		   proto_method_implemented(proto->hash),
1881		   proto_method_implemented(proto->unhash),
1882		   proto_method_implemented(proto->get_port),
1883		   proto_method_implemented(proto->enter_memory_pressure));
1884}
1885
1886static int proto_seq_show(struct seq_file *seq, void *v)
1887{
1888	if (v == SEQ_START_TOKEN)
1889		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1890			   "protocol",
1891			   "size",
1892			   "sockets",
1893			   "memory",
1894			   "press",
1895			   "maxhdr",
1896			   "slab",
1897			   "module",
1898			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1899	else
1900		proto_seq_printf(seq, v);
1901	return 0;
1902}
1903
1904static struct seq_operations proto_seq_ops = {
1905	.start  = proto_seq_start,
1906	.next   = proto_seq_next,
1907	.stop   = proto_seq_stop,
1908	.show   = proto_seq_show,
1909};
1910
1911static int proto_seq_open(struct inode *inode, struct file *file)
1912{
1913	return seq_open(file, &proto_seq_ops);
1914}
1915
1916static const struct file_operations proto_seq_fops = {
1917	.owner		= THIS_MODULE,
1918	.open		= proto_seq_open,
1919	.read		= seq_read,
1920	.llseek		= seq_lseek,
1921	.release	= seq_release,
1922};
1923
1924static int __init proto_init(void)
1925{
1926	/* register /proc/net/protocols */
1927	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1928}
1929
1930subsys_initcall(proto_init);
1931
1932#endif /* PROC_FS */
1933
/*
 * Exports for the generic socket helpers of this file that do not carry
 * an EXPORT_SYMBOL() right after their definition.
 */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
/* The rmem/wmem maximum sysctls are exported only with CONFIG_SYSCTL. */
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
1967