sock.c revision e71a4783aae059931f63b2d4e7013e36529badef
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140#ifdef CONFIG_DEBUG_LOCK_ALLOC
141/*
142 * Make lock validator output more readable. (we pre-construct these
143 * strings at build time, so that runtime initialization of socket
144 * locks is fast):
145 */
146static const char *af_family_key_strings[AF_MAX+1] = {
147  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
148  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
149  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
150  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
151  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
152  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
153  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
154  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
155  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
156  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
157  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
158};
159static const char *af_family_slock_key_strings[AF_MAX+1] = {
160  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
161  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
162  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
163  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
164  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
165  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
166  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
167  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
168  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
169  "slock-27"       , "slock-28"          , "slock-29"          ,
170  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
171};
172#endif
173
174/*
175 * sk_callback_lock locking rules are per-address-family,
176 * so split the lock classes by using a per-AF key:
177 */
178static struct lock_class_key af_callback_keys[AF_MAX];
179
180/* Take into consideration the size of the struct sk_buff overhead in the
181 * determination of these values, since that is non-constant across
182 * platforms.  This makes socket queueing behavior and performance
183 * not depend upon such differences.
184 */
185#define _SK_MEM_PACKETS		256
186#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
187#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
188#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
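/* For a rough sense of scale: assuming a hypothetical sizeof(struct sk_buff)
 * of about 240 bytes, _SK_MEM_OVERHEAD comes to roughly 240 + 256 = 496
 * bytes, so SK_WMEM_MAX = SK_RMEM_MAX ~= 496 * 256 = 126976 bytes (about
 * 124 KiB).  The exact figure varies with the platform's sk_buff layout,
 * which is precisely why the overhead is computed from sizeof() here.
 */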
189
190/* Run time adjustable parameters. */
191__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
192__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
193__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
194__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
195
196/* Maximal space eaten by iovec or ancillary data plus some space */
197int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
198
199static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
200{
201	struct timeval tv;
202
203	if (optlen < sizeof(tv))
204		return -EINVAL;
205	if (copy_from_user(&tv, optval, sizeof(tv)))
206		return -EFAULT;
207
208	*timeo_p = MAX_SCHEDULE_TIMEOUT;
209	if (tv.tv_sec == 0 && tv.tv_usec == 0)
210		return 0;
211	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
212		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
213	return 0;
214}
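/* Worked example, assuming HZ=1000: a timeval of { .tv_sec = 2,
 * .tv_usec = 500000 } yields *timeo_p = 2*1000 + (500000 + 999)/1000 =
 * 2500 jiffies; any fraction of a jiffy in tv_usec is rounded up.  A
 * zero timeval leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, i.e. "wait
 * forever".
 */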
215
216static void sock_warn_obsolete_bsdism(const char *name)
217{
218	static int warned;
219	static char warncomm[TASK_COMM_LEN];
220	if (strcmp(warncomm, current->comm) && warned < 5) {
221		strcpy(warncomm,  current->comm);
222		printk(KERN_WARNING "process `%s' is using obsolete "
223		       "%s SO_BSDCOMPAT\n", warncomm, name);
224		warned++;
225	}
226}
227
228static void sock_disable_timestamp(struct sock *sk)
229{
230	if (sock_flag(sk, SOCK_TIMESTAMP)) {
231		sock_reset_flag(sk, SOCK_TIMESTAMP);
232		net_disable_timestamp();
233	}
234}
235
236
237int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
238{
239	int err = 0;
240	int skb_len;
241
242	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
243	   number of warnings when compiling with -W --ANK
244	 */
245	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
246	    (unsigned)sk->sk_rcvbuf) {
247		err = -ENOMEM;
248		goto out;
249	}
250
251	err = sk_filter(sk, skb);
252	if (err)
253		goto out;
254
255	skb->dev = NULL;
256	skb_set_owner_r(skb, sk);
257
258	/* Cache the SKB length before we tack it onto the receive
259	 * queue.  Once it is added it no longer belongs to us and
260	 * may be freed by other threads of control pulling packets
261	 * from the queue.
262	 */
263	skb_len = skb->len;
264
265	skb_queue_tail(&sk->sk_receive_queue, skb);
266
267	if (!sock_flag(sk, SOCK_DEAD))
268		sk->sk_data_ready(sk, skb_len);
269out:
270	return err;
271}
272EXPORT_SYMBOL(sock_queue_rcv_skb);
273
274int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
275{
276	int rc = NET_RX_SUCCESS;
277
278	if (sk_filter(sk, skb))
279		goto discard_and_relse;
280
281	skb->dev = NULL;
282
283	if (nested)
284		bh_lock_sock_nested(sk);
285	else
286		bh_lock_sock(sk);
287	if (!sock_owned_by_user(sk)) {
288		/*
289		 * trylock + unlock semantics:
290		 */
291		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
292
293		rc = sk->sk_backlog_rcv(sk, skb);
294
295		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
296	} else
297		sk_add_backlog(sk, skb);
298	bh_unlock_sock(sk);
299out:
300	sock_put(sk);
301	return rc;
302discard_and_relse:
303	kfree_skb(skb);
304	goto out;
305}
306EXPORT_SYMBOL(sk_receive_skb);
307
308struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
309{
310	struct dst_entry *dst = sk->sk_dst_cache;
311
312	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
313		sk->sk_dst_cache = NULL;
314		dst_release(dst);
315		return NULL;
316	}
317
318	return dst;
319}
320EXPORT_SYMBOL(__sk_dst_check);
321
322struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
323{
324	struct dst_entry *dst = sk_dst_get(sk);
325
326	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
327		sk_dst_reset(sk);
328		dst_release(dst);
329		return NULL;
330	}
331
332	return dst;
333}
334EXPORT_SYMBOL(sk_dst_check);
335
336/*
337 *	This is meant for all protocols to use and covers goings on
338 *	at the socket level. Everything here is generic.
339 */
340
341int sock_setsockopt(struct socket *sock, int level, int optname,
342		    char __user *optval, int optlen)
343{
344	struct sock *sk=sock->sk;
345	struct sk_filter *filter;
346	int val;
347	int valbool;
348	struct linger ling;
349	int ret = 0;
350
351	/*
352	 *	Options without arguments
353	 */
354
355#ifdef SO_DONTLINGER		/* Compatibility item... */
356	if (optname == SO_DONTLINGER) {
357		lock_sock(sk);
358		sock_reset_flag(sk, SOCK_LINGER);
359		release_sock(sk);
360		return 0;
361	}
362#endif
363
364	if (optlen < sizeof(int))
365		return -EINVAL;
366
367	if (get_user(val, (int __user *)optval))
368		return -EFAULT;
369
370	valbool = val?1:0;
371
372	lock_sock(sk);
373
374	switch(optname) {
375	case SO_DEBUG:
376		if (val && !capable(CAP_NET_ADMIN)) {
377			ret = -EACCES;
378		}
379		else if (valbool)
380			sock_set_flag(sk, SOCK_DBG);
381		else
382			sock_reset_flag(sk, SOCK_DBG);
383		break;
384	case SO_REUSEADDR:
385		sk->sk_reuse = valbool;
386		break;
387	case SO_TYPE:
388	case SO_ERROR:
389		ret = -ENOPROTOOPT;
390		break;
391	case SO_DONTROUTE:
392		if (valbool)
393			sock_set_flag(sk, SOCK_LOCALROUTE);
394		else
395			sock_reset_flag(sk, SOCK_LOCALROUTE);
396		break;
397	case SO_BROADCAST:
398		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
399		break;
400	case SO_SNDBUF:
401		/* Don't error on this; BSD doesn't, and if you think
402		   about it this is right. Otherwise apps have to
403		   play 'guess the biggest size' games. RCVBUF/SNDBUF
404		   are treated in BSD as hints */
405
406		if (val > sysctl_wmem_max)
407			val = sysctl_wmem_max;
408set_sndbuf:
409		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
410		if ((val * 2) < SOCK_MIN_SNDBUF)
411			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
412		else
413			sk->sk_sndbuf = val * 2;
414
415		/*
416		 *	Wake up sending tasks if we
417		 *	upped the value.
418		 */
419		sk->sk_write_space(sk);
420		break;
421
422	case SO_SNDBUFFORCE:
423		if (!capable(CAP_NET_ADMIN)) {
424			ret = -EPERM;
425			break;
426		}
427		goto set_sndbuf;
428
429	case SO_RCVBUF:
430		/* Don't error on this; BSD doesn't, and if you think
431		   about it this is right. Otherwise apps have to
432		   play 'guess the biggest size' games. RCVBUF/SNDBUF
433		   are treated in BSD as hints */
434
435		if (val > sysctl_rmem_max)
436			val = sysctl_rmem_max;
437set_rcvbuf:
438		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
439		/*
440		 * We double it on the way in to account for
441		 * "struct sk_buff" etc. overhead.   Applications
442		 * assume that the SO_RCVBUF setting they make will
443		 * allow that much actual data to be received on that
444		 * socket.
445		 *
446		 * Applications are unaware that "struct sk_buff" and
447		 * other overheads allocate from the receive buffer
448		 * during socket buffer allocation.
449		 *
450		 * And after considering the possible alternatives,
451		 * returning the value we actually used in getsockopt
452		 * is the most desirable behavior.
453		 */
454		if ((val * 2) < SOCK_MIN_RCVBUF)
455			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
456		else
457			sk->sk_rcvbuf = val * 2;
458		break;
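	/* Illustrative effect of the doubling above: a userspace call such as
	 *
	 *	int val = 65536;
	 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	 *
	 * leaves sk->sk_rcvbuf at 131072 (assuming 65536 does not exceed
	 * sysctl_rmem_max), and a later getsockopt(SO_RCVBUF) reports the
	 * doubled value.
	 */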
459
460	case SO_RCVBUFFORCE:
461		if (!capable(CAP_NET_ADMIN)) {
462			ret = -EPERM;
463			break;
464		}
465		goto set_rcvbuf;
466
467	case SO_KEEPALIVE:
468#ifdef CONFIG_INET
469		if (sk->sk_protocol == IPPROTO_TCP)
470			tcp_set_keepalive(sk, valbool);
471#endif
472		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
473		break;
474
475	case SO_OOBINLINE:
476		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
477		break;
478
479	case SO_NO_CHECK:
480		sk->sk_no_check = valbool;
481		break;
482
483	case SO_PRIORITY:
484		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
485			sk->sk_priority = val;
486		else
487			ret = -EPERM;
488		break;
489
490	case SO_LINGER:
491		if (optlen < sizeof(ling)) {
492			ret = -EINVAL;	/* 1003.1g */
493			break;
494		}
495		if (copy_from_user(&ling,optval,sizeof(ling))) {
496			ret = -EFAULT;
497			break;
498		}
499		if (!ling.l_onoff)
500			sock_reset_flag(sk, SOCK_LINGER);
501		else {
502#if (BITS_PER_LONG == 32)
503			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
504				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
505			else
506#endif
507				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
508			sock_set_flag(sk, SOCK_LINGER);
509		}
510		break;
511
512	case SO_BSDCOMPAT:
513		sock_warn_obsolete_bsdism("setsockopt");
514		break;
515
516	case SO_PASSCRED:
517		if (valbool)
518			set_bit(SOCK_PASSCRED, &sock->flags);
519		else
520			clear_bit(SOCK_PASSCRED, &sock->flags);
521		break;
522
523	case SO_TIMESTAMP:
524		if (valbool)  {
525			sock_set_flag(sk, SOCK_RCVTSTAMP);
526			sock_enable_timestamp(sk);
527		} else
528			sock_reset_flag(sk, SOCK_RCVTSTAMP);
529		break;
530
531	case SO_RCVLOWAT:
532		if (val < 0)
533			val = INT_MAX;
534		sk->sk_rcvlowat = val ? : 1;
535		break;
536
537	case SO_RCVTIMEO:
538		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
539		break;
540
541	case SO_SNDTIMEO:
542		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
543		break;
544
545#ifdef CONFIG_NETDEVICES
546	case SO_BINDTODEVICE:
547	{
548		char devname[IFNAMSIZ];
549
550		/* Sorry... */
551		if (!capable(CAP_NET_RAW)) {
552			ret = -EPERM;
553			break;
554		}
555
556		/* Bind this socket to a particular device like "eth0",
557		 * as specified in the passed interface name. If the
558		 * name is "" or the option length is zero the socket
559		 * is not bound.
560		 */
561
562		if (!valbool) {
563			sk->sk_bound_dev_if = 0;
564		} else {
565			if (optlen > IFNAMSIZ - 1)
566				optlen = IFNAMSIZ - 1;
567			memset(devname, 0, sizeof(devname));
568			if (copy_from_user(devname, optval, optlen)) {
569				ret = -EFAULT;
570				break;
571			}
572
573			/* Remove any cached route for this socket. */
574			sk_dst_reset(sk);
575
576			if (devname[0] == '\0') {
577				sk->sk_bound_dev_if = 0;
578			} else {
579				struct net_device *dev = dev_get_by_name(devname);
580				if (!dev) {
581					ret = -ENODEV;
582					break;
583				}
584				sk->sk_bound_dev_if = dev->ifindex;
585				dev_put(dev);
586			}
587		}
588		break;
589	}
590#endif
591
592
593	case SO_ATTACH_FILTER:
594		ret = -EINVAL;
595		if (optlen == sizeof(struct sock_fprog)) {
596			struct sock_fprog fprog;
597
598			ret = -EFAULT;
599			if (copy_from_user(&fprog, optval, sizeof(fprog)))
600				break;
601
602			ret = sk_attach_filter(&fprog, sk);
603		}
604		break;
605
606	case SO_DETACH_FILTER:
607		rcu_read_lock_bh();
608		filter = rcu_dereference(sk->sk_filter);
609		if (filter) {
610			rcu_assign_pointer(sk->sk_filter, NULL);
611			sk_filter_release(sk, filter);
612			rcu_read_unlock_bh();
613			break;
614		}
615		rcu_read_unlock_bh();
616		ret = -ENONET;
617		break;
618
619	case SO_PASSSEC:
620		if (valbool)
621			set_bit(SOCK_PASSSEC, &sock->flags);
622		else
623			clear_bit(SOCK_PASSSEC, &sock->flags);
624		break;
625
626		/* We implement the SO_SNDLOWAT etc to
627		   not be settable (1003.1g 5.3) */
628	default:
629		ret = -ENOPROTOOPT;
630		break;
631	}
632	release_sock(sk);
633	return ret;
634}
635
636
637int sock_getsockopt(struct socket *sock, int level, int optname,
638		    char __user *optval, int __user *optlen)
639{
640	struct sock *sk = sock->sk;
641
642	union {
643		int val;
644		struct linger ling;
645		struct timeval tm;
646	} v;
647
648	unsigned int lv = sizeof(int);
649	int len;
650
651	if (get_user(len, optlen))
652		return -EFAULT;
653	if (len < 0)
654		return -EINVAL;
655
656	switch(optname) {
657	case SO_DEBUG:
658		v.val = sock_flag(sk, SOCK_DBG);
659		break;
660
661	case SO_DONTROUTE:
662		v.val = sock_flag(sk, SOCK_LOCALROUTE);
663		break;
664
665	case SO_BROADCAST:
666		v.val = !!sock_flag(sk, SOCK_BROADCAST);
667		break;
668
669	case SO_SNDBUF:
670		v.val = sk->sk_sndbuf;
671		break;
672
673	case SO_RCVBUF:
674		v.val = sk->sk_rcvbuf;
675		break;
676
677	case SO_REUSEADDR:
678		v.val = sk->sk_reuse;
679		break;
680
681	case SO_KEEPALIVE:
682		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
683		break;
684
685	case SO_TYPE:
686		v.val = sk->sk_type;
687		break;
688
689	case SO_ERROR:
690		v.val = -sock_error(sk);
691		if (v.val==0)
692			v.val = xchg(&sk->sk_err_soft, 0);
693		break;
694
695	case SO_OOBINLINE:
696		v.val = !!sock_flag(sk, SOCK_URGINLINE);
697		break;
698
699	case SO_NO_CHECK:
700		v.val = sk->sk_no_check;
701		break;
702
703	case SO_PRIORITY:
704		v.val = sk->sk_priority;
705		break;
706
707	case SO_LINGER:
708		lv		= sizeof(v.ling);
709		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
710		v.ling.l_linger	= sk->sk_lingertime / HZ;
711		break;
712
713	case SO_BSDCOMPAT:
714		sock_warn_obsolete_bsdism("getsockopt");
715		break;
716
717	case SO_TIMESTAMP:
718		v.val = sock_flag(sk, SOCK_RCVTSTAMP);
719		break;
720
721	case SO_RCVTIMEO:
722		lv=sizeof(struct timeval);
723		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
724			v.tm.tv_sec = 0;
725			v.tm.tv_usec = 0;
726		} else {
727			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
728			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
729		}
730		break;
731
732	case SO_SNDTIMEO:
733		lv=sizeof(struct timeval);
734		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
735			v.tm.tv_sec = 0;
736			v.tm.tv_usec = 0;
737		} else {
738			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
739			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
740		}
741		break;
742
743	case SO_RCVLOWAT:
744		v.val = sk->sk_rcvlowat;
745		break;
746
747	case SO_SNDLOWAT:
748		v.val=1;
749		break;
750
751	case SO_PASSCRED:
752		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
753		break;
754
755	case SO_PEERCRED:
756		if (len > sizeof(sk->sk_peercred))
757			len = sizeof(sk->sk_peercred);
758		if (copy_to_user(optval, &sk->sk_peercred, len))
759			return -EFAULT;
760		goto lenout;
761
762	case SO_PEERNAME:
763	{
764		char address[128];
765
766		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
767			return -ENOTCONN;
768		if (lv < len)
769			return -EINVAL;
770		if (copy_to_user(optval, address, len))
771			return -EFAULT;
772		goto lenout;
773	}
774
775	/* Dubious BSD thing... Probably nobody even uses it, but
776	 * the UNIX standard wants it for whatever reason... -DaveM
777	 */
778	case SO_ACCEPTCONN:
779		v.val = sk->sk_state == TCP_LISTEN;
780		break;
781
782	case SO_PASSSEC:
783		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
784		break;
785
786	case SO_PEERSEC:
787		return security_socket_getpeersec_stream(sock, optval, optlen, len);
788
789	default:
790		return -ENOPROTOOPT;
791	}
792
793	if (len > lv)
794		len = lv;
795	if (copy_to_user(optval, &v, len))
796		return -EFAULT;
797lenout:
798	if (put_user(len, optlen))
799		return -EFAULT;
800	return 0;
801}
802
803/*
804 * Initialize an sk_lock.
805 *
806 * (We also register the sk_lock with the lock validator.)
807 */
808static inline void sock_lock_init(struct sock *sk)
809{
810	sock_lock_init_class_and_name(sk,
811			af_family_slock_key_strings[sk->sk_family],
812			af_family_slock_keys + sk->sk_family,
813			af_family_key_strings[sk->sk_family],
814			af_family_keys + sk->sk_family);
815}
816
817/**
818 *	sk_alloc - All socket objects are allocated here
819 *	@family: protocol family
820 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
821 *	@prot: struct proto associated with this new sock instance
822 *	@zero_it: if we should zero the newly allocated sock
823 */
824struct sock *sk_alloc(int family, gfp_t priority,
825		      struct proto *prot, int zero_it)
826{
827	struct sock *sk = NULL;
828	struct kmem_cache *slab = prot->slab;
829
830	if (slab != NULL)
831		sk = kmem_cache_alloc(slab, priority);
832	else
833		sk = kmalloc(prot->obj_size, priority);
834
835	if (sk) {
836		if (zero_it) {
837			memset(sk, 0, prot->obj_size);
838			sk->sk_family = family;
839			/*
840			 * See comment in struct sock definition to understand
841			 * why we need sk_prot_creator -acme
842			 */
843			sk->sk_prot = sk->sk_prot_creator = prot;
844			sock_lock_init(sk);
845		}
846
847		if (security_sk_alloc(sk, family, priority))
848			goto out_free;
849
850		if (!try_module_get(prot->owner))
851			goto out_free;
852	}
853	return sk;
854
855out_free:
856	if (slab != NULL)
857		kmem_cache_free(slab, sk);
858	else
859		kfree(sk);
860	return NULL;
861}
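/* Minimal usage sketch: a protocol's create routine typically allocates and
 * initializes its sock roughly as
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where "my_proto" is a hypothetical struct proto that has been registered
 * with proto_register().  The matching teardown path ends in sk_free() once
 * the last reference to the sock is dropped.
 */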
862
863void sk_free(struct sock *sk)
864{
865	struct sk_filter *filter;
866	struct module *owner = sk->sk_prot_creator->owner;
867
868	if (sk->sk_destruct)
869		sk->sk_destruct(sk);
870
871	filter = rcu_dereference(sk->sk_filter);
872	if (filter) {
873		sk_filter_release(sk, filter);
874		rcu_assign_pointer(sk->sk_filter, NULL);
875	}
876
877	sock_disable_timestamp(sk);
878
879	if (atomic_read(&sk->sk_omem_alloc))
880		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
881		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
882
883	security_sk_free(sk);
884	if (sk->sk_prot_creator->slab != NULL)
885		kmem_cache_free(sk->sk_prot_creator->slab, sk);
886	else
887		kfree(sk);
888	module_put(owner);
889}
890
891struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
892{
893	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
894
895	if (newsk != NULL) {
896		struct sk_filter *filter;
897
898		sock_copy(newsk, sk);
899
900		/* SANITY */
901		sk_node_init(&newsk->sk_node);
902		sock_lock_init(newsk);
903		bh_lock_sock(newsk);
904		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
905
906		atomic_set(&newsk->sk_rmem_alloc, 0);
907		atomic_set(&newsk->sk_wmem_alloc, 0);
908		atomic_set(&newsk->sk_omem_alloc, 0);
909		skb_queue_head_init(&newsk->sk_receive_queue);
910		skb_queue_head_init(&newsk->sk_write_queue);
911#ifdef CONFIG_NET_DMA
912		skb_queue_head_init(&newsk->sk_async_wait_queue);
913#endif
914
915		rwlock_init(&newsk->sk_dst_lock);
916		rwlock_init(&newsk->sk_callback_lock);
917		lockdep_set_class(&newsk->sk_callback_lock,
918				   af_callback_keys + newsk->sk_family);
919
920		newsk->sk_dst_cache	= NULL;
921		newsk->sk_wmem_queued	= 0;
922		newsk->sk_forward_alloc = 0;
923		newsk->sk_send_head	= NULL;
924		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
925
926		sock_reset_flag(newsk, SOCK_DONE);
927		skb_queue_head_init(&newsk->sk_error_queue);
928
929		filter = newsk->sk_filter;
930		if (filter != NULL)
931			sk_filter_charge(newsk, filter);
932
933		if (unlikely(xfrm_sk_clone_policy(newsk))) {
934			/* It is still a raw copy of the parent, so invalidate
935			 * the destructor and do a plain sk_free() */
936			newsk->sk_destruct = NULL;
937			sk_free(newsk);
938			newsk = NULL;
939			goto out;
940		}
941
942		newsk->sk_err	   = 0;
943		newsk->sk_priority = 0;
944		atomic_set(&newsk->sk_refcnt, 2);
945
946		/*
947		 * Increment the counter in the same struct proto as the master
948		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
949		 * is the same as sk->sk_prot->socks, as this field was copied
950		 * with memcpy).
951		 *
952		 * This _changes_ the previous behaviour, where
953		 * tcp_create_openreq_child always was incrementing the
954		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
955		 * to be taken into account in all callers. -acme
956		 */
957		sk_refcnt_debug_inc(newsk);
958		newsk->sk_socket = NULL;
959		newsk->sk_sleep	 = NULL;
960
961		if (newsk->sk_prot->sockets_allocated)
962			atomic_inc(newsk->sk_prot->sockets_allocated);
963	}
964out:
965	return newsk;
966}
967
968EXPORT_SYMBOL_GPL(sk_clone);
969
970void __init sk_init(void)
971{
972	if (num_physpages <= 4096) {
973		sysctl_wmem_max = 32767;
974		sysctl_rmem_max = 32767;
975		sysctl_wmem_default = 32767;
976		sysctl_rmem_default = 32767;
977	} else if (num_physpages >= 131072) {
978		sysctl_wmem_max = 131071;
979		sysctl_rmem_max = 131071;
980	}
981}
982
983/*
984 *	Simple resource managers for sockets.
985 */
986
987
988/*
989 * Write buffer destructor automatically called from kfree_skb.
990 */
991void sock_wfree(struct sk_buff *skb)
992{
993	struct sock *sk = skb->sk;
994
995	/* In case it might be waiting for more memory. */
996	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
997	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
998		sk->sk_write_space(sk);
999	sock_put(sk);
1000}
1001
1002/*
1003 * Read buffer destructor automatically called from kfree_skb.
1004 */
1005void sock_rfree(struct sk_buff *skb)
1006{
1007	struct sock *sk = skb->sk;
1008
1009	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1010}
1011
1012
1013int sock_i_uid(struct sock *sk)
1014{
1015	int uid;
1016
1017	read_lock(&sk->sk_callback_lock);
1018	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1019	read_unlock(&sk->sk_callback_lock);
1020	return uid;
1021}
1022
1023unsigned long sock_i_ino(struct sock *sk)
1024{
1025	unsigned long ino;
1026
1027	read_lock(&sk->sk_callback_lock);
1028	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1029	read_unlock(&sk->sk_callback_lock);
1030	return ino;
1031}
1032
1033/*
1034 * Allocate a skb from the socket's send buffer.
1035 */
1036struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1037			     gfp_t priority)
1038{
1039	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1040		struct sk_buff * skb = alloc_skb(size, priority);
1041		if (skb) {
1042			skb_set_owner_w(skb, sk);
1043			return skb;
1044		}
1045	}
1046	return NULL;
1047}
1048
1049/*
1050 * Allocate a skb from the socket's receive buffer.
1051 */
1052struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1053			     gfp_t priority)
1054{
1055	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1056		struct sk_buff *skb = alloc_skb(size, priority);
1057		if (skb) {
1058			skb_set_owner_r(skb, sk);
1059			return skb;
1060		}
1061	}
1062	return NULL;
1063}
1064
1065/*
1066 * Allocate a memory block from the socket's option memory buffer.
1067 */
1068void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1069{
1070	if ((unsigned)size <= sysctl_optmem_max &&
1071	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1072		void *mem;
1073		/* First do the add, to avoid the race if kmalloc
1074		 * might sleep.
1075		 */
1076		atomic_add(size, &sk->sk_omem_alloc);
1077		mem = kmalloc(size, priority);
1078		if (mem)
1079			return mem;
1080		atomic_sub(size, &sk->sk_omem_alloc);
1081	}
1082	return NULL;
1083}
1084
1085/*
1086 * Free an option memory block.
1087 */
1088void sock_kfree_s(struct sock *sk, void *mem, int size)
1089{
1090	kfree(mem);
1091	atomic_sub(size, &sk->sk_omem_alloc);
1092}
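/* Typical pairing of the two helpers above (illustrative):
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * The caller must hand sock_kfree_s() the same size it passed to
 * sock_kmalloc(), since that size is what gets subtracted back from
 * sk->sk_omem_alloc.
 */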
1093
1094/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1095   I think these locks should be removed for datagram sockets.
1096 */
1097static long sock_wait_for_wmem(struct sock * sk, long timeo)
1098{
1099	DEFINE_WAIT(wait);
1100
1101	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1102	for (;;) {
1103		if (!timeo)
1104			break;
1105		if (signal_pending(current))
1106			break;
1107		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1108		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1109		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1110			break;
1111		if (sk->sk_shutdown & SEND_SHUTDOWN)
1112			break;
1113		if (sk->sk_err)
1114			break;
1115		timeo = schedule_timeout(timeo);
1116	}
1117	finish_wait(sk->sk_sleep, &wait);
1118	return timeo;
1119}
1120
1121
1122/*
1123 *	Generic send/receive buffer handlers
1124 */
1125
1126static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1127					    unsigned long header_len,
1128					    unsigned long data_len,
1129					    int noblock, int *errcode)
1130{
1131	struct sk_buff *skb;
1132	gfp_t gfp_mask;
1133	long timeo;
1134	int err;
1135
1136	gfp_mask = sk->sk_allocation;
1137	if (gfp_mask & __GFP_WAIT)
1138		gfp_mask |= __GFP_REPEAT;
1139
1140	timeo = sock_sndtimeo(sk, noblock);
1141	while (1) {
1142		err = sock_error(sk);
1143		if (err != 0)
1144			goto failure;
1145
1146		err = -EPIPE;
1147		if (sk->sk_shutdown & SEND_SHUTDOWN)
1148			goto failure;
1149
1150		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1151			skb = alloc_skb(header_len, gfp_mask);
1152			if (skb) {
1153				int npages;
1154				int i;
1155
1156				/* No pages, we're done... */
1157				if (!data_len)
1158					break;
1159
1160				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1161				skb->truesize += data_len;
1162				skb_shinfo(skb)->nr_frags = npages;
1163				for (i = 0; i < npages; i++) {
1164					struct page *page;
1165					skb_frag_t *frag;
1166
1167					page = alloc_pages(sk->sk_allocation, 0);
1168					if (!page) {
1169						err = -ENOBUFS;
1170						skb_shinfo(skb)->nr_frags = i;
1171						kfree_skb(skb);
1172						goto failure;
1173					}
1174
1175					frag = &skb_shinfo(skb)->frags[i];
1176					frag->page = page;
1177					frag->page_offset = 0;
1178					frag->size = (data_len >= PAGE_SIZE ?
1179						      PAGE_SIZE :
1180						      data_len);
1181					data_len -= PAGE_SIZE;
1182				}
1183
1184				/* Full success... */
1185				break;
1186			}
1187			err = -ENOBUFS;
1188			goto failure;
1189		}
1190		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1191		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1192		err = -EAGAIN;
1193		if (!timeo)
1194			goto failure;
1195		if (signal_pending(current))
1196			goto interrupted;
1197		timeo = sock_wait_for_wmem(sk, timeo);
1198	}
1199
1200	skb_set_owner_w(skb, sk);
1201	return skb;
1202
1203interrupted:
1204	err = sock_intr_errno(timeo);
1205failure:
1206	*errcode = err;
1207	return NULL;
1208}
1209
1210struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1211				    int noblock, int *errcode)
1212{
1213	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1214}
1215
1216static void __lock_sock(struct sock *sk)
1217{
1218	DEFINE_WAIT(wait);
1219
1220	for (;;) {
1221		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1222					TASK_UNINTERRUPTIBLE);
1223		spin_unlock_bh(&sk->sk_lock.slock);
1224		schedule();
1225		spin_lock_bh(&sk->sk_lock.slock);
1226		if (!sock_owned_by_user(sk))
1227			break;
1228	}
1229	finish_wait(&sk->sk_lock.wq, &wait);
1230}
1231
1232static void __release_sock(struct sock *sk)
1233{
1234	struct sk_buff *skb = sk->sk_backlog.head;
1235
1236	do {
1237		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1238		bh_unlock_sock(sk);
1239
1240		do {
1241			struct sk_buff *next = skb->next;
1242
1243			skb->next = NULL;
1244			sk->sk_backlog_rcv(sk, skb);
1245
1246			/*
1247			 * We are in process context here with softirqs
1248			 * disabled, use cond_resched_softirq() to preempt.
1249			 * This is safe to do because we've taken the backlog
1250			 * queue private:
1251			 */
1252			cond_resched_softirq();
1253
1254			skb = next;
1255		} while (skb != NULL);
1256
1257		bh_lock_sock(sk);
1258	} while ((skb = sk->sk_backlog.head) != NULL);
1259}
1260
1261/**
1262 * sk_wait_data - wait for data to arrive at sk_receive_queue
1263 * @sk:    sock to wait on
1264 * @timeo: for how long
1265 *
1266 * Now socket state including sk->sk_err is changed only under lock,
1267 * hence we may omit checks after joining wait queue.
1268 * We check receive queue before schedule() only as optimization;
1269 * it is very likely that release_sock() added new data.
1270 */
1271int sk_wait_data(struct sock *sk, long *timeo)
1272{
1273	int rc;
1274	DEFINE_WAIT(wait);
1275
1276	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1277	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1278	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1279	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1280	finish_wait(sk->sk_sleep, &wait);
1281	return rc;
1282}
1283
1284EXPORT_SYMBOL(sk_wait_data);
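/* Illustrative caller: a blocking recvmsg implementation typically loops
 * along the lines of
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * with the socket lock held, so that the release_sock()/lock_sock() pair
 * inside sk_wait_event() lets the backlog deliver new packets while the
 * caller sleeps.
 */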
1285
1286/*
1287 * Set of default routines for initialising struct proto_ops when
1288 * the protocol does not support a particular function. In certain
1289 * cases where it makes no sense for a protocol to have a "do nothing"
1290 * function, some default processing is provided.
1291 */
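/* Hypothetical example: a datagram-only family that cannot accept() or
 * listen() might wire up its proto_ops as
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.bind		= example_bind,
 *		.connect	= example_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 *
 * PF_EXAMPLE, example_bind and example_connect are invented names; the
 * sock_no_*() stubs below simply return -EOPNOTSUPP (or another sensible
 * default) for operations a protocol does not support.
 */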
1292
1293int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1294{
1295	return -EOPNOTSUPP;
1296}
1297
1298int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1299		    int len, int flags)
1300{
1301	return -EOPNOTSUPP;
1302}
1303
1304int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1305{
1306	return -EOPNOTSUPP;
1307}
1308
1309int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1310{
1311	return -EOPNOTSUPP;
1312}
1313
1314int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1315		    int *len, int peer)
1316{
1317	return -EOPNOTSUPP;
1318}
1319
1320unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1321{
1322	return 0;
1323}
1324
1325int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1326{
1327	return -EOPNOTSUPP;
1328}
1329
1330int sock_no_listen(struct socket *sock, int backlog)
1331{
1332	return -EOPNOTSUPP;
1333}
1334
1335int sock_no_shutdown(struct socket *sock, int how)
1336{
1337	return -EOPNOTSUPP;
1338}
1339
1340int sock_no_setsockopt(struct socket *sock, int level, int optname,
1341		    char __user *optval, int optlen)
1342{
1343	return -EOPNOTSUPP;
1344}
1345
1346int sock_no_getsockopt(struct socket *sock, int level, int optname,
1347		    char __user *optval, int __user *optlen)
1348{
1349	return -EOPNOTSUPP;
1350}
1351
1352int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1353		    size_t len)
1354{
1355	return -EOPNOTSUPP;
1356}
1357
1358int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1359		    size_t len, int flags)
1360{
1361	return -EOPNOTSUPP;
1362}
1363
1364int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1365{
1366	/* Mirror missing mmap method error code */
1367	return -ENODEV;
1368}
1369
1370ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1371{
1372	ssize_t res;
1373	struct msghdr msg = {.msg_flags = flags};
1374	struct kvec iov;
1375	char *kaddr = kmap(page);
1376	iov.iov_base = kaddr + offset;
1377	iov.iov_len = size;
1378	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1379	kunmap(page);
1380	return res;
1381}
1382
1383/*
1384 *	Default Socket Callbacks
1385 */
1386
1387static void sock_def_wakeup(struct sock *sk)
1388{
1389	read_lock(&sk->sk_callback_lock);
1390	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1391		wake_up_interruptible_all(sk->sk_sleep);
1392	read_unlock(&sk->sk_callback_lock);
1393}
1394
1395static void sock_def_error_report(struct sock *sk)
1396{
1397	read_lock(&sk->sk_callback_lock);
1398	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1399		wake_up_interruptible(sk->sk_sleep);
1400	sk_wake_async(sk,0,POLL_ERR);
1401	read_unlock(&sk->sk_callback_lock);
1402}
1403
1404static void sock_def_readable(struct sock *sk, int len)
1405{
1406	read_lock(&sk->sk_callback_lock);
1407	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1408		wake_up_interruptible(sk->sk_sleep);
1409	sk_wake_async(sk,1,POLL_IN);
1410	read_unlock(&sk->sk_callback_lock);
1411}
1412
1413static void sock_def_write_space(struct sock *sk)
1414{
1415	read_lock(&sk->sk_callback_lock);
1416
1417	/* Do not wake up a writer until he can make "significant"
1418	 * progress.  --DaveM
1419	 */
1420	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1421		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1422			wake_up_interruptible(sk->sk_sleep);
1423
1424		/* Should agree with poll, otherwise some programs break */
1425		if (sock_writeable(sk))
1426			sk_wake_async(sk, 2, POLL_OUT);
1427	}
1428
1429	read_unlock(&sk->sk_callback_lock);
1430}
1431
1432static void sock_def_destruct(struct sock *sk)
1433{
1434	kfree(sk->sk_protinfo);
1435}
1436
1437void sk_send_sigurg(struct sock *sk)
1438{
1439	if (sk->sk_socket && sk->sk_socket->file)
1440		if (send_sigurg(&sk->sk_socket->file->f_owner))
1441			sk_wake_async(sk, 3, POLL_PRI);
1442}
1443
1444void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1445		    unsigned long expires)
1446{
1447	if (!mod_timer(timer, expires))
1448		sock_hold(sk);
1449}
1450
1451EXPORT_SYMBOL(sk_reset_timer);
1452
1453void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1454{
1455	if (timer_pending(timer) && del_timer(timer))
1456		__sock_put(sk);
1457}
1458
1459EXPORT_SYMBOL(sk_stop_timer);
1460
1461void sock_init_data(struct socket *sock, struct sock *sk)
1462{
1463	skb_queue_head_init(&sk->sk_receive_queue);
1464	skb_queue_head_init(&sk->sk_write_queue);
1465	skb_queue_head_init(&sk->sk_error_queue);
1466#ifdef CONFIG_NET_DMA
1467	skb_queue_head_init(&sk->sk_async_wait_queue);
1468#endif
1469
1470	sk->sk_send_head	=	NULL;
1471
1472	init_timer(&sk->sk_timer);
1473
1474	sk->sk_allocation	=	GFP_KERNEL;
1475	sk->sk_rcvbuf		=	sysctl_rmem_default;
1476	sk->sk_sndbuf		=	sysctl_wmem_default;
1477	sk->sk_state		=	TCP_CLOSE;
1478	sk->sk_socket		=	sock;
1479
1480	sock_set_flag(sk, SOCK_ZAPPED);
1481
1482	if (sock) {
1483		sk->sk_type	=	sock->type;
1484		sk->sk_sleep	=	&sock->wait;
1485		sock->sk	=	sk;
1486	} else
1487		sk->sk_sleep	=	NULL;
1488
1489	rwlock_init(&sk->sk_dst_lock);
1490	rwlock_init(&sk->sk_callback_lock);
1491	lockdep_set_class(&sk->sk_callback_lock,
1492			   af_callback_keys + sk->sk_family);
1493
1494	sk->sk_state_change	=	sock_def_wakeup;
1495	sk->sk_data_ready	=	sock_def_readable;
1496	sk->sk_write_space	=	sock_def_write_space;
1497	sk->sk_error_report	=	sock_def_error_report;
1498	sk->sk_destruct		=	sock_def_destruct;
1499
1500	sk->sk_sndmsg_page	=	NULL;
1501	sk->sk_sndmsg_off	=	0;
1502
1503	sk->sk_peercred.pid 	=	0;
1504	sk->sk_peercred.uid	=	-1;
1505	sk->sk_peercred.gid	=	-1;
1506	sk->sk_write_pending	=	0;
1507	sk->sk_rcvlowat		=	1;
1508	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1509	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1510
1511	sk->sk_stamp = ktime_set(-1L, -1L);
1512
1513	atomic_set(&sk->sk_refcnt, 1);
1514}
1515
1516void fastcall lock_sock_nested(struct sock *sk, int subclass)
1517{
1518	might_sleep();
1519	spin_lock_bh(&sk->sk_lock.slock);
1520	if (sk->sk_lock.owner)
1521		__lock_sock(sk);
1522	sk->sk_lock.owner = (void *)1;
1523	spin_unlock(&sk->sk_lock.slock);
1524	/*
1525	 * The sk_lock has mutex_lock() semantics here:
1526	 */
1527	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1528	local_bh_enable();
1529}
1530
1531EXPORT_SYMBOL(lock_sock_nested);
1532
1533void fastcall release_sock(struct sock *sk)
1534{
1535	/*
1536	 * The sk_lock has mutex_unlock() semantics:
1537	 */
1538	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1539
1540	spin_lock_bh(&sk->sk_lock.slock);
1541	if (sk->sk_backlog.tail)
1542		__release_sock(sk);
1543	sk->sk_lock.owner = NULL;
1544	if (waitqueue_active(&sk->sk_lock.wq))
1545		wake_up(&sk->sk_lock.wq);
1546	spin_unlock_bh(&sk->sk_lock.slock);
1547}
1548EXPORT_SYMBOL(release_sock);
1549
1550int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1551{
1552	struct timeval tv;
1553	if (!sock_flag(sk, SOCK_TIMESTAMP))
1554		sock_enable_timestamp(sk);
1555	tv = ktime_to_timeval(sk->sk_stamp);
1556	if (tv.tv_sec == -1)
1557		return -ENOENT;
1558	if (tv.tv_sec == 0) {
1559		sk->sk_stamp = ktime_get_real();
1560		tv = ktime_to_timeval(sk->sk_stamp);
1561	}
1562	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1563}
1564EXPORT_SYMBOL(sock_get_timestamp);
1565
1566int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1567{
1568	struct timespec ts;
1569	if (!sock_flag(sk, SOCK_TIMESTAMP))
1570		sock_enable_timestamp(sk);
1571	ts = ktime_to_timespec(sk->sk_stamp);
1572	if (ts.tv_sec == -1)
1573		return -ENOENT;
1574	if (ts.tv_sec == 0) {
1575		sk->sk_stamp = ktime_get_real();
1576		ts = ktime_to_timespec(sk->sk_stamp);
1577	}
1578	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1579}
1580EXPORT_SYMBOL(sock_get_timestampns);
1581
1582void sock_enable_timestamp(struct sock *sk)
1583{
1584	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1585		sock_set_flag(sk, SOCK_TIMESTAMP);
1586		net_enable_timestamp();
1587	}
1588}
1589EXPORT_SYMBOL(sock_enable_timestamp);
1590
1591/*
1592 *	Get a socket option on a socket.
1593 *
1594 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1595 *	asynchronous errors should be reported by getsockopt. We assume
1596 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1597 */
1598int sock_common_getsockopt(struct socket *sock, int level, int optname,
1599			   char __user *optval, int __user *optlen)
1600{
1601	struct sock *sk = sock->sk;
1602
1603	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1604}
1605
1606EXPORT_SYMBOL(sock_common_getsockopt);
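/* Illustrative userspace use of the SO_ERROR semantics described above:
 * after a non-blocking connect(), the pending asynchronous error is
 * typically collected with
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * which returns (and clears) the saved sk_err, falling back to
 * sk_err_soft when no hard error is pending.
 */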
1607
1608#ifdef CONFIG_COMPAT
1609int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1610				  char __user *optval, int __user *optlen)
1611{
1612	struct sock *sk = sock->sk;
1613
1614	if (sk->sk_prot->compat_getsockopt != NULL)
1615		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1616						      optval, optlen);
1617	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1618}
1619EXPORT_SYMBOL(compat_sock_common_getsockopt);
1620#endif
1621
1622int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1623			struct msghdr *msg, size_t size, int flags)
1624{
1625	struct sock *sk = sock->sk;
1626	int addr_len = 0;
1627	int err;
1628
1629	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1630				   flags & ~MSG_DONTWAIT, &addr_len);
1631	if (err >= 0)
1632		msg->msg_namelen = addr_len;
1633	return err;
1634}
1635
1636EXPORT_SYMBOL(sock_common_recvmsg);
1637
1638/*
1639 *	Set socket options on an inet socket.
1640 */
1641int sock_common_setsockopt(struct socket *sock, int level, int optname,
1642			   char __user *optval, int optlen)
1643{
1644	struct sock *sk = sock->sk;
1645
1646	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1647}
1648
1649EXPORT_SYMBOL(sock_common_setsockopt);
1650
1651#ifdef CONFIG_COMPAT
1652int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1653				  char __user *optval, int optlen)
1654{
1655	struct sock *sk = sock->sk;
1656
1657	if (sk->sk_prot->compat_setsockopt != NULL)
1658		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1659						      optval, optlen);
1660	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1661}
1662EXPORT_SYMBOL(compat_sock_common_setsockopt);
1663#endif
1664
1665void sk_common_release(struct sock *sk)
1666{
1667	if (sk->sk_prot->destroy)
1668		sk->sk_prot->destroy(sk);
1669
1670	/*
1671	 * Observation: when sk_common_release is called, processes no longer
1672	 * have access to the socket, but the network stack still does.
1673	 * Step one, detach it from networking:
1674	 *
1675	 * A. Remove from hash tables.
1676	 */
1677
1678	sk->sk_prot->unhash(sk);
1679
1680	/*
1681	 * At this point the socket can no longer receive new packets, but it is
1682	 * possible that some packets are still in flight because some CPU ran the
1683	 * receiver and did the hash table lookup before we unhashed the socket.
1684	 * They will reach the receive queue and be purged by the socket destructor.
1685	 *
1686	 * We also still have packets pending on the receive queue and, probably,
1687	 * our own packets waiting in device queues. sock_destroy will drain the
1688	 * receive queue, but transmitted packets will delay socket destruction
1689	 * until the last reference is released.
1690	 */
1691
1692	sock_orphan(sk);
1693
1694	xfrm_sk_free_policy(sk);
1695
1696	sk_refcnt_debug_release(sk);
1697	sock_put(sk);
1698}
1699
1700EXPORT_SYMBOL(sk_common_release);
1701
1702static DEFINE_RWLOCK(proto_list_lock);
1703static LIST_HEAD(proto_list);
1704
1705int proto_register(struct proto *prot, int alloc_slab)
1706{
1707	char *request_sock_slab_name = NULL;
1708	char *timewait_sock_slab_name;
1709	int rc = -ENOBUFS;
1710
1711	if (alloc_slab) {
1712		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1713					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1714
1715		if (prot->slab == NULL) {
1716			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1717			       prot->name);
1718			goto out;
1719		}
1720
1721		if (prot->rsk_prot != NULL) {
1722			static const char mask[] = "request_sock_%s";
1723
1724			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1725			if (request_sock_slab_name == NULL)
1726				goto out_free_sock_slab;
1727
1728			sprintf(request_sock_slab_name, mask, prot->name);
1729			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1730								 prot->rsk_prot->obj_size, 0,
1731								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1732
1733			if (prot->rsk_prot->slab == NULL) {
1734				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1735				       prot->name);
1736				goto out_free_request_sock_slab_name;
1737			}
1738		}
1739
1740		if (prot->twsk_prot != NULL) {
1741			static const char mask[] = "tw_sock_%s";
1742
1743			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1744
1745			if (timewait_sock_slab_name == NULL)
1746				goto out_free_request_sock_slab;
1747
1748			sprintf(timewait_sock_slab_name, mask, prot->name);
1749			prot->twsk_prot->twsk_slab =
1750				kmem_cache_create(timewait_sock_slab_name,
1751						  prot->twsk_prot->twsk_obj_size,
1752						  0, SLAB_HWCACHE_ALIGN,
1753						  NULL, NULL);
1754			if (prot->twsk_prot->twsk_slab == NULL)
1755				goto out_free_timewait_sock_slab_name;
1756		}
1757	}
1758
1759	write_lock(&proto_list_lock);
1760	list_add(&prot->node, &proto_list);
1761	write_unlock(&proto_list_lock);
1762	rc = 0;
1763out:
1764	return rc;
1765out_free_timewait_sock_slab_name:
1766	kfree(timewait_sock_slab_name);
1767out_free_request_sock_slab:
1768	if (prot->rsk_prot && prot->rsk_prot->slab) {
1769		kmem_cache_destroy(prot->rsk_prot->slab);
1770		prot->rsk_prot->slab = NULL;
1771	}
1772out_free_request_sock_slab_name:
1773	kfree(request_sock_slab_name);
1774out_free_sock_slab:
1775	kmem_cache_destroy(prot->slab);
1776	prot->slab = NULL;
1777	goto out;
1778}
1779
1780EXPORT_SYMBOL(proto_register);
1781
1782void proto_unregister(struct proto *prot)
1783{
1784	write_lock(&proto_list_lock);
1785	list_del(&prot->node);
1786	write_unlock(&proto_list_lock);
1787
1788	if (prot->slab != NULL) {
1789		kmem_cache_destroy(prot->slab);
1790		prot->slab = NULL;
1791	}
1792
1793	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1794		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1795
1796		kmem_cache_destroy(prot->rsk_prot->slab);
1797		kfree(name);
1798		prot->rsk_prot->slab = NULL;
1799	}
1800
1801	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1802		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1803
1804		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1805		kfree(name);
1806		prot->twsk_prot->twsk_slab = NULL;
1807	}
1808}
1809
1810EXPORT_SYMBOL(proto_unregister);
1811
1812#ifdef CONFIG_PROC_FS
1813static inline struct proto *__proto_head(void)
1814{
1815	return list_entry(proto_list.next, struct proto, node);
1816}
1817
1818static inline struct proto *proto_head(void)
1819{
1820	return list_empty(&proto_list) ? NULL : __proto_head();
1821}
1822
1823static inline struct proto *proto_next(struct proto *proto)
1824{
1825	return proto->node.next == &proto_list ? NULL :
1826		list_entry(proto->node.next, struct proto, node);
1827}
1828
1829static inline struct proto *proto_get_idx(loff_t pos)
1830{
1831	struct proto *proto;
1832	loff_t i = 0;
1833
1834	list_for_each_entry(proto, &proto_list, node)
1835		if (i++ == pos)
1836			goto out;
1837
1838	proto = NULL;
1839out:
1840	return proto;
1841}
1842
1843static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1844{
1845	read_lock(&proto_list_lock);
1846	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1847}
1848
1849static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1850{
1851	++*pos;
1852	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1853}
1854
1855static void proto_seq_stop(struct seq_file *seq, void *v)
1856{
1857	read_unlock(&proto_list_lock);
1858}
1859
1860static char proto_method_implemented(const void *method)
1861{
1862	return method == NULL ? 'n' : 'y';
1863}
1864
1865static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1866{
1867	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1868			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1869		   proto->name,
1870		   proto->obj_size,
1871		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1872		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1873		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1874		   proto->max_header,
1875		   proto->slab == NULL ? "no" : "yes",
1876		   module_name(proto->owner),
1877		   proto_method_implemented(proto->close),
1878		   proto_method_implemented(proto->connect),
1879		   proto_method_implemented(proto->disconnect),
1880		   proto_method_implemented(proto->accept),
1881		   proto_method_implemented(proto->ioctl),
1882		   proto_method_implemented(proto->init),
1883		   proto_method_implemented(proto->destroy),
1884		   proto_method_implemented(proto->shutdown),
1885		   proto_method_implemented(proto->setsockopt),
1886		   proto_method_implemented(proto->getsockopt),
1887		   proto_method_implemented(proto->sendmsg),
1888		   proto_method_implemented(proto->recvmsg),
1889		   proto_method_implemented(proto->sendpage),
1890		   proto_method_implemented(proto->bind),
1891		   proto_method_implemented(proto->backlog_rcv),
1892		   proto_method_implemented(proto->hash),
1893		   proto_method_implemented(proto->unhash),
1894		   proto_method_implemented(proto->get_port),
1895		   proto_method_implemented(proto->enter_memory_pressure));
1896}
1897
1898static int proto_seq_show(struct seq_file *seq, void *v)
1899{
1900	if (v == SEQ_START_TOKEN)
1901		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1902			   "protocol",
1903			   "size",
1904			   "sockets",
1905			   "memory",
1906			   "press",
1907			   "maxhdr",
1908			   "slab",
1909			   "module",
1910			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1911	else
1912		proto_seq_printf(seq, v);
1913	return 0;
1914}
1915
1916static struct seq_operations proto_seq_ops = {
1917	.start  = proto_seq_start,
1918	.next   = proto_seq_next,
1919	.stop   = proto_seq_stop,
1920	.show   = proto_seq_show,
1921};
1922
1923static int proto_seq_open(struct inode *inode, struct file *file)
1924{
1925	return seq_open(file, &proto_seq_ops);
1926}
1927
1928static const struct file_operations proto_seq_fops = {
1929	.owner		= THIS_MODULE,
1930	.open		= proto_seq_open,
1931	.read		= seq_read,
1932	.llseek		= seq_lseek,
1933	.release	= seq_release,
1934};
1935
1936static int __init proto_init(void)
1937{
1938	/* register /proc/net/protocols */
1939	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1940}
1941
1942subsys_initcall(proto_init);
1943
1944#endif /* PROC_FS */
1945
1946EXPORT_SYMBOL(sk_alloc);
1947EXPORT_SYMBOL(sk_free);
1948EXPORT_SYMBOL(sk_send_sigurg);
1949EXPORT_SYMBOL(sock_alloc_send_skb);
1950EXPORT_SYMBOL(sock_init_data);
1951EXPORT_SYMBOL(sock_kfree_s);
1952EXPORT_SYMBOL(sock_kmalloc);
1953EXPORT_SYMBOL(sock_no_accept);
1954EXPORT_SYMBOL(sock_no_bind);
1955EXPORT_SYMBOL(sock_no_connect);
1956EXPORT_SYMBOL(sock_no_getname);
1957EXPORT_SYMBOL(sock_no_getsockopt);
1958EXPORT_SYMBOL(sock_no_ioctl);
1959EXPORT_SYMBOL(sock_no_listen);
1960EXPORT_SYMBOL(sock_no_mmap);
1961EXPORT_SYMBOL(sock_no_poll);
1962EXPORT_SYMBOL(sock_no_recvmsg);
1963EXPORT_SYMBOL(sock_no_sendmsg);
1964EXPORT_SYMBOL(sock_no_sendpage);
1965EXPORT_SYMBOL(sock_no_setsockopt);
1966EXPORT_SYMBOL(sock_no_shutdown);
1967EXPORT_SYMBOL(sock_no_socketpair);
1968EXPORT_SYMBOL(sock_rfree);
1969EXPORT_SYMBOL(sock_setsockopt);
1970EXPORT_SYMBOL(sock_wfree);
1971EXPORT_SYMBOL(sock_wmalloc);
1972EXPORT_SYMBOL(sock_i_uid);
1973EXPORT_SYMBOL(sock_i_ino);
1974EXPORT_SYMBOL(sysctl_optmem_max);
1975#ifdef CONFIG_SYSCTL
1976EXPORT_SYMBOL(sysctl_rmem_max);
1977EXPORT_SYMBOL(sysctl_wmem_max);
1978#endif
1979