sock.c revision a1f8e7f7fb9d7e2cbcb53170edca7c0ac4680697
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140#ifdef CONFIG_DEBUG_LOCK_ALLOC
141/*
142 * Make lock validator output more readable. (we pre-construct these
143 * strings build-time, so that runtime initialization of socket
144 * locks is fast):
145 */
146static const char *af_family_key_strings[AF_MAX+1] = {
147  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
148  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
149  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
150  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
151  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
152  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
153  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
154  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
155  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
156  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
157  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
158};
159static const char *af_family_slock_key_strings[AF_MAX+1] = {
160  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
161  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
162  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
163  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
164  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
165  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
166  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
167  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
168  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
169  "slock-27"       , "slock-28"          , "slock-29"          ,
170  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
171};
172#endif
173
174/*
175 * sk_callback_lock locking rules are per-address-family,
176 * so split the lock classes by using a per-AF key:
177 */
178static struct lock_class_key af_callback_keys[AF_MAX];
179
180/* Take into consideration the size of the struct sk_buff overhead in the
181 * determination of these values, since that is non-constant across
182 * platforms.  This makes socket queueing behavior and performance
183 * not depend upon such differences.
184 */
185#define _SK_MEM_PACKETS		256
186#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
187#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
188#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
189
190/* Run time adjustable parameters. */
191__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
192__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
193__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
194__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
195
196/* Maximal space eaten by iovec or ancilliary data plus some space */
197int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
198
199static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
200{
201	struct timeval tv;
202
203	if (optlen < sizeof(tv))
204		return -EINVAL;
205	if (copy_from_user(&tv, optval, sizeof(tv)))
206		return -EFAULT;
207
208	*timeo_p = MAX_SCHEDULE_TIMEOUT;
209	if (tv.tv_sec == 0 && tv.tv_usec == 0)
210		return 0;
211	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
212		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
213	return 0;
214}
215
216static void sock_warn_obsolete_bsdism(const char *name)
217{
218	static int warned;
219	static char warncomm[TASK_COMM_LEN];
220	if (strcmp(warncomm, current->comm) && warned < 5) {
221		strcpy(warncomm,  current->comm);
222		printk(KERN_WARNING "process `%s' is using obsolete "
223		       "%s SO_BSDCOMPAT\n", warncomm, name);
224		warned++;
225	}
226}
227
228static void sock_disable_timestamp(struct sock *sk)
229{
230	if (sock_flag(sk, SOCK_TIMESTAMP)) {
231		sock_reset_flag(sk, SOCK_TIMESTAMP);
232		net_disable_timestamp();
233	}
234}
235
236
237int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
238{
239	int err = 0;
240	int skb_len;
241
242	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
243	   number of warnings when compiling with -W --ANK
244	 */
245	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
246	    (unsigned)sk->sk_rcvbuf) {
247		err = -ENOMEM;
248		goto out;
249	}
250
251	err = sk_filter(sk, skb);
252	if (err)
253		goto out;
254
255	skb->dev = NULL;
256	skb_set_owner_r(skb, sk);
257
258	/* Cache the SKB length before we tack it onto the receive
259	 * queue.  Once it is added it no longer belongs to us and
260	 * may be freed by other threads of control pulling packets
261	 * from the queue.
262	 */
263	skb_len = skb->len;
264
265	skb_queue_tail(&sk->sk_receive_queue, skb);
266
267	if (!sock_flag(sk, SOCK_DEAD))
268		sk->sk_data_ready(sk, skb_len);
269out:
270	return err;
271}
272EXPORT_SYMBOL(sock_queue_rcv_skb);
273
274int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
275{
276	int rc = NET_RX_SUCCESS;
277
278	if (sk_filter(sk, skb))
279		goto discard_and_relse;
280
281	skb->dev = NULL;
282
283	if (nested)
284		bh_lock_sock_nested(sk);
285	else
286		bh_lock_sock(sk);
287	if (!sock_owned_by_user(sk)) {
288		/*
289		 * trylock + unlock semantics:
290		 */
291		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
292
293		rc = sk->sk_backlog_rcv(sk, skb);
294
295		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
296	} else
297		sk_add_backlog(sk, skb);
298	bh_unlock_sock(sk);
299out:
300	sock_put(sk);
301	return rc;
302discard_and_relse:
303	kfree_skb(skb);
304	goto out;
305}
306EXPORT_SYMBOL(sk_receive_skb);
307
308struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
309{
310	struct dst_entry *dst = sk->sk_dst_cache;
311
312	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
313		sk->sk_dst_cache = NULL;
314		dst_release(dst);
315		return NULL;
316	}
317
318	return dst;
319}
320EXPORT_SYMBOL(__sk_dst_check);
321
322struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
323{
324	struct dst_entry *dst = sk_dst_get(sk);
325
326	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
327		sk_dst_reset(sk);
328		dst_release(dst);
329		return NULL;
330	}
331
332	return dst;
333}
334EXPORT_SYMBOL(sk_dst_check);
335
336/*
337 *	This is meant for all protocols to use and covers goings on
338 *	at the socket level. Everything here is generic.
339 */
340
341int sock_setsockopt(struct socket *sock, int level, int optname,
342		    char __user *optval, int optlen)
343{
344	struct sock *sk=sock->sk;
345	struct sk_filter *filter;
346	int val;
347	int valbool;
348	struct linger ling;
349	int ret = 0;
350
351	/*
352	 *	Options without arguments
353	 */
354
355#ifdef SO_DONTLINGER		/* Compatibility item... */
356	if (optname == SO_DONTLINGER) {
357		lock_sock(sk);
358		sock_reset_flag(sk, SOCK_LINGER);
359		release_sock(sk);
360		return 0;
361	}
362#endif
363
364  	if(optlen<sizeof(int))
365  		return(-EINVAL);
366
367	if (get_user(val, (int __user *)optval))
368		return -EFAULT;
369
370  	valbool = val?1:0;
371
372	lock_sock(sk);
373
374  	switch(optname)
375  	{
376		case SO_DEBUG:
377			if(val && !capable(CAP_NET_ADMIN))
378			{
379				ret = -EACCES;
380			}
381			else if (valbool)
382				sock_set_flag(sk, SOCK_DBG);
383			else
384				sock_reset_flag(sk, SOCK_DBG);
385			break;
386		case SO_REUSEADDR:
387			sk->sk_reuse = valbool;
388			break;
389		case SO_TYPE:
390		case SO_ERROR:
391			ret = -ENOPROTOOPT;
392		  	break;
393		case SO_DONTROUTE:
394			if (valbool)
395				sock_set_flag(sk, SOCK_LOCALROUTE);
396			else
397				sock_reset_flag(sk, SOCK_LOCALROUTE);
398			break;
399		case SO_BROADCAST:
400			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
401			break;
402		case SO_SNDBUF:
403			/* Don't error on this BSD doesn't and if you think
404			   about it this is right. Otherwise apps have to
405			   play 'guess the biggest size' games. RCVBUF/SNDBUF
406			   are treated in BSD as hints */
407
408			if (val > sysctl_wmem_max)
409				val = sysctl_wmem_max;
410set_sndbuf:
411			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
412			if ((val * 2) < SOCK_MIN_SNDBUF)
413				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
414			else
415				sk->sk_sndbuf = val * 2;
416
417			/*
418			 *	Wake up sending tasks if we
419			 *	upped the value.
420			 */
421			sk->sk_write_space(sk);
422			break;
423
424		case SO_SNDBUFFORCE:
425			if (!capable(CAP_NET_ADMIN)) {
426				ret = -EPERM;
427				break;
428			}
429			goto set_sndbuf;
430
431		case SO_RCVBUF:
432			/* Don't error on this BSD doesn't and if you think
433			   about it this is right. Otherwise apps have to
434			   play 'guess the biggest size' games. RCVBUF/SNDBUF
435			   are treated in BSD as hints */
436
437			if (val > sysctl_rmem_max)
438				val = sysctl_rmem_max;
439set_rcvbuf:
440			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
441			/*
442			 * We double it on the way in to account for
443			 * "struct sk_buff" etc. overhead.   Applications
444			 * assume that the SO_RCVBUF setting they make will
445			 * allow that much actual data to be received on that
446			 * socket.
447			 *
448			 * Applications are unaware that "struct sk_buff" and
449			 * other overheads allocate from the receive buffer
450			 * during socket buffer allocation.
451			 *
452			 * And after considering the possible alternatives,
453			 * returning the value we actually used in getsockopt
454			 * is the most desirable behavior.
455			 */
456			if ((val * 2) < SOCK_MIN_RCVBUF)
457				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
458			else
459				sk->sk_rcvbuf = val * 2;
460			break;
461
462		case SO_RCVBUFFORCE:
463			if (!capable(CAP_NET_ADMIN)) {
464				ret = -EPERM;
465				break;
466			}
467			goto set_rcvbuf;
468
469		case SO_KEEPALIVE:
470#ifdef CONFIG_INET
471			if (sk->sk_protocol == IPPROTO_TCP)
472				tcp_set_keepalive(sk, valbool);
473#endif
474			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
475			break;
476
477	 	case SO_OOBINLINE:
478			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
479			break;
480
481	 	case SO_NO_CHECK:
482			sk->sk_no_check = valbool;
483			break;
484
485		case SO_PRIORITY:
486			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
487				sk->sk_priority = val;
488			else
489				ret = -EPERM;
490			break;
491
492		case SO_LINGER:
493			if(optlen<sizeof(ling)) {
494				ret = -EINVAL;	/* 1003.1g */
495				break;
496			}
497			if (copy_from_user(&ling,optval,sizeof(ling))) {
498				ret = -EFAULT;
499				break;
500			}
501			if (!ling.l_onoff)
502				sock_reset_flag(sk, SOCK_LINGER);
503			else {
504#if (BITS_PER_LONG == 32)
505				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
506					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
507				else
508#endif
509					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
510				sock_set_flag(sk, SOCK_LINGER);
511			}
512			break;
513
514		case SO_BSDCOMPAT:
515			sock_warn_obsolete_bsdism("setsockopt");
516			break;
517
518		case SO_PASSCRED:
519			if (valbool)
520				set_bit(SOCK_PASSCRED, &sock->flags);
521			else
522				clear_bit(SOCK_PASSCRED, &sock->flags);
523			break;
524
525		case SO_TIMESTAMP:
526			if (valbool)  {
527				sock_set_flag(sk, SOCK_RCVTSTAMP);
528				sock_enable_timestamp(sk);
529			} else
530				sock_reset_flag(sk, SOCK_RCVTSTAMP);
531			break;
532
533		case SO_RCVLOWAT:
534			if (val < 0)
535				val = INT_MAX;
536			sk->sk_rcvlowat = val ? : 1;
537			break;
538
539		case SO_RCVTIMEO:
540			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
541			break;
542
543		case SO_SNDTIMEO:
544			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
545			break;
546
547#ifdef CONFIG_NETDEVICES
548		case SO_BINDTODEVICE:
549		{
550			char devname[IFNAMSIZ];
551
552			/* Sorry... */
553			if (!capable(CAP_NET_RAW)) {
554				ret = -EPERM;
555				break;
556			}
557
558			/* Bind this socket to a particular device like "eth0",
559			 * as specified in the passed interface name. If the
560			 * name is "" or the option length is zero the socket
561			 * is not bound.
562			 */
563
564			if (!valbool) {
565				sk->sk_bound_dev_if = 0;
566			} else {
567				if (optlen > IFNAMSIZ - 1)
568					optlen = IFNAMSIZ - 1;
569				memset(devname, 0, sizeof(devname));
570				if (copy_from_user(devname, optval, optlen)) {
571					ret = -EFAULT;
572					break;
573				}
574
575				/* Remove any cached route for this socket. */
576				sk_dst_reset(sk);
577
578				if (devname[0] == '\0') {
579					sk->sk_bound_dev_if = 0;
580				} else {
581					struct net_device *dev = dev_get_by_name(devname);
582					if (!dev) {
583						ret = -ENODEV;
584						break;
585					}
586					sk->sk_bound_dev_if = dev->ifindex;
587					dev_put(dev);
588				}
589			}
590			break;
591		}
592#endif
593
594
595		case SO_ATTACH_FILTER:
596			ret = -EINVAL;
597			if (optlen == sizeof(struct sock_fprog)) {
598				struct sock_fprog fprog;
599
600				ret = -EFAULT;
601				if (copy_from_user(&fprog, optval, sizeof(fprog)))
602					break;
603
604				ret = sk_attach_filter(&fprog, sk);
605			}
606			break;
607
608		case SO_DETACH_FILTER:
609			rcu_read_lock_bh();
610			filter = rcu_dereference(sk->sk_filter);
611                        if (filter) {
612				rcu_assign_pointer(sk->sk_filter, NULL);
613				sk_filter_release(sk, filter);
614				rcu_read_unlock_bh();
615				break;
616			}
617			rcu_read_unlock_bh();
618			ret = -ENONET;
619			break;
620
621		case SO_PASSSEC:
622			if (valbool)
623				set_bit(SOCK_PASSSEC, &sock->flags);
624			else
625				clear_bit(SOCK_PASSSEC, &sock->flags);
626			break;
627
628		/* We implement the SO_SNDLOWAT etc to
629		   not be settable (1003.1g 5.3) */
630		default:
631		  	ret = -ENOPROTOOPT;
632			break;
633  	}
634	release_sock(sk);
635	return ret;
636}
637
638
639int sock_getsockopt(struct socket *sock, int level, int optname,
640		    char __user *optval, int __user *optlen)
641{
642	struct sock *sk = sock->sk;
643
644	union
645	{
646  		int val;
647  		struct linger ling;
648		struct timeval tm;
649	} v;
650
651	unsigned int lv = sizeof(int);
652	int len;
653
654  	if(get_user(len,optlen))
655  		return -EFAULT;
656	if(len < 0)
657		return -EINVAL;
658
659  	switch(optname)
660  	{
661		case SO_DEBUG:
662			v.val = sock_flag(sk, SOCK_DBG);
663			break;
664
665		case SO_DONTROUTE:
666			v.val = sock_flag(sk, SOCK_LOCALROUTE);
667			break;
668
669		case SO_BROADCAST:
670			v.val = !!sock_flag(sk, SOCK_BROADCAST);
671			break;
672
673		case SO_SNDBUF:
674			v.val = sk->sk_sndbuf;
675			break;
676
677		case SO_RCVBUF:
678			v.val = sk->sk_rcvbuf;
679			break;
680
681		case SO_REUSEADDR:
682			v.val = sk->sk_reuse;
683			break;
684
685		case SO_KEEPALIVE:
686			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
687			break;
688
689		case SO_TYPE:
690			v.val = sk->sk_type;
691			break;
692
693		case SO_ERROR:
694			v.val = -sock_error(sk);
695			if(v.val==0)
696				v.val = xchg(&sk->sk_err_soft, 0);
697			break;
698
699		case SO_OOBINLINE:
700			v.val = !!sock_flag(sk, SOCK_URGINLINE);
701			break;
702
703		case SO_NO_CHECK:
704			v.val = sk->sk_no_check;
705			break;
706
707		case SO_PRIORITY:
708			v.val = sk->sk_priority;
709			break;
710
711		case SO_LINGER:
712			lv		= sizeof(v.ling);
713			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
714 			v.ling.l_linger	= sk->sk_lingertime / HZ;
715			break;
716
717		case SO_BSDCOMPAT:
718			sock_warn_obsolete_bsdism("getsockopt");
719			break;
720
721		case SO_TIMESTAMP:
722			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
723			break;
724
725		case SO_RCVTIMEO:
726			lv=sizeof(struct timeval);
727			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
728				v.tm.tv_sec = 0;
729				v.tm.tv_usec = 0;
730			} else {
731				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
732				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
733			}
734			break;
735
736		case SO_SNDTIMEO:
737			lv=sizeof(struct timeval);
738			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
739				v.tm.tv_sec = 0;
740				v.tm.tv_usec = 0;
741			} else {
742				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
743				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
744			}
745			break;
746
747		case SO_RCVLOWAT:
748			v.val = sk->sk_rcvlowat;
749			break;
750
751		case SO_SNDLOWAT:
752			v.val=1;
753			break;
754
755		case SO_PASSCRED:
756			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
757			break;
758
759		case SO_PEERCRED:
760			if (len > sizeof(sk->sk_peercred))
761				len = sizeof(sk->sk_peercred);
762			if (copy_to_user(optval, &sk->sk_peercred, len))
763				return -EFAULT;
764			goto lenout;
765
766		case SO_PEERNAME:
767		{
768			char address[128];
769
770			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
771				return -ENOTCONN;
772			if (lv < len)
773				return -EINVAL;
774			if (copy_to_user(optval, address, len))
775				return -EFAULT;
776			goto lenout;
777		}
778
779		/* Dubious BSD thing... Probably nobody even uses it, but
780		 * the UNIX standard wants it for whatever reason... -DaveM
781		 */
782		case SO_ACCEPTCONN:
783			v.val = sk->sk_state == TCP_LISTEN;
784			break;
785
786		case SO_PASSSEC:
787			v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
788			break;
789
790		case SO_PEERSEC:
791			return security_socket_getpeersec_stream(sock, optval, optlen, len);
792
793		default:
794			return(-ENOPROTOOPT);
795	}
796	if (len > lv)
797		len = lv;
798	if (copy_to_user(optval, &v, len))
799		return -EFAULT;
800lenout:
801  	if (put_user(len, optlen))
802  		return -EFAULT;
803  	return 0;
804}
805
806/*
807 * Initialize an sk_lock.
808 *
809 * (We also register the sk_lock with the lock validator.)
810 */
811static void inline sock_lock_init(struct sock *sk)
812{
813	spin_lock_init(&sk->sk_lock.slock);
814	sk->sk_lock.owner = NULL;
815	init_waitqueue_head(&sk->sk_lock.wq);
816	/*
817	 * Make sure we are not reinitializing a held lock:
818	 */
819	debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock));
820
821	/*
822	 * Mark both the sk_lock and the sk_lock.slock as a
823	 * per-address-family lock class:
824	 */
825	lockdep_set_class_and_name(&sk->sk_lock.slock,
826				   af_family_slock_keys + sk->sk_family,
827				   af_family_slock_key_strings[sk->sk_family]);
828	lockdep_init_map(&sk->sk_lock.dep_map,
829			 af_family_key_strings[sk->sk_family],
830			 af_family_keys + sk->sk_family, 0);
831}
832
833/**
834 *	sk_alloc - All socket objects are allocated here
835 *	@family: protocol family
836 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
837 *	@prot: struct proto associated with this new sock instance
838 *	@zero_it: if we should zero the newly allocated sock
839 */
840struct sock *sk_alloc(int family, gfp_t priority,
841		      struct proto *prot, int zero_it)
842{
843	struct sock *sk = NULL;
844	kmem_cache_t *slab = prot->slab;
845
846	if (slab != NULL)
847		sk = kmem_cache_alloc(slab, priority);
848	else
849		sk = kmalloc(prot->obj_size, priority);
850
851	if (sk) {
852		if (zero_it) {
853			memset(sk, 0, prot->obj_size);
854			sk->sk_family = family;
855			/*
856			 * See comment in struct sock definition to understand
857			 * why we need sk_prot_creator -acme
858			 */
859			sk->sk_prot = sk->sk_prot_creator = prot;
860			sock_lock_init(sk);
861		}
862
863		if (security_sk_alloc(sk, family, priority))
864			goto out_free;
865
866		if (!try_module_get(prot->owner))
867			goto out_free;
868	}
869	return sk;
870
871out_free:
872	if (slab != NULL)
873		kmem_cache_free(slab, sk);
874	else
875		kfree(sk);
876	return NULL;
877}
878
879void sk_free(struct sock *sk)
880{
881	struct sk_filter *filter;
882	struct module *owner = sk->sk_prot_creator->owner;
883
884	if (sk->sk_destruct)
885		sk->sk_destruct(sk);
886
887	filter = rcu_dereference(sk->sk_filter);
888	if (filter) {
889		sk_filter_release(sk, filter);
890		rcu_assign_pointer(sk->sk_filter, NULL);
891	}
892
893	sock_disable_timestamp(sk);
894
895	if (atomic_read(&sk->sk_omem_alloc))
896		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
897		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
898
899	security_sk_free(sk);
900	if (sk->sk_prot_creator->slab != NULL)
901		kmem_cache_free(sk->sk_prot_creator->slab, sk);
902	else
903		kfree(sk);
904	module_put(owner);
905}
906
907struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
908{
909	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
910
911	if (newsk != NULL) {
912		struct sk_filter *filter;
913
914		sock_copy(newsk, sk);
915
916		/* SANITY */
917		sk_node_init(&newsk->sk_node);
918		sock_lock_init(newsk);
919		bh_lock_sock(newsk);
920
921		atomic_set(&newsk->sk_rmem_alloc, 0);
922		atomic_set(&newsk->sk_wmem_alloc, 0);
923		atomic_set(&newsk->sk_omem_alloc, 0);
924		skb_queue_head_init(&newsk->sk_receive_queue);
925		skb_queue_head_init(&newsk->sk_write_queue);
926#ifdef CONFIG_NET_DMA
927		skb_queue_head_init(&newsk->sk_async_wait_queue);
928#endif
929
930		rwlock_init(&newsk->sk_dst_lock);
931		rwlock_init(&newsk->sk_callback_lock);
932		lockdep_set_class(&newsk->sk_callback_lock,
933				   af_callback_keys + newsk->sk_family);
934
935		newsk->sk_dst_cache	= NULL;
936		newsk->sk_wmem_queued	= 0;
937		newsk->sk_forward_alloc = 0;
938		newsk->sk_send_head	= NULL;
939		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
940		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
941
942		sock_reset_flag(newsk, SOCK_DONE);
943		skb_queue_head_init(&newsk->sk_error_queue);
944
945		filter = newsk->sk_filter;
946		if (filter != NULL)
947			sk_filter_charge(newsk, filter);
948
949		if (unlikely(xfrm_sk_clone_policy(newsk))) {
950			/* It is still raw copy of parent, so invalidate
951			 * destructor and make plain sk_free() */
952			newsk->sk_destruct = NULL;
953			sk_free(newsk);
954			newsk = NULL;
955			goto out;
956		}
957
958		newsk->sk_err	   = 0;
959		newsk->sk_priority = 0;
960		atomic_set(&newsk->sk_refcnt, 2);
961
962		/*
963		 * Increment the counter in the same struct proto as the master
964		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
965		 * is the same as sk->sk_prot->socks, as this field was copied
966		 * with memcpy).
967		 *
968		 * This _changes_ the previous behaviour, where
969		 * tcp_create_openreq_child always was incrementing the
970		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
971		 * to be taken into account in all callers. -acme
972		 */
973		sk_refcnt_debug_inc(newsk);
974		newsk->sk_socket = NULL;
975		newsk->sk_sleep	 = NULL;
976
977		if (newsk->sk_prot->sockets_allocated)
978			atomic_inc(newsk->sk_prot->sockets_allocated);
979	}
980out:
981	return newsk;
982}
983
984EXPORT_SYMBOL_GPL(sk_clone);
985
986void __init sk_init(void)
987{
988	if (num_physpages <= 4096) {
989		sysctl_wmem_max = 32767;
990		sysctl_rmem_max = 32767;
991		sysctl_wmem_default = 32767;
992		sysctl_rmem_default = 32767;
993	} else if (num_physpages >= 131072) {
994		sysctl_wmem_max = 131071;
995		sysctl_rmem_max = 131071;
996	}
997}
998
999/*
1000 *	Simple resource managers for sockets.
1001 */
1002
1003
1004/*
1005 * Write buffer destructor automatically called from kfree_skb.
1006 */
1007void sock_wfree(struct sk_buff *skb)
1008{
1009	struct sock *sk = skb->sk;
1010
1011	/* In case it might be waiting for more memory. */
1012	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1013	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1014		sk->sk_write_space(sk);
1015	sock_put(sk);
1016}
1017
1018/*
1019 * Read buffer destructor automatically called from kfree_skb.
1020 */
1021void sock_rfree(struct sk_buff *skb)
1022{
1023	struct sock *sk = skb->sk;
1024
1025	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1026}
1027
1028
1029int sock_i_uid(struct sock *sk)
1030{
1031	int uid;
1032
1033	read_lock(&sk->sk_callback_lock);
1034	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1035	read_unlock(&sk->sk_callback_lock);
1036	return uid;
1037}
1038
1039unsigned long sock_i_ino(struct sock *sk)
1040{
1041	unsigned long ino;
1042
1043	read_lock(&sk->sk_callback_lock);
1044	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1045	read_unlock(&sk->sk_callback_lock);
1046	return ino;
1047}
1048
1049/*
1050 * Allocate a skb from the socket's send buffer.
1051 */
1052struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1053			     gfp_t priority)
1054{
1055	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1056		struct sk_buff * skb = alloc_skb(size, priority);
1057		if (skb) {
1058			skb_set_owner_w(skb, sk);
1059			return skb;
1060		}
1061	}
1062	return NULL;
1063}
1064
1065/*
1066 * Allocate a skb from the socket's receive buffer.
1067 */
1068struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1069			     gfp_t priority)
1070{
1071	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1072		struct sk_buff *skb = alloc_skb(size, priority);
1073		if (skb) {
1074			skb_set_owner_r(skb, sk);
1075			return skb;
1076		}
1077	}
1078	return NULL;
1079}
1080
1081/*
1082 * Allocate a memory block from the socket's option memory buffer.
1083 */
1084void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1085{
1086	if ((unsigned)size <= sysctl_optmem_max &&
1087	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1088		void *mem;
1089		/* First do the add, to avoid the race if kmalloc
1090 		 * might sleep.
1091		 */
1092		atomic_add(size, &sk->sk_omem_alloc);
1093		mem = kmalloc(size, priority);
1094		if (mem)
1095			return mem;
1096		atomic_sub(size, &sk->sk_omem_alloc);
1097	}
1098	return NULL;
1099}
1100
1101/*
1102 * Free an option memory block.
1103 */
1104void sock_kfree_s(struct sock *sk, void *mem, int size)
1105{
1106	kfree(mem);
1107	atomic_sub(size, &sk->sk_omem_alloc);
1108}
1109
1110/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1111   I think, these locks should be removed for datagram sockets.
1112 */
1113static long sock_wait_for_wmem(struct sock * sk, long timeo)
1114{
1115	DEFINE_WAIT(wait);
1116
1117	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1118	for (;;) {
1119		if (!timeo)
1120			break;
1121		if (signal_pending(current))
1122			break;
1123		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1124		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1125		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1126			break;
1127		if (sk->sk_shutdown & SEND_SHUTDOWN)
1128			break;
1129		if (sk->sk_err)
1130			break;
1131		timeo = schedule_timeout(timeo);
1132	}
1133	finish_wait(sk->sk_sleep, &wait);
1134	return timeo;
1135}
1136
1137
1138/*
1139 *	Generic send/receive buffer handlers
1140 */
1141
1142static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1143					    unsigned long header_len,
1144					    unsigned long data_len,
1145					    int noblock, int *errcode)
1146{
1147	struct sk_buff *skb;
1148	gfp_t gfp_mask;
1149	long timeo;
1150	int err;
1151
1152	gfp_mask = sk->sk_allocation;
1153	if (gfp_mask & __GFP_WAIT)
1154		gfp_mask |= __GFP_REPEAT;
1155
1156	timeo = sock_sndtimeo(sk, noblock);
1157	while (1) {
1158		err = sock_error(sk);
1159		if (err != 0)
1160			goto failure;
1161
1162		err = -EPIPE;
1163		if (sk->sk_shutdown & SEND_SHUTDOWN)
1164			goto failure;
1165
1166		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1167			skb = alloc_skb(header_len, gfp_mask);
1168			if (skb) {
1169				int npages;
1170				int i;
1171
1172				/* No pages, we're done... */
1173				if (!data_len)
1174					break;
1175
1176				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1177				skb->truesize += data_len;
1178				skb_shinfo(skb)->nr_frags = npages;
1179				for (i = 0; i < npages; i++) {
1180					struct page *page;
1181					skb_frag_t *frag;
1182
1183					page = alloc_pages(sk->sk_allocation, 0);
1184					if (!page) {
1185						err = -ENOBUFS;
1186						skb_shinfo(skb)->nr_frags = i;
1187						kfree_skb(skb);
1188						goto failure;
1189					}
1190
1191					frag = &skb_shinfo(skb)->frags[i];
1192					frag->page = page;
1193					frag->page_offset = 0;
1194					frag->size = (data_len >= PAGE_SIZE ?
1195						      PAGE_SIZE :
1196						      data_len);
1197					data_len -= PAGE_SIZE;
1198				}
1199
1200				/* Full success... */
1201				break;
1202			}
1203			err = -ENOBUFS;
1204			goto failure;
1205		}
1206		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1207		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1208		err = -EAGAIN;
1209		if (!timeo)
1210			goto failure;
1211		if (signal_pending(current))
1212			goto interrupted;
1213		timeo = sock_wait_for_wmem(sk, timeo);
1214	}
1215
1216	skb_set_owner_w(skb, sk);
1217	return skb;
1218
1219interrupted:
1220	err = sock_intr_errno(timeo);
1221failure:
1222	*errcode = err;
1223	return NULL;
1224}
1225
/*
 * Allocate a purely linear send skb of @size bytes: a wrapper around
 * sock_alloc_send_pskb() that requests no paged data.  NULL is returned
 * on failure with the error stored in *@errcode.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1231
1232static void __lock_sock(struct sock *sk)
1233{
1234	DEFINE_WAIT(wait);
1235
1236	for(;;) {
1237		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1238					TASK_UNINTERRUPTIBLE);
1239		spin_unlock_bh(&sk->sk_lock.slock);
1240		schedule();
1241		spin_lock_bh(&sk->sk_lock.slock);
1242		if(!sock_owned_by_user(sk))
1243			break;
1244	}
1245	finish_wait(&sk->sk_lock.wq, &wait);
1246}
1247
1248static void __release_sock(struct sock *sk)
1249{
1250	struct sk_buff *skb = sk->sk_backlog.head;
1251
1252	do {
1253		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1254		bh_unlock_sock(sk);
1255
1256		do {
1257			struct sk_buff *next = skb->next;
1258
1259			skb->next = NULL;
1260			sk->sk_backlog_rcv(sk, skb);
1261
1262			/*
1263			 * We are in process context here with softirqs
1264			 * disabled, use cond_resched_softirq() to preempt.
1265			 * This is safe to do because we've taken the backlog
1266			 * queue private:
1267			 */
1268			cond_resched_softirq();
1269
1270			skb = next;
1271		} while (skb != NULL);
1272
1273		bh_lock_sock(sk);
1274	} while((skb = sk->sk_backlog.head) != NULL);
1275}
1276
1277/**
1278 * sk_wait_data - wait for data to arrive at sk_receive_queue
1279 * @sk:    sock to wait on
1280 * @timeo: for how long
1281 *
1282 * Now socket state including sk->sk_err is changed only under lock,
1283 * hence we may omit checks after joining wait queue.
1284 * We check receive queue before schedule() only as optimization;
1285 * it is very likely that release_sock() added new data.
1286 */
1287int sk_wait_data(struct sock *sk, long *timeo)
1288{
1289	int rc;
1290	DEFINE_WAIT(wait);
1291
1292	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1293	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1294	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1295	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1296	finish_wait(sk->sk_sleep, &wait);
1297	return rc;
1298}
1299
1300EXPORT_SYMBOL(sk_wait_data);
1301
1302/*
1303 * Set of default routines for initialising struct proto_ops when
1304 * the protocol does not support a particular function. In certain
1305 * cases where it makes no sense for a protocol to have a "do nothing"
1306 * function, some default processing is provided.
1307 */
1308
/* Default bind handler for protocols without bind(): always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1313
/* Default connect handler for protocols without connect(): always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1319
/* Default socketpair handler for protocols without socketpair(): always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1324
/* Default accept handler for protocols without accept(): always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1329
/*
 * Default getname handler for protocols that cannot report a local or
 * peer address: always -EOPNOTSUPP; @saddr and @len are left untouched.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1335
1336unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1337{
1338	return 0;
1339}
1340
/* Default ioctl handler for protocols without ioctls: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1345
/* Default listen handler for protocols without listen(): always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1350
/* Default shutdown handler for protocols without shutdown(): always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1355
1356int sock_no_setsockopt(struct socket *sock, int level, int optname,
1357		    char __user *optval, int optlen)
1358{
1359	return -EOPNOTSUPP;
1360}
1361
1362int sock_no_getsockopt(struct socket *sock, int level, int optname,
1363		    char __user *optval, int __user *optlen)
1364{
1365	return -EOPNOTSUPP;
1366}
1367
/* Default sendmsg handler for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1373
/* Default recvmsg handler for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1379
/*
 * Default mmap handler for protocols without mmap support.  Returns
 * -ENODEV rather than -EOPNOTSUPP to mirror the error code of a missing
 * mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
1385
1386ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1387{
1388	ssize_t res;
1389	struct msghdr msg = {.msg_flags = flags};
1390	struct kvec iov;
1391	char *kaddr = kmap(page);
1392	iov.iov_base = kaddr + offset;
1393	iov.iov_len = size;
1394	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1395	kunmap(page);
1396	return res;
1397}
1398
1399/*
1400 *	Default Socket Callbacks
1401 */
1402
1403static void sock_def_wakeup(struct sock *sk)
1404{
1405	read_lock(&sk->sk_callback_lock);
1406	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1407		wake_up_interruptible_all(sk->sk_sleep);
1408	read_unlock(&sk->sk_callback_lock);
1409}
1410
1411static void sock_def_error_report(struct sock *sk)
1412{
1413	read_lock(&sk->sk_callback_lock);
1414	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1415		wake_up_interruptible(sk->sk_sleep);
1416	sk_wake_async(sk,0,POLL_ERR);
1417	read_unlock(&sk->sk_callback_lock);
1418}
1419
1420static void sock_def_readable(struct sock *sk, int len)
1421{
1422	read_lock(&sk->sk_callback_lock);
1423	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1424		wake_up_interruptible(sk->sk_sleep);
1425	sk_wake_async(sk,1,POLL_IN);
1426	read_unlock(&sk->sk_callback_lock);
1427}
1428
1429static void sock_def_write_space(struct sock *sk)
1430{
1431	read_lock(&sk->sk_callback_lock);
1432
1433	/* Do not wake up a writer until he can make "significant"
1434	 * progress.  --DaveM
1435	 */
1436	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1437		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1438			wake_up_interruptible(sk->sk_sleep);
1439
1440		/* Should agree with poll, otherwise some programs break */
1441		if (sock_writeable(sk))
1442			sk_wake_async(sk, 2, POLL_OUT);
1443	}
1444
1445	read_unlock(&sk->sk_callback_lock);
1446}
1447
1448static void sock_def_destruct(struct sock *sk)
1449{
1450	kfree(sk->sk_protinfo);
1451}
1452
1453void sk_send_sigurg(struct sock *sk)
1454{
1455	if (sk->sk_socket && sk->sk_socket->file)
1456		if (send_sigurg(&sk->sk_socket->file->f_owner))
1457			sk_wake_async(sk, 3, POLL_PRI);
1458}
1459
/*
 * (Re)arm @timer to fire at @expires.  A sock reference is taken only
 * when mod_timer() returns zero, so that at most one reference is held
 * per armed timer; sk_stop_timer() drops it again.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int rearmed = mod_timer(timer, expires);

	if (!rearmed)
		sock_hold(sk);
}
1466
1467EXPORT_SYMBOL(sk_reset_timer);
1468
/*
 * Cancel @timer if it is pending.  The sock reference is dropped only
 * when del_timer() reports that it actually removed the timer.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
1474
1475EXPORT_SYMBOL(sk_stop_timer);
1476
1477void sock_init_data(struct socket *sock, struct sock *sk)
1478{
1479	skb_queue_head_init(&sk->sk_receive_queue);
1480	skb_queue_head_init(&sk->sk_write_queue);
1481	skb_queue_head_init(&sk->sk_error_queue);
1482#ifdef CONFIG_NET_DMA
1483	skb_queue_head_init(&sk->sk_async_wait_queue);
1484#endif
1485
1486	sk->sk_send_head	=	NULL;
1487
1488	init_timer(&sk->sk_timer);
1489
1490	sk->sk_allocation	=	GFP_KERNEL;
1491	sk->sk_rcvbuf		=	sysctl_rmem_default;
1492	sk->sk_sndbuf		=	sysctl_wmem_default;
1493	sk->sk_state		=	TCP_CLOSE;
1494	sk->sk_socket		=	sock;
1495
1496	sock_set_flag(sk, SOCK_ZAPPED);
1497
1498	if(sock)
1499	{
1500		sk->sk_type	=	sock->type;
1501		sk->sk_sleep	=	&sock->wait;
1502		sock->sk	=	sk;
1503	} else
1504		sk->sk_sleep	=	NULL;
1505
1506	rwlock_init(&sk->sk_dst_lock);
1507	rwlock_init(&sk->sk_callback_lock);
1508	lockdep_set_class(&sk->sk_callback_lock,
1509			   af_callback_keys + sk->sk_family);
1510
1511	sk->sk_state_change	=	sock_def_wakeup;
1512	sk->sk_data_ready	=	sock_def_readable;
1513	sk->sk_write_space	=	sock_def_write_space;
1514	sk->sk_error_report	=	sock_def_error_report;
1515	sk->sk_destruct		=	sock_def_destruct;
1516
1517	sk->sk_sndmsg_page	=	NULL;
1518	sk->sk_sndmsg_off	=	0;
1519
1520	sk->sk_peercred.pid 	=	0;
1521	sk->sk_peercred.uid	=	-1;
1522	sk->sk_peercred.gid	=	-1;
1523	sk->sk_write_pending	=	0;
1524	sk->sk_rcvlowat		=	1;
1525	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1526	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1527
1528	sk->sk_stamp.tv_sec     = -1L;
1529	sk->sk_stamp.tv_usec    = -1L;
1530
1531	atomic_set(&sk->sk_refcnt, 1);
1532}
1533
1534void fastcall lock_sock_nested(struct sock *sk, int subclass)
1535{
1536	might_sleep();
1537	spin_lock_bh(&sk->sk_lock.slock);
1538	if (sk->sk_lock.owner)
1539		__lock_sock(sk);
1540	sk->sk_lock.owner = (void *)1;
1541	spin_unlock(&sk->sk_lock.slock);
1542	/*
1543	 * The sk_lock has mutex_lock() semantics here:
1544	 */
1545	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1546	local_bh_enable();
1547}
1548
1549EXPORT_SYMBOL(lock_sock_nested);
1550
1551void fastcall release_sock(struct sock *sk)
1552{
1553	/*
1554	 * The sk_lock has mutex_unlock() semantics:
1555	 */
1556	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1557
1558	spin_lock_bh(&sk->sk_lock.slock);
1559	if (sk->sk_backlog.tail)
1560		__release_sock(sk);
1561	sk->sk_lock.owner = NULL;
1562	if (waitqueue_active(&sk->sk_lock.wq))
1563		wake_up(&sk->sk_lock.wq);
1564	spin_unlock_bh(&sk->sk_lock.slock);
1565}
1566EXPORT_SYMBOL(release_sock);
1567
1568int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1569{
1570	if (!sock_flag(sk, SOCK_TIMESTAMP))
1571		sock_enable_timestamp(sk);
1572	if (sk->sk_stamp.tv_sec == -1)
1573		return -ENOENT;
1574	if (sk->sk_stamp.tv_sec == 0)
1575		do_gettimeofday(&sk->sk_stamp);
1576	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1577		-EFAULT : 0;
1578}
1579EXPORT_SYMBOL(sock_get_timestamp);
1580
1581void sock_enable_timestamp(struct sock *sk)
1582{
1583	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1584		sock_set_flag(sk, SOCK_TIMESTAMP);
1585		net_enable_timestamp();
1586	}
1587}
1588EXPORT_SYMBOL(sock_enable_timestamp);
1589
1590/*
1591 *	Get a socket option on an socket.
1592 *
1593 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1594 *	asynchronous errors should be reported by getsockopt. We assume
1595 *	this means if you specify SO_ERROR (otherwise whats the point of it).
1596 */
1597int sock_common_getsockopt(struct socket *sock, int level, int optname,
1598			   char __user *optval, int __user *optlen)
1599{
1600	struct sock *sk = sock->sk;
1601
1602	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1603}
1604
1605EXPORT_SYMBOL(sock_common_getsockopt);
1606
1607#ifdef CONFIG_COMPAT
1608int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1609				  char __user *optval, int __user *optlen)
1610{
1611	struct sock *sk = sock->sk;
1612
1613	if (sk->sk_prot->compat_setsockopt != NULL)
1614		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1615						      optval, optlen);
1616	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1617}
1618EXPORT_SYMBOL(compat_sock_common_getsockopt);
1619#endif
1620
1621int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1622			struct msghdr *msg, size_t size, int flags)
1623{
1624	struct sock *sk = sock->sk;
1625	int addr_len = 0;
1626	int err;
1627
1628	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1629				   flags & ~MSG_DONTWAIT, &addr_len);
1630	if (err >= 0)
1631		msg->msg_namelen = addr_len;
1632	return err;
1633}
1634
1635EXPORT_SYMBOL(sock_common_recvmsg);
1636
1637/*
1638 *	Set socket options on an inet socket.
1639 */
1640int sock_common_setsockopt(struct socket *sock, int level, int optname,
1641			   char __user *optval, int optlen)
1642{
1643	struct sock *sk = sock->sk;
1644
1645	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1646}
1647
1648EXPORT_SYMBOL(sock_common_setsockopt);
1649
1650#ifdef CONFIG_COMPAT
1651int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1652				  char __user *optval, int optlen)
1653{
1654	struct sock *sk = sock->sk;
1655
1656	if (sk->sk_prot->compat_setsockopt != NULL)
1657		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1658						      optval, optlen);
1659	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1660}
1661EXPORT_SYMBOL(compat_sock_common_setsockopt);
1662#endif
1663
1664void sk_common_release(struct sock *sk)
1665{
1666	if (sk->sk_prot->destroy)
1667		sk->sk_prot->destroy(sk);
1668
1669	/*
1670	 * Observation: when sock_common_release is called, processes have
1671	 * no access to socket. But net still has.
1672	 * Step one, detach it from networking:
1673	 *
1674	 * A. Remove from hash tables.
1675	 */
1676
1677	sk->sk_prot->unhash(sk);
1678
1679	/*
1680	 * In this point socket cannot receive new packets, but it is possible
1681	 * that some packets are in flight because some CPU runs receiver and
1682	 * did hash table lookup before we unhashed socket. They will achieve
1683	 * receive queue and will be purged by socket destructor.
1684	 *
1685	 * Also we still have packets pending on receive queue and probably,
1686	 * our own packets waiting in device queues. sock_destroy will drain
1687	 * receive queue, but transmitted packets will delay socket destruction
1688	 * until the last reference will be released.
1689	 */
1690
1691	sock_orphan(sk);
1692
1693	xfrm_sk_free_policy(sk);
1694
1695	sk_refcnt_debug_release(sk);
1696	sock_put(sk);
1697}
1698
1699EXPORT_SYMBOL(sk_common_release);
1700
/*
 * List of registered protocols (linked through struct proto's node
 * member).  proto_register()/proto_unregister() modify it under the
 * write side of proto_list_lock; the /proc seq_file code walks it under
 * the read side.
 */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
1703
1704int proto_register(struct proto *prot, int alloc_slab)
1705{
1706	char *request_sock_slab_name = NULL;
1707	char *timewait_sock_slab_name;
1708	int rc = -ENOBUFS;
1709
1710	if (alloc_slab) {
1711		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1712					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1713
1714		if (prot->slab == NULL) {
1715			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1716			       prot->name);
1717			goto out;
1718		}
1719
1720		if (prot->rsk_prot != NULL) {
1721			static const char mask[] = "request_sock_%s";
1722
1723			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1724			if (request_sock_slab_name == NULL)
1725				goto out_free_sock_slab;
1726
1727			sprintf(request_sock_slab_name, mask, prot->name);
1728			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1729								 prot->rsk_prot->obj_size, 0,
1730								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1731
1732			if (prot->rsk_prot->slab == NULL) {
1733				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1734				       prot->name);
1735				goto out_free_request_sock_slab_name;
1736			}
1737		}
1738
1739		if (prot->twsk_prot != NULL) {
1740			static const char mask[] = "tw_sock_%s";
1741
1742			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1743
1744			if (timewait_sock_slab_name == NULL)
1745				goto out_free_request_sock_slab;
1746
1747			sprintf(timewait_sock_slab_name, mask, prot->name);
1748			prot->twsk_prot->twsk_slab =
1749				kmem_cache_create(timewait_sock_slab_name,
1750						  prot->twsk_prot->twsk_obj_size,
1751						  0, SLAB_HWCACHE_ALIGN,
1752						  NULL, NULL);
1753			if (prot->twsk_prot->twsk_slab == NULL)
1754				goto out_free_timewait_sock_slab_name;
1755		}
1756	}
1757
1758	write_lock(&proto_list_lock);
1759	list_add(&prot->node, &proto_list);
1760	write_unlock(&proto_list_lock);
1761	rc = 0;
1762out:
1763	return rc;
1764out_free_timewait_sock_slab_name:
1765	kfree(timewait_sock_slab_name);
1766out_free_request_sock_slab:
1767	if (prot->rsk_prot && prot->rsk_prot->slab) {
1768		kmem_cache_destroy(prot->rsk_prot->slab);
1769		prot->rsk_prot->slab = NULL;
1770	}
1771out_free_request_sock_slab_name:
1772	kfree(request_sock_slab_name);
1773out_free_sock_slab:
1774	kmem_cache_destroy(prot->slab);
1775	prot->slab = NULL;
1776	goto out;
1777}
1778
1779EXPORT_SYMBOL(proto_register);
1780
1781void proto_unregister(struct proto *prot)
1782{
1783	write_lock(&proto_list_lock);
1784	list_del(&prot->node);
1785	write_unlock(&proto_list_lock);
1786
1787	if (prot->slab != NULL) {
1788		kmem_cache_destroy(prot->slab);
1789		prot->slab = NULL;
1790	}
1791
1792	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1793		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1794
1795		kmem_cache_destroy(prot->rsk_prot->slab);
1796		kfree(name);
1797		prot->rsk_prot->slab = NULL;
1798	}
1799
1800	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1801		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1802
1803		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1804		kfree(name);
1805		prot->twsk_prot->twsk_slab = NULL;
1806	}
1807}
1808
1809EXPORT_SYMBOL(proto_unregister);
1810
1811#ifdef CONFIG_PROC_FS
1812static inline struct proto *__proto_head(void)
1813{
1814	return list_entry(proto_list.next, struct proto, node);
1815}
1816
1817static inline struct proto *proto_head(void)
1818{
1819	return list_empty(&proto_list) ? NULL : __proto_head();
1820}
1821
1822static inline struct proto *proto_next(struct proto *proto)
1823{
1824	return proto->node.next == &proto_list ? NULL :
1825		list_entry(proto->node.next, struct proto, node);
1826}
1827
1828static inline struct proto *proto_get_idx(loff_t pos)
1829{
1830	struct proto *proto;
1831	loff_t i = 0;
1832
1833	list_for_each_entry(proto, &proto_list, node)
1834		if (i++ == pos)
1835			goto out;
1836
1837	proto = NULL;
1838out:
1839	return proto;
1840}
1841
1842static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1843{
1844	read_lock(&proto_list_lock);
1845	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1846}
1847
1848static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1849{
1850	++*pos;
1851	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1852}
1853
1854static void proto_seq_stop(struct seq_file *seq, void *v)
1855{
1856	read_unlock(&proto_list_lock);
1857}
1858
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
1863
1864static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1865{
1866	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1867			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1868		   proto->name,
1869		   proto->obj_size,
1870		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1871		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1872		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1873		   proto->max_header,
1874		   proto->slab == NULL ? "no" : "yes",
1875		   module_name(proto->owner),
1876		   proto_method_implemented(proto->close),
1877		   proto_method_implemented(proto->connect),
1878		   proto_method_implemented(proto->disconnect),
1879		   proto_method_implemented(proto->accept),
1880		   proto_method_implemented(proto->ioctl),
1881		   proto_method_implemented(proto->init),
1882		   proto_method_implemented(proto->destroy),
1883		   proto_method_implemented(proto->shutdown),
1884		   proto_method_implemented(proto->setsockopt),
1885		   proto_method_implemented(proto->getsockopt),
1886		   proto_method_implemented(proto->sendmsg),
1887		   proto_method_implemented(proto->recvmsg),
1888		   proto_method_implemented(proto->sendpage),
1889		   proto_method_implemented(proto->bind),
1890		   proto_method_implemented(proto->backlog_rcv),
1891		   proto_method_implemented(proto->hash),
1892		   proto_method_implemented(proto->unhash),
1893		   proto_method_implemented(proto->get_port),
1894		   proto_method_implemented(proto->enter_memory_pressure));
1895}
1896
1897static int proto_seq_show(struct seq_file *seq, void *v)
1898{
1899	if (v == SEQ_START_TOKEN)
1900		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1901			   "protocol",
1902			   "size",
1903			   "sockets",
1904			   "memory",
1905			   "press",
1906			   "maxhdr",
1907			   "slab",
1908			   "module",
1909			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1910	else
1911		proto_seq_printf(seq, v);
1912	return 0;
1913}
1914
1915static struct seq_operations proto_seq_ops = {
1916	.start  = proto_seq_start,
1917	.next   = proto_seq_next,
1918	.stop   = proto_seq_stop,
1919	.show   = proto_seq_show,
1920};
1921
1922static int proto_seq_open(struct inode *inode, struct file *file)
1923{
1924	return seq_open(file, &proto_seq_ops);
1925}
1926
1927static struct file_operations proto_seq_fops = {
1928	.owner		= THIS_MODULE,
1929	.open		= proto_seq_open,
1930	.read		= seq_read,
1931	.llseek		= seq_lseek,
1932	.release	= seq_release,
1933};
1934
1935static int __init proto_init(void)
1936{
1937	/* register /proc/net/protocols */
1938	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1939}
1940
1941subsys_initcall(proto_init);
1942
1943#endif /* PROC_FS */
1944
/*
 * Symbols exported to modules that are not exported next to their
 * definitions.  The rmem/wmem limits are exported only when
 * CONFIG_SYSCTL is enabled.
 */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
1977#endif
1978