sock.c revision 58a5a7b9555ea231b557ebef5cabeaf8e951df0b
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <net/xfrm.h>
124#include <linux/ipsec.h>
125
126#include <linux/filter.h>
127
128#ifdef CONFIG_INET
129#include <net/tcp.h>
130#endif
131
132/*
133 * Each address family might have different locking rules, so we have
134 * one slock key per address family:
135 */
136static struct lock_class_key af_family_keys[AF_MAX];
137static struct lock_class_key af_family_slock_keys[AF_MAX];
138
139#ifdef CONFIG_DEBUG_LOCK_ALLOC
140/*
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
157};
158static const char *af_family_slock_key_strings[AF_MAX+1] = {
159  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
160  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
161  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
162  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
163  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
164  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
165  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
166  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
167  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
168  "slock-27"       , "slock-28"          , "slock-29"          ,
169  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
170};
171#endif
172
173/*
174 * sk_callback_lock locking rules are per-address-family,
175 * so split the lock classes by using a per-AF key:
176 */
177static struct lock_class_key af_callback_keys[AF_MAX];
178
179/* Take into consideration the size of the struct sk_buff overhead in the
180 * determination of these values, since that is non-constant across
181 * platforms.  This makes socket queueing behavior and performance
182 * not depend upon such differences.
183 */
184#define _SK_MEM_PACKETS		256
185#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
186#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
187#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
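
/*
 * Worked example of the defaults above (purely illustrative, since
 * sizeof(struct sk_buff) varies by architecture and config options):
 * assuming a hypothetical 256-byte struct sk_buff,
 *
 *	_SK_MEM_OVERHEAD = 256 + 256              = 512 bytes
 *	SK_WMEM_MAX      = 512 * _SK_MEM_PACKETS  = 512 * 256 = 131072 bytes
 *
 * i.e. the default buffer limits are sized to hold roughly 256 average
 * packets including their metadata, not 128KB of raw payload.
 */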
188
189/* Run time adjustable parameters. */
190__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
191__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
192__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
193__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
194
195/* Maximal space eaten by iovec or ancillary data plus some space */
196int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
197
198static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
199{
200	struct timeval tv;
201
202	if (optlen < sizeof(tv))
203		return -EINVAL;
204	if (copy_from_user(&tv, optval, sizeof(tv)))
205		return -EFAULT;
206
207	*timeo_p = MAX_SCHEDULE_TIMEOUT;
208	if (tv.tv_sec == 0 && tv.tv_usec == 0)
209		return 0;
210	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
211		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
212	return 0;
213}
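
/*
 * Usage sketch for sock_set_timeout() above (illustrative, not a caller
 * in this file): SO_RCVTIMEO/SO_SNDTIMEO pass their optval straight
 * through, and the timeval is rounded up to whole jiffies.  With HZ=100
 * a request of tv = { 0, 15000 } (15ms) becomes
 *
 *	0 * HZ + (15000 + 9999) / 10000 = 2 jiffies
 *
 * so a small non-zero timeout is never rounded down to zero, while
 * { 0, 0 } means "block forever" (MAX_SCHEDULE_TIMEOUT).
 */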
214
215static void sock_warn_obsolete_bsdism(const char *name)
216{
217	static int warned;
218	static char warncomm[TASK_COMM_LEN];
219	if (strcmp(warncomm, current->comm) && warned < 5) {
220		strcpy(warncomm,  current->comm);
221		printk(KERN_WARNING "process `%s' is using obsolete "
222		       "%s SO_BSDCOMPAT\n", warncomm, name);
223		warned++;
224	}
225}
226
227static void sock_disable_timestamp(struct sock *sk)
228{
229	if (sock_flag(sk, SOCK_TIMESTAMP)) {
230		sock_reset_flag(sk, SOCK_TIMESTAMP);
231		net_disable_timestamp();
232	}
233}
234
235
236int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
237{
238	int err = 0;
239	int skb_len;
240
241	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
242	   number of warnings when compiling with -W --ANK
243	 */
244	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
245	    (unsigned)sk->sk_rcvbuf) {
246		err = -ENOMEM;
247		goto out;
248	}
249
250	err = sk_filter(sk, skb);
251	if (err)
252		goto out;
253
254	skb->dev = NULL;
255	skb_set_owner_r(skb, sk);
256
257	/* Cache the SKB length before we tack it onto the receive
258	 * queue.  Once it is added it no longer belongs to us and
259	 * may be freed by other threads of control pulling packets
260	 * from the queue.
261	 */
262	skb_len = skb->len;
263
264	skb_queue_tail(&sk->sk_receive_queue, skb);
265
266	if (!sock_flag(sk, SOCK_DEAD))
267		sk->sk_data_ready(sk, skb_len);
268out:
269	return err;
270}
271EXPORT_SYMBOL(sock_queue_rcv_skb);
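
/*
 * Illustrative sketch (hypothetical protocol code, not part of this file)
 * of the usual calling convention for sock_queue_rcv_skb() above: on
 * failure the skb is NOT consumed, so the caller must free it itself.
 *
 *	static int hypothetical_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 *
 * A negative return typically means the receive buffer was full (-ENOMEM)
 * or the packet was dropped by an attached socket filter.
 */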
272
273int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
274{
275	int rc = NET_RX_SUCCESS;
276
277	if (sk_filter(sk, skb))
278		goto discard_and_relse;
279
280	skb->dev = NULL;
281
282	if (nested)
283		bh_lock_sock_nested(sk);
284	else
285		bh_lock_sock(sk);
286	if (!sock_owned_by_user(sk)) {
287		/*
288		 * trylock + unlock semantics:
289		 */
290		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
291
292		rc = sk->sk_backlog_rcv(sk, skb);
293
294		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
295	} else
296		sk_add_backlog(sk, skb);
297	bh_unlock_sock(sk);
298out:
299	sock_put(sk);
300	return rc;
301discard_and_relse:
302	kfree_skb(skb);
303	goto out;
304}
305EXPORT_SYMBOL(sk_receive_skb);
306
307struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
308{
309	struct dst_entry *dst = sk->sk_dst_cache;
310
311	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
312		sk->sk_dst_cache = NULL;
313		dst_release(dst);
314		return NULL;
315	}
316
317	return dst;
318}
319EXPORT_SYMBOL(__sk_dst_check);
320
321struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
322{
323	struct dst_entry *dst = sk_dst_get(sk);
324
325	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
326		sk_dst_reset(sk);
327		dst_release(dst);
328		return NULL;
329	}
330
331	return dst;
332}
333EXPORT_SYMBOL(sk_dst_check);
334
335/*
336 *	This is meant for all protocols to use and covers goings on
337 *	at the socket level. Everything here is generic.
338 */
339
340int sock_setsockopt(struct socket *sock, int level, int optname,
341		    char __user *optval, int optlen)
342{
343	struct sock *sk=sock->sk;
344	struct sk_filter *filter;
345	int val;
346	int valbool;
347	struct linger ling;
348	int ret = 0;
349
350	/*
351	 *	Options without arguments
352	 */
353
354#ifdef SO_DONTLINGER		/* Compatibility item... */
355	if (optname == SO_DONTLINGER) {
356		lock_sock(sk);
357		sock_reset_flag(sk, SOCK_LINGER);
358		release_sock(sk);
359		return 0;
360	}
361#endif
362
363  	if(optlen<sizeof(int))
364  		return(-EINVAL);
365
366	if (get_user(val, (int __user *)optval))
367		return -EFAULT;
368
369  	valbool = val?1:0;
370
371	lock_sock(sk);
372
373  	switch(optname)
374  	{
375		case SO_DEBUG:
376			if(val && !capable(CAP_NET_ADMIN))
377			{
378				ret = -EACCES;
379			}
380			else if (valbool)
381				sock_set_flag(sk, SOCK_DBG);
382			else
383				sock_reset_flag(sk, SOCK_DBG);
384			break;
385		case SO_REUSEADDR:
386			sk->sk_reuse = valbool;
387			break;
388		case SO_TYPE:
389		case SO_ERROR:
390			ret = -ENOPROTOOPT;
391		  	break;
392		case SO_DONTROUTE:
393			if (valbool)
394				sock_set_flag(sk, SOCK_LOCALROUTE);
395			else
396				sock_reset_flag(sk, SOCK_LOCALROUTE);
397			break;
398		case SO_BROADCAST:
399			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
400			break;
401		case SO_SNDBUF:
402			/* Don't error on this; BSD doesn't, and if you think
403			   about it, this is right. Otherwise apps have to
404			   play 'guess the biggest size' games. RCVBUF/SNDBUF
405			   are treated in BSD as hints */
406
407			if (val > sysctl_wmem_max)
408				val = sysctl_wmem_max;
409set_sndbuf:
410			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
411			if ((val * 2) < SOCK_MIN_SNDBUF)
412				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
413			else
414				sk->sk_sndbuf = val * 2;
415
416			/*
417			 *	Wake up sending tasks if we
418			 *	upped the value.
419			 */
420			sk->sk_write_space(sk);
421			break;
422
423		case SO_SNDBUFFORCE:
424			if (!capable(CAP_NET_ADMIN)) {
425				ret = -EPERM;
426				break;
427			}
428			goto set_sndbuf;
429
430		case SO_RCVBUF:
431			/* Don't error on this; BSD doesn't, and if you think
432			   about it, this is right. Otherwise apps have to
433			   play 'guess the biggest size' games. RCVBUF/SNDBUF
434			   are treated in BSD as hints */
435
436			if (val > sysctl_rmem_max)
437				val = sysctl_rmem_max;
438set_rcvbuf:
439			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
440			/*
441			 * We double it on the way in to account for
442			 * "struct sk_buff" etc. overhead.   Applications
443			 * assume that the SO_RCVBUF setting they make will
444			 * allow that much actual data to be received on that
445			 * socket.
446			 *
447			 * Applications are unaware that "struct sk_buff" and
448			 * other overheads allocate from the receive buffer
449			 * during socket buffer allocation.
450			 *
451			 * And after considering the possible alternatives,
452			 * returning the value we actually used in getsockopt
453			 * is the most desirable behavior.
454			 */
455			if ((val * 2) < SOCK_MIN_RCVBUF)
456				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
457			else
458				sk->sk_rcvbuf = val * 2;
459			break;
460
461		case SO_RCVBUFFORCE:
462			if (!capable(CAP_NET_ADMIN)) {
463				ret = -EPERM;
464				break;
465			}
466			goto set_rcvbuf;
467
468		case SO_KEEPALIVE:
469#ifdef CONFIG_INET
470			if (sk->sk_protocol == IPPROTO_TCP)
471				tcp_set_keepalive(sk, valbool);
472#endif
473			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
474			break;
475
476	 	case SO_OOBINLINE:
477			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
478			break;
479
480	 	case SO_NO_CHECK:
481			sk->sk_no_check = valbool;
482			break;
483
484		case SO_PRIORITY:
485			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
486				sk->sk_priority = val;
487			else
488				ret = -EPERM;
489			break;
490
491		case SO_LINGER:
492			if(optlen<sizeof(ling)) {
493				ret = -EINVAL;	/* 1003.1g */
494				break;
495			}
496			if (copy_from_user(&ling,optval,sizeof(ling))) {
497				ret = -EFAULT;
498				break;
499			}
500			if (!ling.l_onoff)
501				sock_reset_flag(sk, SOCK_LINGER);
502			else {
503#if (BITS_PER_LONG == 32)
504				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
505					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
506				else
507#endif
508					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
509				sock_set_flag(sk, SOCK_LINGER);
510			}
511			break;
512
513		case SO_BSDCOMPAT:
514			sock_warn_obsolete_bsdism("setsockopt");
515			break;
516
517		case SO_PASSCRED:
518			if (valbool)
519				set_bit(SOCK_PASSCRED, &sock->flags);
520			else
521				clear_bit(SOCK_PASSCRED, &sock->flags);
522			break;
523
524		case SO_TIMESTAMP:
525			if (valbool)  {
526				sock_set_flag(sk, SOCK_RCVTSTAMP);
527				sock_enable_timestamp(sk);
528			} else
529				sock_reset_flag(sk, SOCK_RCVTSTAMP);
530			break;
531
532		case SO_RCVLOWAT:
533			if (val < 0)
534				val = INT_MAX;
535			sk->sk_rcvlowat = val ? : 1;
536			break;
537
538		case SO_RCVTIMEO:
539			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
540			break;
541
542		case SO_SNDTIMEO:
543			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
544			break;
545
546#ifdef CONFIG_NETDEVICES
547		case SO_BINDTODEVICE:
548		{
549			char devname[IFNAMSIZ];
550
551			/* Sorry... */
552			if (!capable(CAP_NET_RAW)) {
553				ret = -EPERM;
554				break;
555			}
556
557			/* Bind this socket to a particular device like "eth0",
558			 * as specified in the passed interface name. If the
559			 * name is "" or the option length is zero the socket
560			 * is not bound.
561			 */
562
563			if (!valbool) {
564				sk->sk_bound_dev_if = 0;
565			} else {
566				if (optlen > IFNAMSIZ - 1)
567					optlen = IFNAMSIZ - 1;
568				memset(devname, 0, sizeof(devname));
569				if (copy_from_user(devname, optval, optlen)) {
570					ret = -EFAULT;
571					break;
572				}
573
574				/* Remove any cached route for this socket. */
575				sk_dst_reset(sk);
576
577				if (devname[0] == '\0') {
578					sk->sk_bound_dev_if = 0;
579				} else {
580					struct net_device *dev = dev_get_by_name(devname);
581					if (!dev) {
582						ret = -ENODEV;
583						break;
584					}
585					sk->sk_bound_dev_if = dev->ifindex;
586					dev_put(dev);
587				}
588			}
589			break;
590		}
591#endif
592
593
594		case SO_ATTACH_FILTER:
595			ret = -EINVAL;
596			if (optlen == sizeof(struct sock_fprog)) {
597				struct sock_fprog fprog;
598
599				ret = -EFAULT;
600				if (copy_from_user(&fprog, optval, sizeof(fprog)))
601					break;
602
603				ret = sk_attach_filter(&fprog, sk);
604			}
605			break;
606
607		case SO_DETACH_FILTER:
608			rcu_read_lock_bh();
609			filter = rcu_dereference(sk->sk_filter);
610			if (filter) {
611				rcu_assign_pointer(sk->sk_filter, NULL);
612				sk_filter_release(sk, filter);
613				rcu_read_unlock_bh();
614				break;
615			}
616			rcu_read_unlock_bh();
617			ret = -ENONET;
618			break;
619
620		case SO_PASSSEC:
621			if (valbool)
622				set_bit(SOCK_PASSSEC, &sock->flags);
623			else
624				clear_bit(SOCK_PASSSEC, &sock->flags);
625			break;
626
627		/* We implement SO_SNDLOWAT etc. as not
628		   settable (1003.1g 5.3) */
629		default:
630		  	ret = -ENOPROTOOPT;
631			break;
632  	}
633	release_sock(sk);
634	return ret;
635}
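
/*
 * Worked example of the SO_SNDBUF/SO_RCVBUF handling above (values are
 * illustrative).  A process doing
 *
 *	int val = 4096;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * ends up with sk->sk_rcvbuf == 8192: the request is doubled to leave
 * room for struct sk_buff and other per-packet overhead, and a later
 * getsockopt(SO_RCVBUF) reports 8192, the value actually in use.
 * Requests are clamped to sysctl_rmem_max/sysctl_wmem_max before the
 * doubling; only SO_RCVBUFFORCE/SO_SNDBUFFORCE (CAP_NET_ADMIN) may
 * exceed those limits.
 */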
636
637
638int sock_getsockopt(struct socket *sock, int level, int optname,
639		    char __user *optval, int __user *optlen)
640{
641	struct sock *sk = sock->sk;
642
643	union
644	{
645  		int val;
646  		struct linger ling;
647		struct timeval tm;
648	} v;
649
650	unsigned int lv = sizeof(int);
651	int len;
652
653  	if(get_user(len,optlen))
654  		return -EFAULT;
655	if(len < 0)
656		return -EINVAL;
657
658  	switch(optname)
659  	{
660		case SO_DEBUG:
661			v.val = sock_flag(sk, SOCK_DBG);
662			break;
663
664		case SO_DONTROUTE:
665			v.val = sock_flag(sk, SOCK_LOCALROUTE);
666			break;
667
668		case SO_BROADCAST:
669			v.val = !!sock_flag(sk, SOCK_BROADCAST);
670			break;
671
672		case SO_SNDBUF:
673			v.val = sk->sk_sndbuf;
674			break;
675
676		case SO_RCVBUF:
677			v.val = sk->sk_rcvbuf;
678			break;
679
680		case SO_REUSEADDR:
681			v.val = sk->sk_reuse;
682			break;
683
684		case SO_KEEPALIVE:
685			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
686			break;
687
688		case SO_TYPE:
689			v.val = sk->sk_type;
690			break;
691
692		case SO_ERROR:
693			v.val = -sock_error(sk);
694			if(v.val==0)
695				v.val = xchg(&sk->sk_err_soft, 0);
696			break;
697
698		case SO_OOBINLINE:
699			v.val = !!sock_flag(sk, SOCK_URGINLINE);
700			break;
701
702		case SO_NO_CHECK:
703			v.val = sk->sk_no_check;
704			break;
705
706		case SO_PRIORITY:
707			v.val = sk->sk_priority;
708			break;
709
710		case SO_LINGER:
711			lv		= sizeof(v.ling);
712			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
713 			v.ling.l_linger	= sk->sk_lingertime / HZ;
714			break;
715
716		case SO_BSDCOMPAT:
717			sock_warn_obsolete_bsdism("getsockopt");
718			break;
719
720		case SO_TIMESTAMP:
721			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
722			break;
723
724		case SO_RCVTIMEO:
725			lv=sizeof(struct timeval);
726			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
727				v.tm.tv_sec = 0;
728				v.tm.tv_usec = 0;
729			} else {
730				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
731				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
732			}
733			break;
734
735		case SO_SNDTIMEO:
736			lv=sizeof(struct timeval);
737			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
738				v.tm.tv_sec = 0;
739				v.tm.tv_usec = 0;
740			} else {
741				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
742				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
743			}
744			break;
745
746		case SO_RCVLOWAT:
747			v.val = sk->sk_rcvlowat;
748			break;
749
750		case SO_SNDLOWAT:
751			v.val=1;
752			break;
753
754		case SO_PASSCRED:
755			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
756			break;
757
758		case SO_PEERCRED:
759			if (len > sizeof(sk->sk_peercred))
760				len = sizeof(sk->sk_peercred);
761			if (copy_to_user(optval, &sk->sk_peercred, len))
762				return -EFAULT;
763			goto lenout;
764
765		case SO_PEERNAME:
766		{
767			char address[128];
768
769			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
770				return -ENOTCONN;
771			if (lv < len)
772				return -EINVAL;
773			if (copy_to_user(optval, address, len))
774				return -EFAULT;
775			goto lenout;
776		}
777
778		/* Dubious BSD thing... Probably nobody even uses it, but
779		 * the UNIX standard wants it for whatever reason... -DaveM
780		 */
781		case SO_ACCEPTCONN:
782			v.val = sk->sk_state == TCP_LISTEN;
783			break;
784
785		case SO_PASSSEC:
786			v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
787			break;
788
789		case SO_PEERSEC:
790			return security_socket_getpeersec_stream(sock, optval, optlen, len);
791
792		default:
793			return(-ENOPROTOOPT);
794	}
795	if (len > lv)
796		len = lv;
797	if (copy_to_user(optval, &v, len))
798		return -EFAULT;
799lenout:
800  	if (put_user(len, optlen))
801  		return -EFAULT;
802  	return 0;
803}
804
805/*
806 * Initialize an sk_lock.
807 *
808 * (We also register the sk_lock with the lock validator.)
809 */
810static inline void sock_lock_init(struct sock *sk)
811{
812	spin_lock_init(&sk->sk_lock.slock);
813	sk->sk_lock.owner = NULL;
814	init_waitqueue_head(&sk->sk_lock.wq);
815	/*
816	 * Make sure we are not reinitializing a held lock:
817	 */
818	debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock));
819
820	/*
821	 * Mark both the sk_lock and the sk_lock.slock as a
822	 * per-address-family lock class:
823	 */
824	lockdep_set_class_and_name(&sk->sk_lock.slock,
825				   af_family_slock_keys + sk->sk_family,
826				   af_family_slock_key_strings[sk->sk_family]);
827	lockdep_init_map(&sk->sk_lock.dep_map,
828			 af_family_key_strings[sk->sk_family],
829			 af_family_keys + sk->sk_family, 0);
830}
831
832/**
833 *	sk_alloc - All socket objects are allocated here
834 *	@family: protocol family
835 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
836 *	@prot: struct proto associated with this new sock instance
837 *	@zero_it: if we should zero the newly allocated sock
838 */
839struct sock *sk_alloc(int family, gfp_t priority,
840		      struct proto *prot, int zero_it)
841{
842	struct sock *sk = NULL;
843	kmem_cache_t *slab = prot->slab;
844
845	if (slab != NULL)
846		sk = kmem_cache_alloc(slab, priority);
847	else
848		sk = kmalloc(prot->obj_size, priority);
849
850	if (sk) {
851		if (zero_it) {
852			memset(sk, 0, prot->obj_size);
853			sk->sk_family = family;
854			/*
855			 * See comment in struct sock definition to understand
856			 * why we need sk_prot_creator -acme
857			 */
858			sk->sk_prot = sk->sk_prot_creator = prot;
859			sock_lock_init(sk);
860		}
861
862		if (security_sk_alloc(sk, family, priority))
863			goto out_free;
864
865		if (!try_module_get(prot->owner))
866			goto out_free;
867	}
868	return sk;
869
870out_free:
871	if (slab != NULL)
872		kmem_cache_free(slab, sk);
873	else
874		kfree(sk);
875	return NULL;
876}
877
878void sk_free(struct sock *sk)
879{
880	struct sk_filter *filter;
881	struct module *owner = sk->sk_prot_creator->owner;
882
883	if (sk->sk_destruct)
884		sk->sk_destruct(sk);
885
886	filter = rcu_dereference(sk->sk_filter);
887	if (filter) {
888		sk_filter_release(sk, filter);
889		rcu_assign_pointer(sk->sk_filter, NULL);
890	}
891
892	sock_disable_timestamp(sk);
893
894	if (atomic_read(&sk->sk_omem_alloc))
895		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
896		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
897
898	security_sk_free(sk);
899	if (sk->sk_prot_creator->slab != NULL)
900		kmem_cache_free(sk->sk_prot_creator->slab, sk);
901	else
902		kfree(sk);
903	module_put(owner);
904}
905
906struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
907{
908	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
909
910	if (newsk != NULL) {
911		struct sk_filter *filter;
912
913		sock_copy(newsk, sk);
914
915		/* SANITY */
916		sk_node_init(&newsk->sk_node);
917		sock_lock_init(newsk);
918		bh_lock_sock(newsk);
919
920		atomic_set(&newsk->sk_rmem_alloc, 0);
921		atomic_set(&newsk->sk_wmem_alloc, 0);
922		atomic_set(&newsk->sk_omem_alloc, 0);
923		skb_queue_head_init(&newsk->sk_receive_queue);
924		skb_queue_head_init(&newsk->sk_write_queue);
925#ifdef CONFIG_NET_DMA
926		skb_queue_head_init(&newsk->sk_async_wait_queue);
927#endif
928
929		rwlock_init(&newsk->sk_dst_lock);
930		rwlock_init(&newsk->sk_callback_lock);
931		lockdep_set_class(&newsk->sk_callback_lock,
932				   af_callback_keys + newsk->sk_family);
933
934		newsk->sk_dst_cache	= NULL;
935		newsk->sk_wmem_queued	= 0;
936		newsk->sk_forward_alloc = 0;
937		newsk->sk_send_head	= NULL;
938		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
939		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
940
941		sock_reset_flag(newsk, SOCK_DONE);
942		skb_queue_head_init(&newsk->sk_error_queue);
943
944		filter = newsk->sk_filter;
945		if (filter != NULL)
946			sk_filter_charge(newsk, filter);
947
948		if (unlikely(xfrm_sk_clone_policy(newsk))) {
949			/* It is still a raw copy of the parent, so invalidate
950			 * the destructor and do a plain sk_free() */
951			newsk->sk_destruct = NULL;
952			sk_free(newsk);
953			newsk = NULL;
954			goto out;
955		}
956
957		newsk->sk_err	   = 0;
958		newsk->sk_priority = 0;
959		atomic_set(&newsk->sk_refcnt, 2);
960
961		/*
962		 * Increment the counter in the same struct proto as the master
963		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
964		 * is the same as sk->sk_prot->socks, as this field was copied
965		 * with memcpy).
966		 *
967		 * This _changes_ the previous behaviour, where
968		 * tcp_create_openreq_child was always incrementing the
969		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
970		 * to be taken into account in all callers. -acme
971		 */
972		sk_refcnt_debug_inc(newsk);
973		newsk->sk_socket = NULL;
974		newsk->sk_sleep	 = NULL;
975
976		if (newsk->sk_prot->sockets_allocated)
977			atomic_inc(newsk->sk_prot->sockets_allocated);
978	}
979out:
980	return newsk;
981}
982
983EXPORT_SYMBOL_GPL(sk_clone);
984
985void __init sk_init(void)
986{
987	if (num_physpages <= 4096) {
988		sysctl_wmem_max = 32767;
989		sysctl_rmem_max = 32767;
990		sysctl_wmem_default = 32767;
991		sysctl_rmem_default = 32767;
992	} else if (num_physpages >= 131072) {
993		sysctl_wmem_max = 131071;
994		sysctl_rmem_max = 131071;
995	}
996}
997
998/*
999 *	Simple resource managers for sockets.
1000 */
1001
1002
1003/*
1004 * Write buffer destructor automatically called from kfree_skb.
1005 */
1006void sock_wfree(struct sk_buff *skb)
1007{
1008	struct sock *sk = skb->sk;
1009
1010	/* In case it might be waiting for more memory. */
1011	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1012	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1013		sk->sk_write_space(sk);
1014	sock_put(sk);
1015}
1016
1017/*
1018 * Read buffer destructor automatically called from kfree_skb.
1019 */
1020void sock_rfree(struct sk_buff *skb)
1021{
1022	struct sock *sk = skb->sk;
1023
1024	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1025}
1026
1027
1028int sock_i_uid(struct sock *sk)
1029{
1030	int uid;
1031
1032	read_lock(&sk->sk_callback_lock);
1033	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1034	read_unlock(&sk->sk_callback_lock);
1035	return uid;
1036}
1037
1038unsigned long sock_i_ino(struct sock *sk)
1039{
1040	unsigned long ino;
1041
1042	read_lock(&sk->sk_callback_lock);
1043	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1044	read_unlock(&sk->sk_callback_lock);
1045	return ino;
1046}
1047
1048/*
1049 * Allocate a skb from the socket's send buffer.
1050 */
1051struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1052			     gfp_t priority)
1053{
1054	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1055		struct sk_buff * skb = alloc_skb(size, priority);
1056		if (skb) {
1057			skb_set_owner_w(skb, sk);
1058			return skb;
1059		}
1060	}
1061	return NULL;
1062}
1063
1064/*
1065 * Allocate a skb from the socket's receive buffer.
1066 */
1067struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1068			     gfp_t priority)
1069{
1070	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1071		struct sk_buff *skb = alloc_skb(size, priority);
1072		if (skb) {
1073			skb_set_owner_r(skb, sk);
1074			return skb;
1075		}
1076	}
1077	return NULL;
1078}
1079
1080/*
1081 * Allocate a memory block from the socket's option memory buffer.
1082 */
1083void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1084{
1085	if ((unsigned)size <= sysctl_optmem_max &&
1086	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1087		void *mem;
1088		/* First do the add, to avoid a race in case kmalloc
1089		 * sleeps.
1090		 */
1091		atomic_add(size, &sk->sk_omem_alloc);
1092		mem = kmalloc(size, priority);
1093		if (mem)
1094			return mem;
1095		atomic_sub(size, &sk->sk_omem_alloc);
1096	}
1097	return NULL;
1098}
1099
1100/*
1101 * Free an option memory block.
1102 */
1103void sock_kfree_s(struct sock *sk, void *mem, int size)
1104{
1105	kfree(mem);
1106	atomic_sub(size, &sk->sk_omem_alloc);
1107}
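
/*
 * Usage sketch for the option-memory helpers above (hypothetical caller):
 * the size passed to sock_kfree_s() must match the one given to
 * sock_kmalloc(), since that is what gets subtracted from sk_omem_alloc.
 *
 *	struct hypothetical_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */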
1108
1109/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1110   I think these locks should be removed for datagram sockets.
1111 */
1112static long sock_wait_for_wmem(struct sock * sk, long timeo)
1113{
1114	DEFINE_WAIT(wait);
1115
1116	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1117	for (;;) {
1118		if (!timeo)
1119			break;
1120		if (signal_pending(current))
1121			break;
1122		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1123		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1124		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1125			break;
1126		if (sk->sk_shutdown & SEND_SHUTDOWN)
1127			break;
1128		if (sk->sk_err)
1129			break;
1130		timeo = schedule_timeout(timeo);
1131	}
1132	finish_wait(sk->sk_sleep, &wait);
1133	return timeo;
1134}
1135
1136
1137/*
1138 *	Generic send/receive buffer handlers
1139 */
1140
1141static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1142					    unsigned long header_len,
1143					    unsigned long data_len,
1144					    int noblock, int *errcode)
1145{
1146	struct sk_buff *skb;
1147	gfp_t gfp_mask;
1148	long timeo;
1149	int err;
1150
1151	gfp_mask = sk->sk_allocation;
1152	if (gfp_mask & __GFP_WAIT)
1153		gfp_mask |= __GFP_REPEAT;
1154
1155	timeo = sock_sndtimeo(sk, noblock);
1156	while (1) {
1157		err = sock_error(sk);
1158		if (err != 0)
1159			goto failure;
1160
1161		err = -EPIPE;
1162		if (sk->sk_shutdown & SEND_SHUTDOWN)
1163			goto failure;
1164
1165		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1166			skb = alloc_skb(header_len, gfp_mask);
1167			if (skb) {
1168				int npages;
1169				int i;
1170
1171				/* No pages, we're done... */
1172				if (!data_len)
1173					break;
1174
1175				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1176				skb->truesize += data_len;
1177				skb_shinfo(skb)->nr_frags = npages;
1178				for (i = 0; i < npages; i++) {
1179					struct page *page;
1180					skb_frag_t *frag;
1181
1182					page = alloc_pages(sk->sk_allocation, 0);
1183					if (!page) {
1184						err = -ENOBUFS;
1185						skb_shinfo(skb)->nr_frags = i;
1186						kfree_skb(skb);
1187						goto failure;
1188					}
1189
1190					frag = &skb_shinfo(skb)->frags[i];
1191					frag->page = page;
1192					frag->page_offset = 0;
1193					frag->size = (data_len >= PAGE_SIZE ?
1194						      PAGE_SIZE :
1195						      data_len);
1196					data_len -= PAGE_SIZE;
1197				}
1198
1199				/* Full success... */
1200				break;
1201			}
1202			err = -ENOBUFS;
1203			goto failure;
1204		}
1205		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1206		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1207		err = -EAGAIN;
1208		if (!timeo)
1209			goto failure;
1210		if (signal_pending(current))
1211			goto interrupted;
1212		timeo = sock_wait_for_wmem(sk, timeo);
1213	}
1214
1215	skb_set_owner_w(skb, sk);
1216	return skb;
1217
1218interrupted:
1219	err = sock_intr_errno(timeo);
1220failure:
1221	*errcode = err;
1222	return NULL;
1223}
1224
1225struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1226				    int noblock, int *errcode)
1227{
1228	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1229}
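
/*
 * Sketch of a typical datagram sendmsg() path using the allocator above
 * (hypothetical protocol; the header length and names are made up):
 *
 *	skb = sock_alloc_send_skb(sk, hh_len + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		goto out;
 *	skb_reserve(skb, hh_len);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 *	if (err) {
 *		kfree_skb(skb);
 *		goto out;
 *	}
 *
 * The allocator charges the skb to sk_wmem_alloc via skb_set_owner_w()
 * and blocks (subject to the send timeout) until sndbuf space is free.
 */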
1230
1231static void __lock_sock(struct sock *sk)
1232{
1233	DEFINE_WAIT(wait);
1234
1235	for(;;) {
1236		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1237					TASK_UNINTERRUPTIBLE);
1238		spin_unlock_bh(&sk->sk_lock.slock);
1239		schedule();
1240		spin_lock_bh(&sk->sk_lock.slock);
1241		if(!sock_owned_by_user(sk))
1242			break;
1243	}
1244	finish_wait(&sk->sk_lock.wq, &wait);
1245}
1246
1247static void __release_sock(struct sock *sk)
1248{
1249	struct sk_buff *skb = sk->sk_backlog.head;
1250
1251	do {
1252		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1253		bh_unlock_sock(sk);
1254
1255		do {
1256			struct sk_buff *next = skb->next;
1257
1258			skb->next = NULL;
1259			sk->sk_backlog_rcv(sk, skb);
1260
1261			/*
1262			 * We are in process context here with softirqs
1263			 * disabled, use cond_resched_softirq() to preempt.
1264			 * This is safe to do because we've taken the backlog
1265			 * queue private:
1266			 */
1267			cond_resched_softirq();
1268
1269			skb = next;
1270		} while (skb != NULL);
1271
1272		bh_lock_sock(sk);
1273	} while((skb = sk->sk_backlog.head) != NULL);
1274}
1275
1276/**
1277 * sk_wait_data - wait for data to arrive at sk_receive_queue
1278 * @sk:    sock to wait on
1279 * @timeo: for how long
1280 *
1281 * Now socket state including sk->sk_err is changed only under lock,
1282 * hence we may omit checks after joining wait queue.
1283 * We check the receive queue before schedule() only as an optimization;
1284 * it is very likely that release_sock() added new data.
1285 */
1286int sk_wait_data(struct sock *sk, long *timeo)
1287{
1288	int rc;
1289	DEFINE_WAIT(wait);
1290
1291	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1292	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1293	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1294	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1295	finish_wait(sk->sk_sleep, &wait);
1296	return rc;
1297}
1298
1299EXPORT_SYMBOL(sk_wait_data);
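
/*
 * Illustrative receive-side loop using sk_wait_data() above (hypothetical
 * recvmsg code; the caller holds the socket lock via lock_sock()):
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err)
 *			return sock_error(sk);
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */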
1300
1301/*
1302 * Set of default routines for initialising struct proto_ops when
1303 * the protocol does not support a particular function. In certain
1304 * cases where it makes no sense for a protocol to have a "do nothing"
1305 * function, some default processing is provided.
1306 */
1307
1308int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1309{
1310	return -EOPNOTSUPP;
1311}
1312
1313int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1314		    int len, int flags)
1315{
1316	return -EOPNOTSUPP;
1317}
1318
1319int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1320{
1321	return -EOPNOTSUPP;
1322}
1323
1324int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1325{
1326	return -EOPNOTSUPP;
1327}
1328
1329int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1330		    int *len, int peer)
1331{
1332	return -EOPNOTSUPP;
1333}
1334
1335unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1336{
1337	return 0;
1338}
1339
1340int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1341{
1342	return -EOPNOTSUPP;
1343}
1344
1345int sock_no_listen(struct socket *sock, int backlog)
1346{
1347	return -EOPNOTSUPP;
1348}
1349
1350int sock_no_shutdown(struct socket *sock, int how)
1351{
1352	return -EOPNOTSUPP;
1353}
1354
1355int sock_no_setsockopt(struct socket *sock, int level, int optname,
1356		    char __user *optval, int optlen)
1357{
1358	return -EOPNOTSUPP;
1359}
1360
1361int sock_no_getsockopt(struct socket *sock, int level, int optname,
1362		    char __user *optval, int __user *optlen)
1363{
1364	return -EOPNOTSUPP;
1365}
1366
1367int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1368		    size_t len)
1369{
1370	return -EOPNOTSUPP;
1371}
1372
1373int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1374		    size_t len, int flags)
1375{
1376	return -EOPNOTSUPP;
1377}
1378
1379int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1380{
1381	/* Mirror missing mmap method error code */
1382	return -ENODEV;
1383}
1384
1385ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1386{
1387	ssize_t res;
1388	struct msghdr msg = {.msg_flags = flags};
1389	struct kvec iov;
1390	char *kaddr = kmap(page);
1391	iov.iov_base = kaddr + offset;
1392	iov.iov_len = size;
1393	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1394	kunmap(page);
1395	return res;
1396}
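
/*
 * The sock_no_*() stubs above let a protocol fill in the parts of its
 * struct proto_ops that it does not support.  Hypothetical sketch for a
 * connectionless family that supports neither listen/accept nor
 * mmap/sendpage (the real send/receive handlers are protocol code):
 *
 *	static const struct proto_ops hypothetical_dgram_ops = {
 *		.family		= PF_HYPOTHETICAL,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */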
1397
1398/*
1399 *	Default Socket Callbacks
1400 */
1401
1402static void sock_def_wakeup(struct sock *sk)
1403{
1404	read_lock(&sk->sk_callback_lock);
1405	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1406		wake_up_interruptible_all(sk->sk_sleep);
1407	read_unlock(&sk->sk_callback_lock);
1408}
1409
1410static void sock_def_error_report(struct sock *sk)
1411{
1412	read_lock(&sk->sk_callback_lock);
1413	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1414		wake_up_interruptible(sk->sk_sleep);
1415	sk_wake_async(sk,0,POLL_ERR);
1416	read_unlock(&sk->sk_callback_lock);
1417}
1418
1419static void sock_def_readable(struct sock *sk, int len)
1420{
1421	read_lock(&sk->sk_callback_lock);
1422	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1423		wake_up_interruptible(sk->sk_sleep);
1424	sk_wake_async(sk,1,POLL_IN);
1425	read_unlock(&sk->sk_callback_lock);
1426}
1427
1428static void sock_def_write_space(struct sock *sk)
1429{
1430	read_lock(&sk->sk_callback_lock);
1431
1432	/* Do not wake up a writer until he can make "significant"
1433	 * progress.  --DaveM
1434	 */
1435	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1436		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1437			wake_up_interruptible(sk->sk_sleep);
1438
1439		/* Should agree with poll, otherwise some programs break */
1440		if (sock_writeable(sk))
1441			sk_wake_async(sk, 2, POLL_OUT);
1442	}
1443
1444	read_unlock(&sk->sk_callback_lock);
1445}
1446
1447static void sock_def_destruct(struct sock *sk)
1448{
1449	kfree(sk->sk_protinfo);
1450}
1451
1452void sk_send_sigurg(struct sock *sk)
1453{
1454	if (sk->sk_socket && sk->sk_socket->file)
1455		if (send_sigurg(&sk->sk_socket->file->f_owner))
1456			sk_wake_async(sk, 3, POLL_PRI);
1457}
1458
1459void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1460		    unsigned long expires)
1461{
1462	if (!mod_timer(timer, expires))
1463		sock_hold(sk);
1464}
1465
1466EXPORT_SYMBOL(sk_reset_timer);
1467
1468void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1469{
1470	if (timer_pending(timer) && del_timer(timer))
1471		__sock_put(sk);
1472}
1473
1474EXPORT_SYMBOL(sk_stop_timer);
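
/*
 * The two helpers above keep the sock refcount balanced against a pending
 * timer: sk_reset_timer() takes a reference when it arms a timer that was
 * not already pending, and sk_stop_timer() drops it when it actually
 * deletes a pending timer.  Hypothetical usage:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * A timer handler that fires normally is expected to call sock_put()
 * itself once it is done with the socket.
 */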
1475
1476void sock_init_data(struct socket *sock, struct sock *sk)
1477{
1478	skb_queue_head_init(&sk->sk_receive_queue);
1479	skb_queue_head_init(&sk->sk_write_queue);
1480	skb_queue_head_init(&sk->sk_error_queue);
1481#ifdef CONFIG_NET_DMA
1482	skb_queue_head_init(&sk->sk_async_wait_queue);
1483#endif
1484
1485	sk->sk_send_head	=	NULL;
1486
1487	init_timer(&sk->sk_timer);
1488
1489	sk->sk_allocation	=	GFP_KERNEL;
1490	sk->sk_rcvbuf		=	sysctl_rmem_default;
1491	sk->sk_sndbuf		=	sysctl_wmem_default;
1492	sk->sk_state		=	TCP_CLOSE;
1493	sk->sk_socket		=	sock;
1494
1495	sock_set_flag(sk, SOCK_ZAPPED);
1496
1497	if(sock)
1498	{
1499		sk->sk_type	=	sock->type;
1500		sk->sk_sleep	=	&sock->wait;
1501		sock->sk	=	sk;
1502	} else
1503		sk->sk_sleep	=	NULL;
1504
1505	rwlock_init(&sk->sk_dst_lock);
1506	rwlock_init(&sk->sk_callback_lock);
1507	lockdep_set_class(&sk->sk_callback_lock,
1508			   af_callback_keys + sk->sk_family);
1509
1510	sk->sk_state_change	=	sock_def_wakeup;
1511	sk->sk_data_ready	=	sock_def_readable;
1512	sk->sk_write_space	=	sock_def_write_space;
1513	sk->sk_error_report	=	sock_def_error_report;
1514	sk->sk_destruct		=	sock_def_destruct;
1515
1516	sk->sk_sndmsg_page	=	NULL;
1517	sk->sk_sndmsg_off	=	0;
1518
1519	sk->sk_peercred.pid 	=	0;
1520	sk->sk_peercred.uid	=	-1;
1521	sk->sk_peercred.gid	=	-1;
1522	sk->sk_write_pending	=	0;
1523	sk->sk_rcvlowat		=	1;
1524	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1525	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1526
1527	sk->sk_stamp.tv_sec     = -1L;
1528	sk->sk_stamp.tv_usec    = -1L;
1529
1530	atomic_set(&sk->sk_refcnt, 1);
1531}
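
/*
 * Sketch of how an address family's create() typically pairs sk_alloc()
 * with sock_init_data() above (hypothetical family/ops names):
 *
 *	static int hypothetical_create(struct socket *sock, int protocol)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(PF_HYPOTHETICAL, GFP_KERNEL,
 *			      &hypothetical_proto, 1);
 *		if (sk == NULL)
 *			return -ENOBUFS;
 *
 *		sock->ops = &hypothetical_ops;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 *
 * sock_init_data() links sk and sock together, sets the default callbacks
 * and buffer sizes, and leaves the refcount at 1.
 */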
1532
1533void fastcall lock_sock_nested(struct sock *sk, int subclass)
1534{
1535	might_sleep();
1536	spin_lock_bh(&sk->sk_lock.slock);
1537	if (sk->sk_lock.owner)
1538		__lock_sock(sk);
1539	sk->sk_lock.owner = (void *)1;
1540	spin_unlock(&sk->sk_lock.slock);
1541	/*
1542	 * The sk_lock has mutex_lock() semantics here:
1543	 */
1544	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1545	local_bh_enable();
1546}
1547
1548EXPORT_SYMBOL(lock_sock_nested);
1549
1550void fastcall release_sock(struct sock *sk)
1551{
1552	/*
1553	 * The sk_lock has mutex_unlock() semantics:
1554	 */
1555	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1556
1557	spin_lock_bh(&sk->sk_lock.slock);
1558	if (sk->sk_backlog.tail)
1559		__release_sock(sk);
1560	sk->sk_lock.owner = NULL;
1561	if (waitqueue_active(&sk->sk_lock.wq))
1562		wake_up(&sk->sk_lock.wq);
1563	spin_unlock_bh(&sk->sk_lock.slock);
1564}
1565EXPORT_SYMBOL(release_sock);
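
/*
 * Typical locking pattern for the pair above (hypothetical code): process
 * context takes the socket lock around state changes, and any packets that
 * softirq context queued on the backlog in the meantime are processed by
 * release_sock() via __release_sock().
 *
 *	lock_sock(sk);
 *	... modify socket state, e.g. sk->sk_rcvlowat ...
 *	release_sock(sk);
 */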
1566
1567int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1568{
1569	if (!sock_flag(sk, SOCK_TIMESTAMP))
1570		sock_enable_timestamp(sk);
1571	if (sk->sk_stamp.tv_sec == -1)
1572		return -ENOENT;
1573	if (sk->sk_stamp.tv_sec == 0)
1574		do_gettimeofday(&sk->sk_stamp);
1575	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1576		-EFAULT : 0;
1577}
1578EXPORT_SYMBOL(sock_get_timestamp);
1579
1580void sock_enable_timestamp(struct sock *sk)
1581{
1582	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1583		sock_set_flag(sk, SOCK_TIMESTAMP);
1584		net_enable_timestamp();
1585	}
1586}
1587EXPORT_SYMBOL(sock_enable_timestamp);
1588
1589/*
1590 *	Get a socket option on a socket.
1591 *
1592 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1593 *	asynchronous errors should be reported by getsockopt. We assume
1594 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1595 */
1596int sock_common_getsockopt(struct socket *sock, int level, int optname,
1597			   char __user *optval, int __user *optlen)
1598{
1599	struct sock *sk = sock->sk;
1600
1601	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1602}
1603
1604EXPORT_SYMBOL(sock_common_getsockopt);
1605
1606#ifdef CONFIG_COMPAT
1607int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1608				  char __user *optval, int __user *optlen)
1609{
1610	struct sock *sk = sock->sk;
1611
1612	if (sk->sk_prot->compat_getsockopt != NULL)
1613		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1614						      optval, optlen);
1615	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1616}
1617EXPORT_SYMBOL(compat_sock_common_getsockopt);
1618#endif
1619
1620int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1621			struct msghdr *msg, size_t size, int flags)
1622{
1623	struct sock *sk = sock->sk;
1624	int addr_len = 0;
1625	int err;
1626
1627	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1628				   flags & ~MSG_DONTWAIT, &addr_len);
1629	if (err >= 0)
1630		msg->msg_namelen = addr_len;
1631	return err;
1632}
1633
1634EXPORT_SYMBOL(sock_common_recvmsg);
1635
1636/*
1637 *	Set socket options on an inet socket.
1638 */
1639int sock_common_setsockopt(struct socket *sock, int level, int optname,
1640			   char __user *optval, int optlen)
1641{
1642	struct sock *sk = sock->sk;
1643
1644	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1645}
1646
1647EXPORT_SYMBOL(sock_common_setsockopt);
1648
1649#ifdef CONFIG_COMPAT
1650int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1651				  char __user *optval, int optlen)
1652{
1653	struct sock *sk = sock->sk;
1654
1655	if (sk->sk_prot->compat_setsockopt != NULL)
1656		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1657						      optval, optlen);
1658	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1659}
1660EXPORT_SYMBOL(compat_sock_common_setsockopt);
1661#endif
1662
1663void sk_common_release(struct sock *sk)
1664{
1665	if (sk->sk_prot->destroy)
1666		sk->sk_prot->destroy(sk);
1667
1668	/*
1669	 * Observation: when sk_common_release is called, processes have
1670	 * no access to the socket any more, but the network stack still does.
1671	 * Step one, detach it from networking:
1672	 *
1673	 * A. Remove from hash tables.
1674	 */
1675
1676	sk->sk_prot->unhash(sk);
1677
1678	/*
1679	 * At this point the socket cannot receive new packets, but it is possible
1680	 * that some packets are in flight, because some CPU is still running the
1681	 * receiver and did its hash table lookup before we unhashed the socket.
1682	 * They will reach the receive queue and be purged by the socket destructor.
1683	 *
1684	 * Also, we still have packets pending on the receive queue and probably
1685	 * our own packets waiting in device queues. sock_destroy will drain the
1686	 * receive queue, but transmitted packets will delay socket destruction
1687	 * until the last reference is released.
1688	 */
1689
1690	sock_orphan(sk);
1691
1692	xfrm_sk_free_policy(sk);
1693
1694	sk_refcnt_debug_release(sk);
1695	sock_put(sk);
1696}
1697
1698EXPORT_SYMBOL(sk_common_release);
1699
1700static DEFINE_RWLOCK(proto_list_lock);
1701static LIST_HEAD(proto_list);
1702
1703int proto_register(struct proto *prot, int alloc_slab)
1704{
1705	char *request_sock_slab_name = NULL;
1706	char *timewait_sock_slab_name;
1707	int rc = -ENOBUFS;
1708
1709	if (alloc_slab) {
1710		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1711					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1712
1713		if (prot->slab == NULL) {
1714			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1715			       prot->name);
1716			goto out;
1717		}
1718
1719		if (prot->rsk_prot != NULL) {
1720			static const char mask[] = "request_sock_%s";
1721
1722			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1723			if (request_sock_slab_name == NULL)
1724				goto out_free_sock_slab;
1725
1726			sprintf(request_sock_slab_name, mask, prot->name);
1727			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1728								 prot->rsk_prot->obj_size, 0,
1729								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1730
1731			if (prot->rsk_prot->slab == NULL) {
1732				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1733				       prot->name);
1734				goto out_free_request_sock_slab_name;
1735			}
1736		}
1737
1738		if (prot->twsk_prot != NULL) {
1739			static const char mask[] = "tw_sock_%s";
1740
1741			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1742
1743			if (timewait_sock_slab_name == NULL)
1744				goto out_free_request_sock_slab;
1745
1746			sprintf(timewait_sock_slab_name, mask, prot->name);
1747			prot->twsk_prot->twsk_slab =
1748				kmem_cache_create(timewait_sock_slab_name,
1749						  prot->twsk_prot->twsk_obj_size,
1750						  0, SLAB_HWCACHE_ALIGN,
1751						  NULL, NULL);
1752			if (prot->twsk_prot->twsk_slab == NULL)
1753				goto out_free_timewait_sock_slab_name;
1754		}
1755	}
1756
1757	write_lock(&proto_list_lock);
1758	list_add(&prot->node, &proto_list);
1759	write_unlock(&proto_list_lock);
1760	rc = 0;
1761out:
1762	return rc;
1763out_free_timewait_sock_slab_name:
1764	kfree(timewait_sock_slab_name);
1765out_free_request_sock_slab:
1766	if (prot->rsk_prot && prot->rsk_prot->slab) {
1767		kmem_cache_destroy(prot->rsk_prot->slab);
1768		prot->rsk_prot->slab = NULL;
1769	}
1770out_free_request_sock_slab_name:
1771	kfree(request_sock_slab_name);
1772out_free_sock_slab:
1773	kmem_cache_destroy(prot->slab);
1774	prot->slab = NULL;
1775	goto out;
1776}
1777
1778EXPORT_SYMBOL(proto_register);
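
/*
 * Minimal registration sketch for proto_register() above (hypothetical
 * protocol; a real struct proto also fills in its handler methods):
 *
 *	static struct proto hypothetical_proto = {
 *		.name	  = "HYPOTHETICAL",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct hypothetical_sock),
 *	};
 *
 *	static int __init hypothetical_init(void)
 *	{
 *		return proto_register(&hypothetical_proto, 1);
 *	}
 *
 * Passing alloc_slab=1 creates the per-protocol slab cache used by
 * sk_alloc(); proto_unregister() tears it down again on module exit.
 */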
1779
1780void proto_unregister(struct proto *prot)
1781{
1782	write_lock(&proto_list_lock);
1783	list_del(&prot->node);
1784	write_unlock(&proto_list_lock);
1785
1786	if (prot->slab != NULL) {
1787		kmem_cache_destroy(prot->slab);
1788		prot->slab = NULL;
1789	}
1790
1791	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1792		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1793
1794		kmem_cache_destroy(prot->rsk_prot->slab);
1795		kfree(name);
1796		prot->rsk_prot->slab = NULL;
1797	}
1798
1799	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1800		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1801
1802		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1803		kfree(name);
1804		prot->twsk_prot->twsk_slab = NULL;
1805	}
1806}
1807
1808EXPORT_SYMBOL(proto_unregister);
1809
1810#ifdef CONFIG_PROC_FS
1811static inline struct proto *__proto_head(void)
1812{
1813	return list_entry(proto_list.next, struct proto, node);
1814}
1815
1816static inline struct proto *proto_head(void)
1817{
1818	return list_empty(&proto_list) ? NULL : __proto_head();
1819}
1820
1821static inline struct proto *proto_next(struct proto *proto)
1822{
1823	return proto->node.next == &proto_list ? NULL :
1824		list_entry(proto->node.next, struct proto, node);
1825}
1826
1827static inline struct proto *proto_get_idx(loff_t pos)
1828{
1829	struct proto *proto;
1830	loff_t i = 0;
1831
1832	list_for_each_entry(proto, &proto_list, node)
1833		if (i++ == pos)
1834			goto out;
1835
1836	proto = NULL;
1837out:
1838	return proto;
1839}
1840
1841static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1842{
1843	read_lock(&proto_list_lock);
1844	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1845}
1846
1847static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1848{
1849	++*pos;
1850	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1851}
1852
1853static void proto_seq_stop(struct seq_file *seq, void *v)
1854{
1855	read_unlock(&proto_list_lock);
1856}
1857
1858static char proto_method_implemented(const void *method)
1859{
1860	return method == NULL ? 'n' : 'y';
1861}
1862
1863static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1864{
1865	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1866			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1867		   proto->name,
1868		   proto->obj_size,
1869		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1870		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1871		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1872		   proto->max_header,
1873		   proto->slab == NULL ? "no" : "yes",
1874		   module_name(proto->owner),
1875		   proto_method_implemented(proto->close),
1876		   proto_method_implemented(proto->connect),
1877		   proto_method_implemented(proto->disconnect),
1878		   proto_method_implemented(proto->accept),
1879		   proto_method_implemented(proto->ioctl),
1880		   proto_method_implemented(proto->init),
1881		   proto_method_implemented(proto->destroy),
1882		   proto_method_implemented(proto->shutdown),
1883		   proto_method_implemented(proto->setsockopt),
1884		   proto_method_implemented(proto->getsockopt),
1885		   proto_method_implemented(proto->sendmsg),
1886		   proto_method_implemented(proto->recvmsg),
1887		   proto_method_implemented(proto->sendpage),
1888		   proto_method_implemented(proto->bind),
1889		   proto_method_implemented(proto->backlog_rcv),
1890		   proto_method_implemented(proto->hash),
1891		   proto_method_implemented(proto->unhash),
1892		   proto_method_implemented(proto->get_port),
1893		   proto_method_implemented(proto->enter_memory_pressure));
1894}
1895
1896static int proto_seq_show(struct seq_file *seq, void *v)
1897{
1898	if (v == SEQ_START_TOKEN)
1899		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1900			   "protocol",
1901			   "size",
1902			   "sockets",
1903			   "memory",
1904			   "press",
1905			   "maxhdr",
1906			   "slab",
1907			   "module",
1908			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1909	else
1910		proto_seq_printf(seq, v);
1911	return 0;
1912}
1913
1914static struct seq_operations proto_seq_ops = {
1915	.start  = proto_seq_start,
1916	.next   = proto_seq_next,
1917	.stop   = proto_seq_stop,
1918	.show   = proto_seq_show,
1919};
1920
1921static int proto_seq_open(struct inode *inode, struct file *file)
1922{
1923	return seq_open(file, &proto_seq_ops);
1924}
1925
1926static struct file_operations proto_seq_fops = {
1927	.owner		= THIS_MODULE,
1928	.open		= proto_seq_open,
1929	.read		= seq_read,
1930	.llseek		= seq_lseek,
1931	.release	= seq_release,
1932};
1933
1934static int __init proto_init(void)
1935{
1936	/* register /proc/net/protocols */
1937	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1938}
1939
1940subsys_initcall(proto_init);
1941
1942#endif /* PROC_FS */
1943
1944EXPORT_SYMBOL(sk_alloc);
1945EXPORT_SYMBOL(sk_free);
1946EXPORT_SYMBOL(sk_send_sigurg);
1947EXPORT_SYMBOL(sock_alloc_send_skb);
1948EXPORT_SYMBOL(sock_init_data);
1949EXPORT_SYMBOL(sock_kfree_s);
1950EXPORT_SYMBOL(sock_kmalloc);
1951EXPORT_SYMBOL(sock_no_accept);
1952EXPORT_SYMBOL(sock_no_bind);
1953EXPORT_SYMBOL(sock_no_connect);
1954EXPORT_SYMBOL(sock_no_getname);
1955EXPORT_SYMBOL(sock_no_getsockopt);
1956EXPORT_SYMBOL(sock_no_ioctl);
1957EXPORT_SYMBOL(sock_no_listen);
1958EXPORT_SYMBOL(sock_no_mmap);
1959EXPORT_SYMBOL(sock_no_poll);
1960EXPORT_SYMBOL(sock_no_recvmsg);
1961EXPORT_SYMBOL(sock_no_sendmsg);
1962EXPORT_SYMBOL(sock_no_sendpage);
1963EXPORT_SYMBOL(sock_no_setsockopt);
1964EXPORT_SYMBOL(sock_no_shutdown);
1965EXPORT_SYMBOL(sock_no_socketpair);
1966EXPORT_SYMBOL(sock_rfree);
1967EXPORT_SYMBOL(sock_setsockopt);
1968EXPORT_SYMBOL(sock_wfree);
1969EXPORT_SYMBOL(sock_wmalloc);
1970EXPORT_SYMBOL(sock_i_uid);
1971EXPORT_SYMBOL(sock_i_ino);
1972EXPORT_SYMBOL(sysctl_optmem_max);
1973#ifdef CONFIG_SYSCTL
1974EXPORT_SYMBOL(sysctl_rmem_max);
1975EXPORT_SYMBOL(sysctl_wmem_max);
1976#endif
1977