sock.c revision 4dfbb9d8c6cbfc32faa5c71145bd2a43e1f8237c
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <net/xfrm.h>
124#include <linux/ipsec.h>
125
126#include <linux/filter.h>
127
128#ifdef CONFIG_INET
129#include <net/tcp.h>
130#endif
131
132/*
133 * Each address family might have different locking rules, so we have
134 * one slock key per address family:
135 */
136static struct lock_class_key af_family_keys[AF_MAX];
137static struct lock_class_key af_family_slock_keys[AF_MAX];
138
139#ifdef CONFIG_DEBUG_LOCK_ALLOC
140/*
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
157};
158static const char *af_family_slock_key_strings[AF_MAX+1] = {
159  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
160  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
161  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
162  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
163  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
164  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
165  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
166  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
167  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
168  "slock-27"       , "slock-28"          , "slock-29"          ,
169  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
170};
171#endif
172
173/*
174 * sk_callback_lock locking rules are per-address-family,
175 * so split the lock classes by using a per-AF key:
176 */
177static struct lock_class_key af_callback_keys[AF_MAX];
178
179/* Take into consideration the size of the struct sk_buff overhead in the
180 * determination of these values, since that is non-constant across
181 * platforms.  This makes socket queueing behavior and performance
182 * not depend upon such differences.
183 */
184#define _SK_MEM_PACKETS		256
185#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
186#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
187#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
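/* For example, on a (hypothetical) platform where sizeof(struct sk_buff)
 * is 240 bytes, _SK_MEM_OVERHEAD is 240 + 256 = 496 bytes and the default
 * SK_WMEM_MAX/SK_RMEM_MAX work out to 496 * 256 = 126976 bytes (124 KiB).
 * The exact figure varies with the platform's sk_buff layout, which is
 * precisely why the overhead is factored into these defaults.
 */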
188
189/* Run time adjustable parameters. */
190__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
191__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
192__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
193__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
194
195/* Maximal space eaten by iovec or ancillary data plus some space */
196int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
197
198static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
199{
200	struct timeval tv;
201
202	if (optlen < sizeof(tv))
203		return -EINVAL;
204	if (copy_from_user(&tv, optval, sizeof(tv)))
205		return -EFAULT;
206
207	*timeo_p = MAX_SCHEDULE_TIMEOUT;
208	if (tv.tv_sec == 0 && tv.tv_usec == 0)
209		return 0;
210	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
211		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
212	return 0;
213}
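/* A worked example of the conversion above, assuming HZ=1000: a timeval of
 * { .tv_sec = 2, .tv_usec = 500000 } yields *timeo_p = 2*1000 +
 * (500000 + 999)/1000 = 2500 jiffies, while an all-zero timeval means
 * "wait forever" (MAX_SCHEDULE_TIMEOUT), i.e. the operation never times out.
 */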
214
215static void sock_warn_obsolete_bsdism(const char *name)
216{
217	static int warned;
218	static char warncomm[TASK_COMM_LEN];
219	if (strcmp(warncomm, current->comm) && warned < 5) {
220		strcpy(warncomm,  current->comm);
221		printk(KERN_WARNING "process `%s' is using obsolete "
222		       "%s SO_BSDCOMPAT\n", warncomm, name);
223		warned++;
224	}
225}
226
227static void sock_disable_timestamp(struct sock *sk)
228{
229	if (sock_flag(sk, SOCK_TIMESTAMP)) {
230		sock_reset_flag(sk, SOCK_TIMESTAMP);
231		net_disable_timestamp();
232	}
233}
234
235
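/* Charge @skb to @sk and append it to the receive queue, after enforcing
 * sk_rcvbuf and running the socket filter.  Returns 0 on success, -ENOMEM
 * if the receive buffer is full, or the sk_filter() error.  sk_data_ready
 * is only invoked for sockets that are not already dead.
 */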
236int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
237{
238	int err = 0;
239	int skb_len;
240
241	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
242	   number of warnings when compiling with -W --ANK
243	 */
244	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
245	    (unsigned)sk->sk_rcvbuf) {
246		err = -ENOMEM;
247		goto out;
248	}
249
250	err = sk_filter(sk, skb);
251	if (err)
252		goto out;
253
254	skb->dev = NULL;
255	skb_set_owner_r(skb, sk);
256
257	/* Cache the SKB length before we tack it onto the receive
258	 * queue.  Once it is added it no longer belongs to us and
259	 * may be freed by other threads of control pulling packets
260	 * from the queue.
261	 */
262	skb_len = skb->len;
263
264	skb_queue_tail(&sk->sk_receive_queue, skb);
265
266	if (!sock_flag(sk, SOCK_DEAD))
267		sk->sk_data_ready(sk, skb_len);
268out:
269	return err;
270}
271EXPORT_SYMBOL(sock_queue_rcv_skb);
272
273int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
274{
275	int rc = NET_RX_SUCCESS;
276
277	if (sk_filter(sk, skb))
278		goto discard_and_relse;
279
280	skb->dev = NULL;
281
282	bh_lock_sock(sk);
283	if (!sock_owned_by_user(sk)) {
284		/*
285		 * trylock + unlock semantics:
286		 */
287		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
288
289		rc = sk->sk_backlog_rcv(sk, skb);
290
291		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
292	} else
293		sk_add_backlog(sk, skb);
294	bh_unlock_sock(sk);
295out:
296	sock_put(sk);
297	return rc;
298discard_and_relse:
299	kfree_skb(skb);
300	goto out;
301}
302EXPORT_SYMBOL(sk_receive_skb);
303
304struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
305{
306	struct dst_entry *dst = sk->sk_dst_cache;
307
308	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
309		sk->sk_dst_cache = NULL;
310		dst_release(dst);
311		return NULL;
312	}
313
314	return dst;
315}
316EXPORT_SYMBOL(__sk_dst_check);
317
318struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
319{
320	struct dst_entry *dst = sk_dst_get(sk);
321
322	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
323		sk_dst_reset(sk);
324		dst_release(dst);
325		return NULL;
326	}
327
328	return dst;
329}
330EXPORT_SYMBOL(sk_dst_check);
331
332/*
333 *	This is meant for all protocols to use and covers goings on
334 *	at the socket level. Everything here is generic.
335 */
336
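/* Typical use from user space (a sketch, not part of the kernel build):
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * reaches this function for SOL_SOCKET-level options.  Note that
 * SO_SNDBUF/SO_RCVBUF store twice the requested value (see the comments in
 * the handlers below), so getsockopt() would report 131072 for the example
 * above, assuming the request does not exceed sysctl_rmem_max.
 */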
337int sock_setsockopt(struct socket *sock, int level, int optname,
338		    char __user *optval, int optlen)
339{
340	struct sock *sk=sock->sk;
341	struct sk_filter *filter;
342	int val;
343	int valbool;
344	struct linger ling;
345	int ret = 0;
346
347	/*
348	 *	Options without arguments
349	 */
350
351#ifdef SO_DONTLINGER		/* Compatibility item... */
352	if (optname == SO_DONTLINGER) {
353		lock_sock(sk);
354		sock_reset_flag(sk, SOCK_LINGER);
355		release_sock(sk);
356		return 0;
357	}
358#endif
359
360  	if(optlen<sizeof(int))
361  		return(-EINVAL);
362
363	if (get_user(val, (int __user *)optval))
364		return -EFAULT;
365
366  	valbool = val?1:0;
367
368	lock_sock(sk);
369
370  	switch(optname)
371  	{
372		case SO_DEBUG:
373			if(val && !capable(CAP_NET_ADMIN))
374			{
375				ret = -EACCES;
376			}
377			else if (valbool)
378				sock_set_flag(sk, SOCK_DBG);
379			else
380				sock_reset_flag(sk, SOCK_DBG);
381			break;
382		case SO_REUSEADDR:
383			sk->sk_reuse = valbool;
384			break;
385		case SO_TYPE:
386		case SO_ERROR:
387			ret = -ENOPROTOOPT;
388		  	break;
389		case SO_DONTROUTE:
390			if (valbool)
391				sock_set_flag(sk, SOCK_LOCALROUTE);
392			else
393				sock_reset_flag(sk, SOCK_LOCALROUTE);
394			break;
395		case SO_BROADCAST:
396			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
397			break;
398		case SO_SNDBUF:
399			/* Don't return an error on this; BSD doesn't, and if
400			   you think about it this is right. Otherwise apps have
401			   to play 'guess the biggest size' games. RCVBUF/SNDBUF
402			   are treated in BSD as hints. */
403
404			if (val > sysctl_wmem_max)
405				val = sysctl_wmem_max;
406set_sndbuf:
407			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
408			if ((val * 2) < SOCK_MIN_SNDBUF)
409				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
410			else
411				sk->sk_sndbuf = val * 2;
412
413			/*
414			 *	Wake up sending tasks if we
415			 *	upped the value.
416			 */
417			sk->sk_write_space(sk);
418			break;
419
420		case SO_SNDBUFFORCE:
421			if (!capable(CAP_NET_ADMIN)) {
422				ret = -EPERM;
423				break;
424			}
425			goto set_sndbuf;
426
427		case SO_RCVBUF:
428			/* Don't return an error on this; BSD doesn't, and if
429			   you think about it this is right. Otherwise apps have
430			   to play 'guess the biggest size' games. RCVBUF/SNDBUF
431			   are treated in BSD as hints. */
432
433			if (val > sysctl_rmem_max)
434				val = sysctl_rmem_max;
435set_rcvbuf:
436			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
437			/*
438			 * We double it on the way in to account for
439			 * "struct sk_buff" etc. overhead.   Applications
440			 * assume that the SO_RCVBUF setting they make will
441			 * allow that much actual data to be received on that
442			 * socket.
443			 *
444			 * Applications are unaware that "struct sk_buff" and
445			 * other overheads allocate from the receive buffer
446			 * during socket buffer allocation.
447			 *
448			 * And after considering the possible alternatives,
449			 * returning the value we actually used in getsockopt
450			 * is the most desirable behavior.
451			 */
452			if ((val * 2) < SOCK_MIN_RCVBUF)
453				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
454			else
455				sk->sk_rcvbuf = val * 2;
456			break;
457
458		case SO_RCVBUFFORCE:
459			if (!capable(CAP_NET_ADMIN)) {
460				ret = -EPERM;
461				break;
462			}
463			goto set_rcvbuf;
464
465		case SO_KEEPALIVE:
466#ifdef CONFIG_INET
467			if (sk->sk_protocol == IPPROTO_TCP)
468				tcp_set_keepalive(sk, valbool);
469#endif
470			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
471			break;
472
473	 	case SO_OOBINLINE:
474			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
475			break;
476
477	 	case SO_NO_CHECK:
478			sk->sk_no_check = valbool;
479			break;
480
481		case SO_PRIORITY:
482			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
483				sk->sk_priority = val;
484			else
485				ret = -EPERM;
486			break;
487
488		case SO_LINGER:
489			if(optlen<sizeof(ling)) {
490				ret = -EINVAL;	/* 1003.1g */
491				break;
492			}
493			if (copy_from_user(&ling,optval,sizeof(ling))) {
494				ret = -EFAULT;
495				break;
496			}
497			if (!ling.l_onoff)
498				sock_reset_flag(sk, SOCK_LINGER);
499			else {
500#if (BITS_PER_LONG == 32)
501				if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
502					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
503				else
504#endif
505					sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
506				sock_set_flag(sk, SOCK_LINGER);
507			}
508			break;
509
510		case SO_BSDCOMPAT:
511			sock_warn_obsolete_bsdism("setsockopt");
512			break;
513
514		case SO_PASSCRED:
515			if (valbool)
516				set_bit(SOCK_PASSCRED, &sock->flags);
517			else
518				clear_bit(SOCK_PASSCRED, &sock->flags);
519			break;
520
521		case SO_TIMESTAMP:
522			if (valbool)  {
523				sock_set_flag(sk, SOCK_RCVTSTAMP);
524				sock_enable_timestamp(sk);
525			} else
526				sock_reset_flag(sk, SOCK_RCVTSTAMP);
527			break;
528
529		case SO_RCVLOWAT:
530			if (val < 0)
531				val = INT_MAX;
532			sk->sk_rcvlowat = val ? : 1;
533			break;
534
535		case SO_RCVTIMEO:
536			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
537			break;
538
539		case SO_SNDTIMEO:
540			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
541			break;
542
543#ifdef CONFIG_NETDEVICES
544		case SO_BINDTODEVICE:
545		{
546			char devname[IFNAMSIZ];
547
548			/* Sorry... */
549			if (!capable(CAP_NET_RAW)) {
550				ret = -EPERM;
551				break;
552			}
553
554			/* Bind this socket to a particular device like "eth0",
555			 * as specified in the passed interface name. If the
556			 * name is "" or the option length is zero the socket
557			 * is not bound.
558			 */
559
560			if (!valbool) {
561				sk->sk_bound_dev_if = 0;
562			} else {
563				if (optlen > IFNAMSIZ - 1)
564					optlen = IFNAMSIZ - 1;
565				memset(devname, 0, sizeof(devname));
566				if (copy_from_user(devname, optval, optlen)) {
567					ret = -EFAULT;
568					break;
569				}
570
571				/* Remove any cached route for this socket. */
572				sk_dst_reset(sk);
573
574				if (devname[0] == '\0') {
575					sk->sk_bound_dev_if = 0;
576				} else {
577					struct net_device *dev = dev_get_by_name(devname);
578					if (!dev) {
579						ret = -ENODEV;
580						break;
581					}
582					sk->sk_bound_dev_if = dev->ifindex;
583					dev_put(dev);
584				}
585			}
586			break;
587		}
588#endif
589
590
591		case SO_ATTACH_FILTER:
592			ret = -EINVAL;
593			if (optlen == sizeof(struct sock_fprog)) {
594				struct sock_fprog fprog;
595
596				ret = -EFAULT;
597				if (copy_from_user(&fprog, optval, sizeof(fprog)))
598					break;
599
600				ret = sk_attach_filter(&fprog, sk);
601			}
602			break;
603
604		case SO_DETACH_FILTER:
605			rcu_read_lock_bh();
606			filter = rcu_dereference(sk->sk_filter);
607			if (filter) {
608				rcu_assign_pointer(sk->sk_filter, NULL);
609				sk_filter_release(sk, filter);
610				rcu_read_unlock_bh();
611				break;
612			}
613			rcu_read_unlock_bh();
614			ret = -ENONET;
615			break;
616
617		case SO_PASSSEC:
618			if (valbool)
619				set_bit(SOCK_PASSSEC, &sock->flags);
620			else
621				clear_bit(SOCK_PASSSEC, &sock->flags);
622			break;
623
624		/* We implement the SO_SNDLOWAT etc to
625		   not be settable (1003.1g 5.3) */
626		default:
627		  	ret = -ENOPROTOOPT;
628			break;
629  	}
630	release_sock(sk);
631	return ret;
632}
633
634
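/* Generic SOL_SOCKET option retrieval, the read-side counterpart of
 * sock_setsockopt().  At most min(len, lv) bytes of the option value are
 * copied to @optval and the length actually used is written back through
 * @optlen; SO_PEERCRED, SO_PEERNAME and SO_PEERSEC bypass the common copy.
 */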
635int sock_getsockopt(struct socket *sock, int level, int optname,
636		    char __user *optval, int __user *optlen)
637{
638	struct sock *sk = sock->sk;
639
640	union
641	{
642  		int val;
643  		struct linger ling;
644		struct timeval tm;
645	} v;
646
647	unsigned int lv = sizeof(int);
648	int len;
649
650  	if(get_user(len,optlen))
651  		return -EFAULT;
652	if(len < 0)
653		return -EINVAL;
654
655  	switch(optname)
656  	{
657		case SO_DEBUG:
658			v.val = sock_flag(sk, SOCK_DBG);
659			break;
660
661		case SO_DONTROUTE:
662			v.val = sock_flag(sk, SOCK_LOCALROUTE);
663			break;
664
665		case SO_BROADCAST:
666			v.val = !!sock_flag(sk, SOCK_BROADCAST);
667			break;
668
669		case SO_SNDBUF:
670			v.val = sk->sk_sndbuf;
671			break;
672
673		case SO_RCVBUF:
674			v.val = sk->sk_rcvbuf;
675			break;
676
677		case SO_REUSEADDR:
678			v.val = sk->sk_reuse;
679			break;
680
681		case SO_KEEPALIVE:
682			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
683			break;
684
685		case SO_TYPE:
686			v.val = sk->sk_type;
687			break;
688
689		case SO_ERROR:
690			v.val = -sock_error(sk);
691			if(v.val==0)
692				v.val = xchg(&sk->sk_err_soft, 0);
693			break;
694
695		case SO_OOBINLINE:
696			v.val = !!sock_flag(sk, SOCK_URGINLINE);
697			break;
698
699		case SO_NO_CHECK:
700			v.val = sk->sk_no_check;
701			break;
702
703		case SO_PRIORITY:
704			v.val = sk->sk_priority;
705			break;
706
707		case SO_LINGER:
708			lv		= sizeof(v.ling);
709			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
710 			v.ling.l_linger	= sk->sk_lingertime / HZ;
711			break;
712
713		case SO_BSDCOMPAT:
714			sock_warn_obsolete_bsdism("getsockopt");
715			break;
716
717		case SO_TIMESTAMP:
718			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
719			break;
720
721		case SO_RCVTIMEO:
722			lv=sizeof(struct timeval);
723			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
724				v.tm.tv_sec = 0;
725				v.tm.tv_usec = 0;
726			} else {
727				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
728				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
729			}
730			break;
731
732		case SO_SNDTIMEO:
733			lv=sizeof(struct timeval);
734			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
735				v.tm.tv_sec = 0;
736				v.tm.tv_usec = 0;
737			} else {
738				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
739				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
740			}
741			break;
742
743		case SO_RCVLOWAT:
744			v.val = sk->sk_rcvlowat;
745			break;
746
747		case SO_SNDLOWAT:
748			v.val=1;
749			break;
750
751		case SO_PASSCRED:
752			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
753			break;
754
755		case SO_PEERCRED:
756			if (len > sizeof(sk->sk_peercred))
757				len = sizeof(sk->sk_peercred);
758			if (copy_to_user(optval, &sk->sk_peercred, len))
759				return -EFAULT;
760			goto lenout;
761
762		case SO_PEERNAME:
763		{
764			char address[128];
765
766			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
767				return -ENOTCONN;
768			if (lv < len)
769				return -EINVAL;
770			if (copy_to_user(optval, address, len))
771				return -EFAULT;
772			goto lenout;
773		}
774
775		/* Dubious BSD thing... Probably nobody even uses it, but
776		 * the UNIX standard wants it for whatever reason... -DaveM
777		 */
778		case SO_ACCEPTCONN:
779			v.val = sk->sk_state == TCP_LISTEN;
780			break;
781
782		case SO_PASSSEC:
783			v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
784			break;
785
786		case SO_PEERSEC:
787			return security_socket_getpeersec_stream(sock, optval, optlen, len);
788
789		default:
790			return(-ENOPROTOOPT);
791	}
792	if (len > lv)
793		len = lv;
794	if (copy_to_user(optval, &v, len))
795		return -EFAULT;
796lenout:
797  	if (put_user(len, optlen))
798  		return -EFAULT;
799  	return 0;
800}
801
802/*
803 * Initialize an sk_lock.
804 *
805 * (We also register the sk_lock with the lock validator.)
806 */
807static inline void sock_lock_init(struct sock *sk)
808{
809	spin_lock_init(&sk->sk_lock.slock);
810	sk->sk_lock.owner = NULL;
811	init_waitqueue_head(&sk->sk_lock.wq);
812	/*
813	 * Make sure we are not reinitializing a held lock:
814	 */
815	debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock));
816
817	/*
818	 * Mark both the sk_lock and the sk_lock.slock as a
819	 * per-address-family lock class:
820	 */
821	lockdep_set_class_and_name(&sk->sk_lock.slock,
822				   af_family_slock_keys + sk->sk_family,
823				   af_family_slock_key_strings[sk->sk_family]);
824	lockdep_init_map(&sk->sk_lock.dep_map,
825			 af_family_key_strings[sk->sk_family],
826			 af_family_keys + sk->sk_family, 0);
827}
828
829/**
830 *	sk_alloc - All socket objects are allocated here
831 *	@family: protocol family
832 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
833 *	@prot: struct proto associated with this new sock instance
834 *	@zero_it: if we should zero the newly allocated sock
835 */
836struct sock *sk_alloc(int family, gfp_t priority,
837		      struct proto *prot, int zero_it)
838{
839	struct sock *sk = NULL;
840	kmem_cache_t *slab = prot->slab;
841
842	if (slab != NULL)
843		sk = kmem_cache_alloc(slab, priority);
844	else
845		sk = kmalloc(prot->obj_size, priority);
846
847	if (sk) {
848		if (zero_it) {
849			memset(sk, 0, prot->obj_size);
850			sk->sk_family = family;
851			/*
852			 * See comment in struct sock definition to understand
853			 * why we need sk_prot_creator -acme
854			 */
855			sk->sk_prot = sk->sk_prot_creator = prot;
856			sock_lock_init(sk);
857		}
858
859		if (security_sk_alloc(sk, family, priority))
860			goto out_free;
861
862		if (!try_module_get(prot->owner))
863			goto out_free;
864	}
865	return sk;
866
867out_free:
868	if (slab != NULL)
869		kmem_cache_free(slab, sk);
870	else
871		kfree(sk);
872	return NULL;
873}
874
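/* Final teardown of a socket allocated with sk_alloc(): run the protocol's
 * destructor, drop any attached socket filter, disable timestamping, warn
 * about leaked option memory, free the security state and return the memory
 * to the creator's slab cache (or to kmalloc), then release the module
 * reference taken at allocation time.
 */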
875void sk_free(struct sock *sk)
876{
877	struct sk_filter *filter;
878	struct module *owner = sk->sk_prot_creator->owner;
879
880	if (sk->sk_destruct)
881		sk->sk_destruct(sk);
882
883	filter = rcu_dereference(sk->sk_filter);
884	if (filter) {
885		sk_filter_release(sk, filter);
886		rcu_assign_pointer(sk->sk_filter, NULL);
887	}
888
889	sock_disable_timestamp(sk);
890
891	if (atomic_read(&sk->sk_omem_alloc))
892		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
893		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
894
895	security_sk_free(sk);
896	if (sk->sk_prot_creator->slab != NULL)
897		kmem_cache_free(sk->sk_prot_creator->slab, sk);
898	else
899		kfree(sk);
900	module_put(owner);
901}
902
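/* Duplicate @sk for a child socket (used when accepting connections).
 * The clone starts with empty queues, a reference count of 2 and
 * bh_lock_sock() held; it is the caller's responsibility to unlock it and,
 * typically, to hash it.  If cloning the xfrm policy fails, the half-built
 * child is freed and NULL is returned.
 */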
903struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
904{
905	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
906
907	if (newsk != NULL) {
908		struct sk_filter *filter;
909
910		sock_copy(newsk, sk);
911
912		/* SANITY */
913		sk_node_init(&newsk->sk_node);
914		sock_lock_init(newsk);
915		bh_lock_sock(newsk);
916
917		atomic_set(&newsk->sk_rmem_alloc, 0);
918		atomic_set(&newsk->sk_wmem_alloc, 0);
919		atomic_set(&newsk->sk_omem_alloc, 0);
920		skb_queue_head_init(&newsk->sk_receive_queue);
921		skb_queue_head_init(&newsk->sk_write_queue);
922#ifdef CONFIG_NET_DMA
923		skb_queue_head_init(&newsk->sk_async_wait_queue);
924#endif
925
926		rwlock_init(&newsk->sk_dst_lock);
927		rwlock_init(&newsk->sk_callback_lock);
928		lockdep_set_class(&newsk->sk_callback_lock,
929				   af_callback_keys + newsk->sk_family);
930
931		newsk->sk_dst_cache	= NULL;
932		newsk->sk_wmem_queued	= 0;
933		newsk->sk_forward_alloc = 0;
934		newsk->sk_send_head	= NULL;
935		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
936		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
937
938		sock_reset_flag(newsk, SOCK_DONE);
939		skb_queue_head_init(&newsk->sk_error_queue);
940
941		filter = newsk->sk_filter;
942		if (filter != NULL)
943			sk_filter_charge(newsk, filter);
944
945		if (unlikely(xfrm_sk_clone_policy(newsk))) {
946			/* It is still raw copy of parent, so invalidate
947			 * destructor and make plain sk_free() */
948			newsk->sk_destruct = NULL;
949			sk_free(newsk);
950			newsk = NULL;
951			goto out;
952		}
953
954		newsk->sk_err	   = 0;
955		newsk->sk_priority = 0;
956		atomic_set(&newsk->sk_refcnt, 2);
957
958		/*
959		 * Increment the counter in the same struct proto as the master
960		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
961		 * is the same as sk->sk_prot->socks, as this field was copied
962		 * with memcpy).
963		 *
964		 * This _changes_ the previous behaviour, where
965		 * tcp_create_openreq_child always was incrementing the
966		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
967		 * to be taken into account in all callers. -acme
968		 */
969		sk_refcnt_debug_inc(newsk);
970		newsk->sk_socket = NULL;
971		newsk->sk_sleep	 = NULL;
972
973		if (newsk->sk_prot->sockets_allocated)
974			atomic_inc(newsk->sk_prot->sockets_allocated);
975	}
976out:
977	return newsk;
978}
979
980EXPORT_SYMBOL_GPL(sk_clone);
981
982void __init sk_init(void)
983{
984	if (num_physpages <= 4096) {
985		sysctl_wmem_max = 32767;
986		sysctl_rmem_max = 32767;
987		sysctl_wmem_default = 32767;
988		sysctl_rmem_default = 32767;
989	} else if (num_physpages >= 131072) {
990		sysctl_wmem_max = 131071;
991		sysctl_rmem_max = 131071;
992	}
993}
994
995/*
996 *	Simple resource managers for sockets.
997 */
998
999
1000/*
1001 * Write buffer destructor automatically called from kfree_skb.
1002 */
1003void sock_wfree(struct sk_buff *skb)
1004{
1005	struct sock *sk = skb->sk;
1006
1007	/* In case it might be waiting for more memory. */
1008	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1009	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1010		sk->sk_write_space(sk);
1011	sock_put(sk);
1012}
1013
1014/*
1015 * Read buffer destructor automatically called from kfree_skb.
1016 */
1017void sock_rfree(struct sk_buff *skb)
1018{
1019	struct sock *sk = skb->sk;
1020
1021	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1022}
1023
1024
1025int sock_i_uid(struct sock *sk)
1026{
1027	int uid;
1028
1029	read_lock(&sk->sk_callback_lock);
1030	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1031	read_unlock(&sk->sk_callback_lock);
1032	return uid;
1033}
1034
1035unsigned long sock_i_ino(struct sock *sk)
1036{
1037	unsigned long ino;
1038
1039	read_lock(&sk->sk_callback_lock);
1040	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1041	read_unlock(&sk->sk_callback_lock);
1042	return ino;
1043}
1044
1045/*
1046 * Allocate a skb from the socket's send buffer.
1047 */
1048struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1049			     gfp_t priority)
1050{
1051	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1052		struct sk_buff * skb = alloc_skb(size, priority);
1053		if (skb) {
1054			skb_set_owner_w(skb, sk);
1055			return skb;
1056		}
1057	}
1058	return NULL;
1059}
1060
1061/*
1062 * Allocate a skb from the socket's receive buffer.
1063 */
1064struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1065			     gfp_t priority)
1066{
1067	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1068		struct sk_buff *skb = alloc_skb(size, priority);
1069		if (skb) {
1070			skb_set_owner_r(skb, sk);
1071			return skb;
1072		}
1073	}
1074	return NULL;
1075}
1076
1077/*
1078 * Allocate a memory block from the socket's option memory buffer.
1079 */
1080void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1081{
1082	if ((unsigned)size <= sysctl_optmem_max &&
1083	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1084		void *mem;
1085		/* First do the add, to avoid the race if kmalloc
1086 		 * might sleep.
1087		 */
1088		atomic_add(size, &sk->sk_omem_alloc);
1089		mem = kmalloc(size, priority);
1090		if (mem)
1091			return mem;
1092		atomic_sub(size, &sk->sk_omem_alloc);
1093	}
1094	return NULL;
1095}
1096
1097/*
1098 * Free an option memory block.
1099 */
1100void sock_kfree_s(struct sock *sk, void *mem, int size)
1101{
1102	kfree(mem);
1103	atomic_sub(size, &sk->sk_omem_alloc);
1104}
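/* sock_kmalloc()/sock_kfree_s() must be used as a pair with the same size,
 * otherwise sk_omem_alloc drifts and sk_free() reports an optmem leak.
 * A sketch of typical usage (names are illustrative only):
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */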
1105
1106/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1107   I think these locks should be removed for datagram sockets.
1108 */
1109static long sock_wait_for_wmem(struct sock * sk, long timeo)
1110{
1111	DEFINE_WAIT(wait);
1112
1113	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1114	for (;;) {
1115		if (!timeo)
1116			break;
1117		if (signal_pending(current))
1118			break;
1119		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1120		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1121		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1122			break;
1123		if (sk->sk_shutdown & SEND_SHUTDOWN)
1124			break;
1125		if (sk->sk_err)
1126			break;
1127		timeo = schedule_timeout(timeo);
1128	}
1129	finish_wait(sk->sk_sleep, &wait);
1130	return timeo;
1131}
1132
1133
1134/*
1135 *	Generic send/receive buffer handlers
1136 */
1137
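/* Allocate a send buffer for @sk, blocking (subject to the socket's send
 * timeout) until sk_wmem_alloc drops below sk_sndbuf.  @header_len bytes go
 * into the linear part of the skb; @data_len, if non-zero, is spread over
 * page fragments one page at a time.  On failure NULL is returned and the
 * reason is stored in *@errcode.
 */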
1138static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1139					    unsigned long header_len,
1140					    unsigned long data_len,
1141					    int noblock, int *errcode)
1142{
1143	struct sk_buff *skb;
1144	gfp_t gfp_mask;
1145	long timeo;
1146	int err;
1147
1148	gfp_mask = sk->sk_allocation;
1149	if (gfp_mask & __GFP_WAIT)
1150		gfp_mask |= __GFP_REPEAT;
1151
1152	timeo = sock_sndtimeo(sk, noblock);
1153	while (1) {
1154		err = sock_error(sk);
1155		if (err != 0)
1156			goto failure;
1157
1158		err = -EPIPE;
1159		if (sk->sk_shutdown & SEND_SHUTDOWN)
1160			goto failure;
1161
1162		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1163			skb = alloc_skb(header_len, sk->sk_allocation);
1164			if (skb) {
1165				int npages;
1166				int i;
1167
1168				/* No pages, we're done... */
1169				if (!data_len)
1170					break;
1171
1172				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1173				skb->truesize += data_len;
1174				skb_shinfo(skb)->nr_frags = npages;
1175				for (i = 0; i < npages; i++) {
1176					struct page *page;
1177					skb_frag_t *frag;
1178
1179					page = alloc_pages(sk->sk_allocation, 0);
1180					if (!page) {
1181						err = -ENOBUFS;
1182						skb_shinfo(skb)->nr_frags = i;
1183						kfree_skb(skb);
1184						goto failure;
1185					}
1186
1187					frag = &skb_shinfo(skb)->frags[i];
1188					frag->page = page;
1189					frag->page_offset = 0;
1190					frag->size = (data_len >= PAGE_SIZE ?
1191						      PAGE_SIZE :
1192						      data_len);
1193					data_len -= PAGE_SIZE;
1194				}
1195
1196				/* Full success... */
1197				break;
1198			}
1199			err = -ENOBUFS;
1200			goto failure;
1201		}
1202		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1203		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1204		err = -EAGAIN;
1205		if (!timeo)
1206			goto failure;
1207		if (signal_pending(current))
1208			goto interrupted;
1209		timeo = sock_wait_for_wmem(sk, timeo);
1210	}
1211
1212	skb_set_owner_w(skb, sk);
1213	return skb;
1214
1215interrupted:
1216	err = sock_intr_errno(timeo);
1217failure:
1218	*errcode = err;
1219	return NULL;
1220}
1221
1222struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1223				    int noblock, int *errcode)
1224{
1225	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1226}
1227
1228static void __lock_sock(struct sock *sk)
1229{
1230	DEFINE_WAIT(wait);
1231
1232	for(;;) {
1233		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1234					TASK_UNINTERRUPTIBLE);
1235		spin_unlock_bh(&sk->sk_lock.slock);
1236		schedule();
1237		spin_lock_bh(&sk->sk_lock.slock);
1238		if(!sock_owned_by_user(sk))
1239			break;
1240	}
1241	finish_wait(&sk->sk_lock.wq, &wait);
1242}
1243
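/* Flush the backlog that softirq context queued via sk_add_backlog() while
 * the socket was owned by the user.  Each skb is fed to sk_backlog_rcv()
 * with the spinlock dropped but the socket still logically locked, and the
 * loop repeats until no new packets have been appended in the meantime.
 */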
1244static void __release_sock(struct sock *sk)
1245{
1246	struct sk_buff *skb = sk->sk_backlog.head;
1247
1248	do {
1249		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1250		bh_unlock_sock(sk);
1251
1252		do {
1253			struct sk_buff *next = skb->next;
1254
1255			skb->next = NULL;
1256			sk->sk_backlog_rcv(sk, skb);
1257
1258			/*
1259			 * We are in process context here with softirqs
1260			 * disabled, use cond_resched_softirq() to preempt.
1261			 * This is safe to do because we've taken the backlog
1262			 * queue private:
1263			 */
1264			cond_resched_softirq();
1265
1266			skb = next;
1267		} while (skb != NULL);
1268
1269		bh_lock_sock(sk);
1270	} while((skb = sk->sk_backlog.head) != NULL);
1271}
1272
1273/**
1274 * sk_wait_data - wait for data to arrive at sk_receive_queue
1275 * @sk:    sock to wait on
1276 * @timeo: for how long
1277 *
1278 * Now socket state including sk->sk_err is changed only under lock,
1279 * hence we may omit checks after joining wait queue.
1280 * We check receive queue before schedule() only as optimization;
1281 * it is very likely that release_sock() added new data.
1282 */
1283int sk_wait_data(struct sock *sk, long *timeo)
1284{
1285	int rc;
1286	DEFINE_WAIT(wait);
1287
1288	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1289	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1290	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1291	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1292	finish_wait(sk->sk_sleep, &wait);
1293	return rc;
1294}
1295
1296EXPORT_SYMBOL(sk_wait_data);
1297
1298/*
1299 * Set of default routines for initialising struct proto_ops when
1300 * the protocol does not support a particular function. In certain
1301 * cases where it makes no sense for a protocol to have a "do nothing"
1302 * function, some default processing is provided.
1303 */
1304
1305int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1306{
1307	return -EOPNOTSUPP;
1308}
1309
1310int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1311		    int len, int flags)
1312{
1313	return -EOPNOTSUPP;
1314}
1315
1316int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1317{
1318	return -EOPNOTSUPP;
1319}
1320
1321int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1322{
1323	return -EOPNOTSUPP;
1324}
1325
1326int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1327		    int *len, int peer)
1328{
1329	return -EOPNOTSUPP;
1330}
1331
1332unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1333{
1334	return 0;
1335}
1336
1337int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1338{
1339	return -EOPNOTSUPP;
1340}
1341
1342int sock_no_listen(struct socket *sock, int backlog)
1343{
1344	return -EOPNOTSUPP;
1345}
1346
1347int sock_no_shutdown(struct socket *sock, int how)
1348{
1349	return -EOPNOTSUPP;
1350}
1351
1352int sock_no_setsockopt(struct socket *sock, int level, int optname,
1353		    char __user *optval, int optlen)
1354{
1355	return -EOPNOTSUPP;
1356}
1357
1358int sock_no_getsockopt(struct socket *sock, int level, int optname,
1359		    char __user *optval, int __user *optlen)
1360{
1361	return -EOPNOTSUPP;
1362}
1363
1364int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1365		    size_t len)
1366{
1367	return -EOPNOTSUPP;
1368}
1369
1370int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1371		    size_t len, int flags)
1372{
1373	return -EOPNOTSUPP;
1374}
1375
1376int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1377{
1378	/* Mirror missing mmap method error code */
1379	return -ENODEV;
1380}
1381
1382ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1383{
1384	ssize_t res;
1385	struct msghdr msg = {.msg_flags = flags};
1386	struct kvec iov;
1387	char *kaddr = kmap(page);
1388	iov.iov_base = kaddr + offset;
1389	iov.iov_len = size;
1390	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1391	kunmap(page);
1392	return res;
1393}
1394
1395/*
1396 *	Default Socket Callbacks
1397 */
1398
1399static void sock_def_wakeup(struct sock *sk)
1400{
1401	read_lock(&sk->sk_callback_lock);
1402	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1403		wake_up_interruptible_all(sk->sk_sleep);
1404	read_unlock(&sk->sk_callback_lock);
1405}
1406
1407static void sock_def_error_report(struct sock *sk)
1408{
1409	read_lock(&sk->sk_callback_lock);
1410	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1411		wake_up_interruptible(sk->sk_sleep);
1412	sk_wake_async(sk,0,POLL_ERR);
1413	read_unlock(&sk->sk_callback_lock);
1414}
1415
1416static void sock_def_readable(struct sock *sk, int len)
1417{
1418	read_lock(&sk->sk_callback_lock);
1419	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1420		wake_up_interruptible(sk->sk_sleep);
1421	sk_wake_async(sk,1,POLL_IN);
1422	read_unlock(&sk->sk_callback_lock);
1423}
1424
1425static void sock_def_write_space(struct sock *sk)
1426{
1427	read_lock(&sk->sk_callback_lock);
1428
1429	/* Do not wake up a writer until he can make "significant"
1430	 * progress.  --DaveM
1431	 */
1432	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1433		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1434			wake_up_interruptible(sk->sk_sleep);
1435
1436		/* Should agree with poll, otherwise some programs break */
1437		if (sock_writeable(sk))
1438			sk_wake_async(sk, 2, POLL_OUT);
1439	}
1440
1441	read_unlock(&sk->sk_callback_lock);
1442}
1443
1444static void sock_def_destruct(struct sock *sk)
1445{
1446	kfree(sk->sk_protinfo);
1447}
1448
1449void sk_send_sigurg(struct sock *sk)
1450{
1451	if (sk->sk_socket && sk->sk_socket->file)
1452		if (send_sigurg(&sk->sk_socket->file->f_owner))
1453			sk_wake_async(sk, 3, POLL_PRI);
1454}
1455
1456void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1457		    unsigned long expires)
1458{
1459	if (!mod_timer(timer, expires))
1460		sock_hold(sk);
1461}
1462
1463EXPORT_SYMBOL(sk_reset_timer);
1464
1465void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1466{
1467	if (timer_pending(timer) && del_timer(timer))
1468		__sock_put(sk);
1469}
1470
1471EXPORT_SYMBOL(sk_stop_timer);
1472
1473void sock_init_data(struct socket *sock, struct sock *sk)
1474{
1475	skb_queue_head_init(&sk->sk_receive_queue);
1476	skb_queue_head_init(&sk->sk_write_queue);
1477	skb_queue_head_init(&sk->sk_error_queue);
1478#ifdef CONFIG_NET_DMA
1479	skb_queue_head_init(&sk->sk_async_wait_queue);
1480#endif
1481
1482	sk->sk_send_head	=	NULL;
1483
1484	init_timer(&sk->sk_timer);
1485
1486	sk->sk_allocation	=	GFP_KERNEL;
1487	sk->sk_rcvbuf		=	sysctl_rmem_default;
1488	sk->sk_sndbuf		=	sysctl_wmem_default;
1489	sk->sk_state		=	TCP_CLOSE;
1490	sk->sk_socket		=	sock;
1491
1492	sock_set_flag(sk, SOCK_ZAPPED);
1493
1494	if(sock)
1495	{
1496		sk->sk_type	=	sock->type;
1497		sk->sk_sleep	=	&sock->wait;
1498		sock->sk	=	sk;
1499	} else
1500		sk->sk_sleep	=	NULL;
1501
1502	rwlock_init(&sk->sk_dst_lock);
1503	rwlock_init(&sk->sk_callback_lock);
1504	lockdep_set_class(&sk->sk_callback_lock,
1505			   af_callback_keys + sk->sk_family);
1506
1507	sk->sk_state_change	=	sock_def_wakeup;
1508	sk->sk_data_ready	=	sock_def_readable;
1509	sk->sk_write_space	=	sock_def_write_space;
1510	sk->sk_error_report	=	sock_def_error_report;
1511	sk->sk_destruct		=	sock_def_destruct;
1512
1513	sk->sk_sndmsg_page	=	NULL;
1514	sk->sk_sndmsg_off	=	0;
1515
1516	sk->sk_peercred.pid 	=	0;
1517	sk->sk_peercred.uid	=	-1;
1518	sk->sk_peercred.gid	=	-1;
1519	sk->sk_write_pending	=	0;
1520	sk->sk_rcvlowat		=	1;
1521	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1522	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1523
1524	sk->sk_stamp.tv_sec     = -1L;
1525	sk->sk_stamp.tv_usec    = -1L;
1526
1527	atomic_set(&sk->sk_refcnt, 1);
1528}
1529
1530void fastcall lock_sock(struct sock *sk)
1531{
1532	might_sleep();
1533	spin_lock_bh(&sk->sk_lock.slock);
1534	if (sk->sk_lock.owner)
1535		__lock_sock(sk);
1536	sk->sk_lock.owner = (void *)1;
1537	spin_unlock(&sk->sk_lock.slock);
1538	/*
1539	 * The sk_lock has mutex_lock() semantics here:
1540	 */
1541	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
1542	local_bh_enable();
1543}
1544
1545EXPORT_SYMBOL(lock_sock);
1546
1547void fastcall release_sock(struct sock *sk)
1548{
1549	/*
1550	 * The sk_lock has mutex_unlock() semantics:
1551	 */
1552	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1553
1554	spin_lock_bh(&sk->sk_lock.slock);
1555	if (sk->sk_backlog.tail)
1556		__release_sock(sk);
1557	sk->sk_lock.owner = NULL;
1558	if (waitqueue_active(&sk->sk_lock.wq))
1559		wake_up(&sk->sk_lock.wq);
1560	spin_unlock_bh(&sk->sk_lock.slock);
1561}
1562EXPORT_SYMBOL(release_sock);
1563
1564int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1565{
1566	if (!sock_flag(sk, SOCK_TIMESTAMP))
1567		sock_enable_timestamp(sk);
1568	if (sk->sk_stamp.tv_sec == -1)
1569		return -ENOENT;
1570	if (sk->sk_stamp.tv_sec == 0)
1571		do_gettimeofday(&sk->sk_stamp);
1572	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1573		-EFAULT : 0;
1574}
1575EXPORT_SYMBOL(sock_get_timestamp);
1576
1577void sock_enable_timestamp(struct sock *sk)
1578{
1579	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1580		sock_set_flag(sk, SOCK_TIMESTAMP);
1581		net_enable_timestamp();
1582	}
1583}
1584EXPORT_SYMBOL(sock_enable_timestamp);
1585
1586/*
1587 *	Get a socket option on a socket.
1588 *
1589 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1590 *	asynchronous errors should be reported by getsockopt. We assume
1591 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1592 */
1593int sock_common_getsockopt(struct socket *sock, int level, int optname,
1594			   char __user *optval, int __user *optlen)
1595{
1596	struct sock *sk = sock->sk;
1597
1598	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1599}
1600
1601EXPORT_SYMBOL(sock_common_getsockopt);
1602
1603#ifdef CONFIG_COMPAT
1604int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1605				  char __user *optval, int __user *optlen)
1606{
1607	struct sock *sk = sock->sk;
1608
1609	if (sk->sk_prot->compat_getsockopt != NULL)
1610		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1611						      optval, optlen);
1612	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1613}
1614EXPORT_SYMBOL(compat_sock_common_getsockopt);
1615#endif
1616
1617int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1618			struct msghdr *msg, size_t size, int flags)
1619{
1620	struct sock *sk = sock->sk;
1621	int addr_len = 0;
1622	int err;
1623
1624	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1625				   flags & ~MSG_DONTWAIT, &addr_len);
1626	if (err >= 0)
1627		msg->msg_namelen = addr_len;
1628	return err;
1629}
1630
1631EXPORT_SYMBOL(sock_common_recvmsg);
1632
1633/*
1634 *	Set socket options on an inet socket.
1635 */
1636int sock_common_setsockopt(struct socket *sock, int level, int optname,
1637			   char __user *optval, int optlen)
1638{
1639	struct sock *sk = sock->sk;
1640
1641	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1642}
1643
1644EXPORT_SYMBOL(sock_common_setsockopt);
1645
1646#ifdef CONFIG_COMPAT
1647int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1648				  char __user *optval, int optlen)
1649{
1650	struct sock *sk = sock->sk;
1651
1652	if (sk->sk_prot->compat_setsockopt != NULL)
1653		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1654						      optval, optlen);
1655	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1656}
1657EXPORT_SYMBOL(compat_sock_common_setsockopt);
1658#endif
1659
1660void sk_common_release(struct sock *sk)
1661{
1662	if (sk->sk_prot->destroy)
1663		sk->sk_prot->destroy(sk);
1664
1665	/*
1666	 * Observation: when sock_common_release is called, processes have
1667	 * no access to the socket any more, but the network stack still does.
1668	 * Step one, detach it from networking:
1669	 *
1670	 * A. Remove from hash tables.
1671	 */
1672
1673	sk->sk_prot->unhash(sk);
1674
1675	/*
1676	 * At this point the socket cannot receive new packets, but it is possible
1677	 * that some packets are still in flight, because another CPU ran the
1678	 * receiver and did its hash table lookup before we unhashed the socket.
1679	 * They will reach the receive queue and be purged by the socket destructor.
1680	 *
1681	 * We also still have packets pending on the receive queue and, probably,
1682	 * our own packets waiting in device queues. sock_destroy will drain the
1683	 * receive queue, but transmitted packets will delay socket destruction
1684	 * until the last reference is released.
1685	 */
1686
1687	sock_orphan(sk);
1688
1689	xfrm_sk_free_policy(sk);
1690
1691	sk_refcnt_debug_release(sk);
1692	sock_put(sk);
1693}
1694
1695EXPORT_SYMBOL(sk_common_release);
1696
1697static DEFINE_RWLOCK(proto_list_lock);
1698static LIST_HEAD(proto_list);
1699
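/* Register @prot on the global proto_list (making it visible in
 * /proc/net/protocols) and, if @alloc_slab is set, create its sock slab
 * cache plus the optional request_sock and timewait_sock caches.  Returns 0
 * on success or -ENOBUFS if any cache cannot be created; callers undo this
 * with proto_unregister().
 */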
1700int proto_register(struct proto *prot, int alloc_slab)
1701{
1702	char *request_sock_slab_name = NULL;
1703	char *timewait_sock_slab_name;
1704	int rc = -ENOBUFS;
1705
1706	if (alloc_slab) {
1707		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1708					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1709
1710		if (prot->slab == NULL) {
1711			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1712			       prot->name);
1713			goto out;
1714		}
1715
1716		if (prot->rsk_prot != NULL) {
1717			static const char mask[] = "request_sock_%s";
1718
1719			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1720			if (request_sock_slab_name == NULL)
1721				goto out_free_sock_slab;
1722
1723			sprintf(request_sock_slab_name, mask, prot->name);
1724			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1725								 prot->rsk_prot->obj_size, 0,
1726								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1727
1728			if (prot->rsk_prot->slab == NULL) {
1729				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1730				       prot->name);
1731				goto out_free_request_sock_slab_name;
1732			}
1733		}
1734
1735		if (prot->twsk_prot != NULL) {
1736			static const char mask[] = "tw_sock_%s";
1737
1738			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1739
1740			if (timewait_sock_slab_name == NULL)
1741				goto out_free_request_sock_slab;
1742
1743			sprintf(timewait_sock_slab_name, mask, prot->name);
1744			prot->twsk_prot->twsk_slab =
1745				kmem_cache_create(timewait_sock_slab_name,
1746						  prot->twsk_prot->twsk_obj_size,
1747						  0, SLAB_HWCACHE_ALIGN,
1748						  NULL, NULL);
1749			if (prot->twsk_prot->twsk_slab == NULL)
1750				goto out_free_timewait_sock_slab_name;
1751		}
1752	}
1753
1754	write_lock(&proto_list_lock);
1755	list_add(&prot->node, &proto_list);
1756	write_unlock(&proto_list_lock);
1757	rc = 0;
1758out:
1759	return rc;
1760out_free_timewait_sock_slab_name:
1761	kfree(timewait_sock_slab_name);
1762out_free_request_sock_slab:
1763	if (prot->rsk_prot && prot->rsk_prot->slab) {
1764		kmem_cache_destroy(prot->rsk_prot->slab);
1765		prot->rsk_prot->slab = NULL;
1766	}
1767out_free_request_sock_slab_name:
1768	kfree(request_sock_slab_name);
1769out_free_sock_slab:
1770	kmem_cache_destroy(prot->slab);
1771	prot->slab = NULL;
1772	goto out;
1773}
1774
1775EXPORT_SYMBOL(proto_register);
1776
1777void proto_unregister(struct proto *prot)
1778{
1779	write_lock(&proto_list_lock);
1780	list_del(&prot->node);
1781	write_unlock(&proto_list_lock);
1782
1783	if (prot->slab != NULL) {
1784		kmem_cache_destroy(prot->slab);
1785		prot->slab = NULL;
1786	}
1787
1788	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1789		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1790
1791		kmem_cache_destroy(prot->rsk_prot->slab);
1792		kfree(name);
1793		prot->rsk_prot->slab = NULL;
1794	}
1795
1796	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1797		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1798
1799		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1800		kfree(name);
1801		prot->twsk_prot->twsk_slab = NULL;
1802	}
1803}
1804
1805EXPORT_SYMBOL(proto_unregister);
1806
1807#ifdef CONFIG_PROC_FS
1808static inline struct proto *__proto_head(void)
1809{
1810	return list_entry(proto_list.next, struct proto, node);
1811}
1812
1813static inline struct proto *proto_head(void)
1814{
1815	return list_empty(&proto_list) ? NULL : __proto_head();
1816}
1817
1818static inline struct proto *proto_next(struct proto *proto)
1819{
1820	return proto->node.next == &proto_list ? NULL :
1821		list_entry(proto->node.next, struct proto, node);
1822}
1823
1824static inline struct proto *proto_get_idx(loff_t pos)
1825{
1826	struct proto *proto;
1827	loff_t i = 0;
1828
1829	list_for_each_entry(proto, &proto_list, node)
1830		if (i++ == pos)
1831			goto out;
1832
1833	proto = NULL;
1834out:
1835	return proto;
1836}
1837
1838static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1839{
1840	read_lock(&proto_list_lock);
1841	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1842}
1843
1844static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1845{
1846	++*pos;
1847	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1848}
1849
1850static void proto_seq_stop(struct seq_file *seq, void *v)
1851{
1852	read_unlock(&proto_list_lock);
1853}
1854
1855static char proto_method_implemented(const void *method)
1856{
1857	return method == NULL ? 'n' : 'y';
1858}
1859
1860static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1861{
1862	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1863			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1864		   proto->name,
1865		   proto->obj_size,
1866		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1867		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1868		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1869		   proto->max_header,
1870		   proto->slab == NULL ? "no" : "yes",
1871		   module_name(proto->owner),
1872		   proto_method_implemented(proto->close),
1873		   proto_method_implemented(proto->connect),
1874		   proto_method_implemented(proto->disconnect),
1875		   proto_method_implemented(proto->accept),
1876		   proto_method_implemented(proto->ioctl),
1877		   proto_method_implemented(proto->init),
1878		   proto_method_implemented(proto->destroy),
1879		   proto_method_implemented(proto->shutdown),
1880		   proto_method_implemented(proto->setsockopt),
1881		   proto_method_implemented(proto->getsockopt),
1882		   proto_method_implemented(proto->sendmsg),
1883		   proto_method_implemented(proto->recvmsg),
1884		   proto_method_implemented(proto->sendpage),
1885		   proto_method_implemented(proto->bind),
1886		   proto_method_implemented(proto->backlog_rcv),
1887		   proto_method_implemented(proto->hash),
1888		   proto_method_implemented(proto->unhash),
1889		   proto_method_implemented(proto->get_port),
1890		   proto_method_implemented(proto->enter_memory_pressure));
1891}
1892
1893static int proto_seq_show(struct seq_file *seq, void *v)
1894{
1895	if (v == SEQ_START_TOKEN)
1896		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1897			   "protocol",
1898			   "size",
1899			   "sockets",
1900			   "memory",
1901			   "press",
1902			   "maxhdr",
1903			   "slab",
1904			   "module",
1905			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1906	else
1907		proto_seq_printf(seq, v);
1908	return 0;
1909}
1910
1911static struct seq_operations proto_seq_ops = {
1912	.start  = proto_seq_start,
1913	.next   = proto_seq_next,
1914	.stop   = proto_seq_stop,
1915	.show   = proto_seq_show,
1916};
1917
1918static int proto_seq_open(struct inode *inode, struct file *file)
1919{
1920	return seq_open(file, &proto_seq_ops);
1921}
1922
1923static struct file_operations proto_seq_fops = {
1924	.owner		= THIS_MODULE,
1925	.open		= proto_seq_open,
1926	.read		= seq_read,
1927	.llseek		= seq_lseek,
1928	.release	= seq_release,
1929};
1930
1931static int __init proto_init(void)
1932{
1933	/* register /proc/net/protocols */
1934	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1935}
1936
1937subsys_initcall(proto_init);
1938
1939#endif /* PROC_FS */
1940
1941EXPORT_SYMBOL(sk_alloc);
1942EXPORT_SYMBOL(sk_free);
1943EXPORT_SYMBOL(sk_send_sigurg);
1944EXPORT_SYMBOL(sock_alloc_send_skb);
1945EXPORT_SYMBOL(sock_init_data);
1946EXPORT_SYMBOL(sock_kfree_s);
1947EXPORT_SYMBOL(sock_kmalloc);
1948EXPORT_SYMBOL(sock_no_accept);
1949EXPORT_SYMBOL(sock_no_bind);
1950EXPORT_SYMBOL(sock_no_connect);
1951EXPORT_SYMBOL(sock_no_getname);
1952EXPORT_SYMBOL(sock_no_getsockopt);
1953EXPORT_SYMBOL(sock_no_ioctl);
1954EXPORT_SYMBOL(sock_no_listen);
1955EXPORT_SYMBOL(sock_no_mmap);
1956EXPORT_SYMBOL(sock_no_poll);
1957EXPORT_SYMBOL(sock_no_recvmsg);
1958EXPORT_SYMBOL(sock_no_sendmsg);
1959EXPORT_SYMBOL(sock_no_sendpage);
1960EXPORT_SYMBOL(sock_no_setsockopt);
1961EXPORT_SYMBOL(sock_no_shutdown);
1962EXPORT_SYMBOL(sock_no_socketpair);
1963EXPORT_SYMBOL(sock_rfree);
1964EXPORT_SYMBOL(sock_setsockopt);
1965EXPORT_SYMBOL(sock_wfree);
1966EXPORT_SYMBOL(sock_wmalloc);
1967EXPORT_SYMBOL(sock_i_uid);
1968EXPORT_SYMBOL(sock_i_ino);
1969EXPORT_SYMBOL(sysctl_optmem_max);
1970#ifdef CONFIG_SYSCTL
1971EXPORT_SYMBOL(sysctl_rmem_max);
1972EXPORT_SYMBOL(sysctl_wmem_max);
1973#endif
1974