sock.c revision f690808e17925fc45217eb22e8670902ecee5c1b
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors:	Ross Biro
13 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 *		Florian La Roche, <flla@stud.uni-sb.de>
15 *		Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 *		Alan Cox	: 	Numerous verify_area() problems
19 *		Alan Cox	:	Connecting on a connecting socket
20 *					now returns an error for tcp.
21 *		Alan Cox	:	sock->protocol is set correctly.
22 *					and is not sometimes left as 0.
23 *		Alan Cox	:	connect handles icmp errors on a
24 *					connect properly. Unfortunately there
25 *					is a restart syscall nasty there. I
26 *					can't match BSD without hacking the C
27 *					library. Ideas urgently sought!
28 *		Alan Cox	:	Disallow bind() to addresses that are
29 *					not ours - especially broadcast ones!!
30 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32 *					instead they leave that for the DESTROY timer.
33 *		Alan Cox	:	Clean up error flag in accept
34 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35 *					was buggy. Put a remove_sock() in the handler
36 *					for memory when we hit 0. Also altered the timer
37 *					code. The ACK stuff can wait and needs major
38 *					TCP layer surgery.
39 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40 *					and fixed timer/inet_bh race.
41 *		Alan Cox	:	Added zapped flag for TCP
42 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49 *	Pauline Middelink	:	identd support
50 *		Alan Cox	:	Fixed connect() taking signals I think.
51 *		Alan Cox	:	SO_LINGER supported
52 *		Alan Cox	:	Error reporting fixes
53 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54 *		Alan Cox	:	inet sockets don't set sk->type!
55 *		Alan Cox	:	Split socket option code
56 *		Alan Cox	:	Callbacks
57 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58 *		Alex		:	Removed restriction on inet fioctl
59 *		Alan Cox	:	Splitting INET from NET core
60 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62 *		Alan Cox	:	Split IP from generic code
63 *		Alan Cox	:	New kfree_skbmem()
64 *		Alan Cox	:	Make SO_DEBUG superuser only.
65 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66 *					(compatibility fix)
67 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68 *		Alan Cox	:	Allocator for a socket is settable.
69 *		Alan Cox	:	SO_ERROR includes soft errors.
70 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71 *		Alan Cox	: 	Generic socket allocation to make hooks
72 *					easier (suggested by Craig Metz).
73 *		Michael Pall	:	SO_ERROR returns positive errno again
74 *              Steve Whitehouse:       Added default destructor to free
75 *                                      protocol private data.
76 *              Steve Whitehouse:       Added various other default routines
77 *                                      common to several socket families.
78 *              Chris Evans     :       Call suser() check last on F_SETOWN
79 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81 *		Andi Kleen	:	Fix write_space callback
82 *		Chris Evans	:	Security fixes - signedness again
83 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 *		This program is free software; you can redistribute it and/or
89 *		modify it under the terms of the GNU General Public License
90 *		as published by the Free Software Foundation; either version
91 *		2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140#ifdef CONFIG_DEBUG_LOCK_ALLOC
141/*
142 * Make lock validator output more readable. (we pre-construct these
143 * strings at build time, so that runtime initialization of socket
144 * locks is fast):
145 */
146static const char *af_family_key_strings[AF_MAX+1] = {
147  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
148  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
149  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
150  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
151  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
152  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
153  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
154  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
155  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
156  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
157  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
158};
159static const char *af_family_slock_key_strings[AF_MAX+1] = {
160  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
161  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
162  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
163  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
164  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
165  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
166  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
167  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
168  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
169  "slock-27"       , "slock-28"          , "slock-29"          ,
170  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_MAX"
171};
172#endif
173
174/*
175 * sk_callback_lock locking rules are per-address-family,
176 * so split the lock classes by using a per-AF key:
177 */
178static struct lock_class_key af_callback_keys[AF_MAX];
179
180/* Take into consideration the size of the struct sk_buff overhead in the
181 * determination of these values, since that is non-constant across
182 * platforms.  This makes socket queueing behavior and performance
183 * not depend upon such differences.
184 */
185#define _SK_MEM_PACKETS		256
186#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
187#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
188#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
189
190/* Run time adjustable parameters. */
191__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
192__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
193__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
194__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
195
196/* Maximal space eaten by iovec or ancillary data plus some space */
197int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
198
199static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
200{
201	struct timeval tv;
202
203	if (optlen < sizeof(tv))
204		return -EINVAL;
205	if (copy_from_user(&tv, optval, sizeof(tv)))
206		return -EFAULT;
207
208	*timeo_p = MAX_SCHEDULE_TIMEOUT;
209	if (tv.tv_sec == 0 && tv.tv_usec == 0)
210		return 0;
211	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
212		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
213	return 0;
214}
215
216static void sock_warn_obsolete_bsdism(const char *name)
217{
218	static int warned;
219	static char warncomm[TASK_COMM_LEN];
220	if (strcmp(warncomm, current->comm) && warned < 5) {
221		strcpy(warncomm,  current->comm);
222		printk(KERN_WARNING "process `%s' is using obsolete "
223		       "%s SO_BSDCOMPAT\n", warncomm, name);
224		warned++;
225	}
226}
227
228static void sock_disable_timestamp(struct sock *sk)
229{
230	if (sock_flag(sk, SOCK_TIMESTAMP)) {
231		sock_reset_flag(sk, SOCK_TIMESTAMP);
232		net_disable_timestamp();
233	}
234}
235
236
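/*
 * Queue a received skb on sk's receive queue, charging it against the
 * socket's receive buffer limit and running the attached socket filter
 * first.  Returns 0 on success, -ENOMEM if the receive buffer is full,
 * or the filter's error code; on success the data-ready callback is
 * invoked unless the socket is already dead.
 */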
237int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
238{
239	int err = 0;
240	int skb_len;
241
242	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but it reduces
243	   the number of warnings when compiling with -W --ANK
244	 */
245	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
246	    (unsigned)sk->sk_rcvbuf) {
247		err = -ENOMEM;
248		goto out;
249	}
250
251	err = sk_filter(sk, skb);
252	if (err)
253		goto out;
254
255	skb->dev = NULL;
256	skb_set_owner_r(skb, sk);
257
258	/* Cache the SKB length before we tack it onto the receive
259	 * queue.  Once it is added it no longer belongs to us and
260	 * may be freed by other threads of control pulling packets
261	 * from the queue.
262	 */
263	skb_len = skb->len;
264
265	skb_queue_tail(&sk->sk_receive_queue, skb);
266
267	if (!sock_flag(sk, SOCK_DEAD))
268		sk->sk_data_ready(sk, skb_len);
269out:
270	return err;
271}
272EXPORT_SYMBOL(sock_queue_rcv_skb);
273
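/*
 * Hand an skb to a socket from softirq context: run the socket filter,
 * then either process the skb immediately via sk_backlog_rcv() if the
 * socket is not owned by a user context, or append it to the backlog
 * for later processing by release_sock().  Drops a reference on sk and
 * frees the skb if the filter rejects it.
 */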
274int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
275{
276	int rc = NET_RX_SUCCESS;
277
278	if (sk_filter(sk, skb))
279		goto discard_and_relse;
280
281	skb->dev = NULL;
282
283	if (nested)
284		bh_lock_sock_nested(sk);
285	else
286		bh_lock_sock(sk);
287	if (!sock_owned_by_user(sk)) {
288		/*
289		 * trylock + unlock semantics:
290		 */
291		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
292
293		rc = sk->sk_backlog_rcv(sk, skb);
294
295		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
296	} else
297		sk_add_backlog(sk, skb);
298	bh_unlock_sock(sk);
299out:
300	sock_put(sk);
301	return rc;
302discard_and_relse:
303	kfree_skb(skb);
304	goto out;
305}
306EXPORT_SYMBOL(sk_receive_skb);
307
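/*
 * Validate the socket's cached destination entry.  If the cached dst has
 * been marked obsolete and its ->check() method says it is no longer
 * usable, drop it and return NULL; otherwise return whatever is cached.
 * __sk_dst_check() works on sk->sk_dst_cache directly, while
 * sk_dst_check() below takes its own reference via sk_dst_get().
 */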
308struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
309{
310	struct dst_entry *dst = sk->sk_dst_cache;
311
312	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
313		sk->sk_dst_cache = NULL;
314		dst_release(dst);
315		return NULL;
316	}
317
318	return dst;
319}
320EXPORT_SYMBOL(__sk_dst_check);
321
322struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
323{
324	struct dst_entry *dst = sk_dst_get(sk);
325
326	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
327		sk_dst_reset(sk);
328		dst_release(dst);
329		return NULL;
330	}
331
332	return dst;
333}
334EXPORT_SYMBOL(sk_dst_check);
335
336/*
337 *	This is meant for all protocols to use and covers goings on
338 *	at the socket level. Everything here is generic.
339 */
340
341int sock_setsockopt(struct socket *sock, int level, int optname,
342		    char __user *optval, int optlen)
343{
344	struct sock *sk=sock->sk;
345	struct sk_filter *filter;
346	int val;
347	int valbool;
348	struct linger ling;
349	int ret = 0;
350
351	/*
352	 *	Options without arguments
353	 */
354
355#ifdef SO_DONTLINGER		/* Compatibility item... */
356	if (optname == SO_DONTLINGER) {
357		lock_sock(sk);
358		sock_reset_flag(sk, SOCK_LINGER);
359		release_sock(sk);
360		return 0;
361	}
362#endif
363
364	if (optlen < sizeof(int))
365		return -EINVAL;
366
367	if (get_user(val, (int __user *)optval))
368		return -EFAULT;
369
370	valbool = val?1:0;
371
372	lock_sock(sk);
373
374	switch(optname) {
375	case SO_DEBUG:
376		if (val && !capable(CAP_NET_ADMIN)) {
377			ret = -EACCES;
378		}
379		else if (valbool)
380			sock_set_flag(sk, SOCK_DBG);
381		else
382			sock_reset_flag(sk, SOCK_DBG);
383		break;
384	case SO_REUSEADDR:
385		sk->sk_reuse = valbool;
386		break;
387	case SO_TYPE:
388	case SO_ERROR:
389		ret = -ENOPROTOOPT;
390		break;
391	case SO_DONTROUTE:
392		if (valbool)
393			sock_set_flag(sk, SOCK_LOCALROUTE);
394		else
395			sock_reset_flag(sk, SOCK_LOCALROUTE);
396		break;
397	case SO_BROADCAST:
398		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
399		break;
400	case SO_SNDBUF:
401		/* Don't return an error on this; BSD doesn't, and if you
402		   think about it this is right. Otherwise apps would have
403		   to play 'guess the biggest size' games. RCVBUF/SNDBUF
404		   are treated as hints in BSD */
405
406		if (val > sysctl_wmem_max)
407			val = sysctl_wmem_max;
408set_sndbuf:
409		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
410		if ((val * 2) < SOCK_MIN_SNDBUF)
411			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
412		else
413			sk->sk_sndbuf = val * 2;
414
415		/*
416		 *	Wake up sending tasks if we
417		 *	upped the value.
418		 */
419		sk->sk_write_space(sk);
420		break;
421
422	case SO_SNDBUFFORCE:
423		if (!capable(CAP_NET_ADMIN)) {
424			ret = -EPERM;
425			break;
426		}
427		goto set_sndbuf;
428
429	case SO_RCVBUF:
430		/* Don't return an error on this; BSD doesn't, and if you
431		   think about it this is right. Otherwise apps would have
432		   to play 'guess the biggest size' games. RCVBUF/SNDBUF
433		   are treated as hints in BSD */
434
435		if (val > sysctl_rmem_max)
436			val = sysctl_rmem_max;
437set_rcvbuf:
438		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
439		/*
440		 * We double it on the way in to account for
441		 * "struct sk_buff" etc. overhead.   Applications
442		 * assume that the SO_RCVBUF setting they make will
443		 * allow that much actual data to be received on that
444		 * socket.
445		 *
446		 * Applications are unaware that "struct sk_buff" and
447		 * other overheads allocate from the receive buffer
448		 * during socket buffer allocation.
449		 *
450		 * And after considering the possible alternatives,
451		 * returning the value we actually used in getsockopt
452		 * is the most desirable behavior.
453		 */
454		if ((val * 2) < SOCK_MIN_RCVBUF)
455			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
456		else
457			sk->sk_rcvbuf = val * 2;
458		break;
459
460	case SO_RCVBUFFORCE:
461		if (!capable(CAP_NET_ADMIN)) {
462			ret = -EPERM;
463			break;
464		}
465		goto set_rcvbuf;
466
467	case SO_KEEPALIVE:
468#ifdef CONFIG_INET
469		if (sk->sk_protocol == IPPROTO_TCP)
470			tcp_set_keepalive(sk, valbool);
471#endif
472		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
473		break;
474
475	case SO_OOBINLINE:
476		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
477		break;
478
479	case SO_NO_CHECK:
480		sk->sk_no_check = valbool;
481		break;
482
483	case SO_PRIORITY:
484		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
485			sk->sk_priority = val;
486		else
487			ret = -EPERM;
488		break;
489
490	case SO_LINGER:
491		if (optlen < sizeof(ling)) {
492			ret = -EINVAL;	/* 1003.1g */
493			break;
494		}
495		if (copy_from_user(&ling,optval,sizeof(ling))) {
496			ret = -EFAULT;
497			break;
498		}
499		if (!ling.l_onoff)
500			sock_reset_flag(sk, SOCK_LINGER);
501		else {
502#if (BITS_PER_LONG == 32)
503			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
504				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
505			else
506#endif
507				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
508			sock_set_flag(sk, SOCK_LINGER);
509		}
510		break;
511
512	case SO_BSDCOMPAT:
513		sock_warn_obsolete_bsdism("setsockopt");
514		break;
515
516	case SO_PASSCRED:
517		if (valbool)
518			set_bit(SOCK_PASSCRED, &sock->flags);
519		else
520			clear_bit(SOCK_PASSCRED, &sock->flags);
521		break;
522
523	case SO_TIMESTAMP:
524	case SO_TIMESTAMPNS:
525		if (valbool)  {
526			if (optname == SO_TIMESTAMP)
527				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
528			else
529				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
530			sock_set_flag(sk, SOCK_RCVTSTAMP);
531			sock_enable_timestamp(sk);
532		} else {
533			sock_reset_flag(sk, SOCK_RCVTSTAMP);
534			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
535		}
536		break;
537
538	case SO_RCVLOWAT:
539		if (val < 0)
540			val = INT_MAX;
541		sk->sk_rcvlowat = val ? : 1;
542		break;
543
544	case SO_RCVTIMEO:
545		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
546		break;
547
548	case SO_SNDTIMEO:
549		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
550		break;
551
552#ifdef CONFIG_NETDEVICES
553	case SO_BINDTODEVICE:
554	{
555		char devname[IFNAMSIZ];
556
557		/* Sorry... */
558		if (!capable(CAP_NET_RAW)) {
559			ret = -EPERM;
560			break;
561		}
562
563		/* Bind this socket to a particular device like "eth0",
564		 * as specified in the passed interface name. If the
565		 * name is "" or the option length is zero the socket
566		 * is not bound.
567		 */
568
569		if (!valbool) {
570			sk->sk_bound_dev_if = 0;
571		} else {
572			if (optlen > IFNAMSIZ - 1)
573				optlen = IFNAMSIZ - 1;
574			memset(devname, 0, sizeof(devname));
575			if (copy_from_user(devname, optval, optlen)) {
576				ret = -EFAULT;
577				break;
578			}
579
580			/* Remove any cached route for this socket. */
581			sk_dst_reset(sk);
582
583			if (devname[0] == '\0') {
584				sk->sk_bound_dev_if = 0;
585			} else {
586				struct net_device *dev = dev_get_by_name(devname);
587				if (!dev) {
588					ret = -ENODEV;
589					break;
590				}
591				sk->sk_bound_dev_if = dev->ifindex;
592				dev_put(dev);
593			}
594		}
595		break;
596	}
597#endif
598
599
600	case SO_ATTACH_FILTER:
601		ret = -EINVAL;
602		if (optlen == sizeof(struct sock_fprog)) {
603			struct sock_fprog fprog;
604
605			ret = -EFAULT;
606			if (copy_from_user(&fprog, optval, sizeof(fprog)))
607				break;
608
609			ret = sk_attach_filter(&fprog, sk);
610		}
611		break;
612
613	case SO_DETACH_FILTER:
614		rcu_read_lock_bh();
615		filter = rcu_dereference(sk->sk_filter);
616		if (filter) {
617			rcu_assign_pointer(sk->sk_filter, NULL);
618			sk_filter_release(sk, filter);
619			rcu_read_unlock_bh();
620			break;
621		}
622		rcu_read_unlock_bh();
623		ret = -ENONET;
624		break;
625
626	case SO_PASSSEC:
627		if (valbool)
628			set_bit(SOCK_PASSSEC, &sock->flags);
629		else
630			clear_bit(SOCK_PASSSEC, &sock->flags);
631		break;
632
633		/* We implement SO_SNDLOWAT etc. as not
634		   settable (1003.1g 5.3) */
635	default:
636		ret = -ENOPROTOOPT;
637		break;
638	}
639	release_sock(sk);
640	return ret;
641}
642
643
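/*
 * Generic SOL_SOCKET getsockopt handler, the read-side counterpart of
 * sock_setsockopt() above.  Results are staged in a small union and
 * copied back to userspace, truncated to the caller-supplied length.
 */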
644int sock_getsockopt(struct socket *sock, int level, int optname,
645		    char __user *optval, int __user *optlen)
646{
647	struct sock *sk = sock->sk;
648
649	union {
650		int val;
651		struct linger ling;
652		struct timeval tm;
653	} v;
654
655	unsigned int lv = sizeof(int);
656	int len;
657
658	if (get_user(len, optlen))
659		return -EFAULT;
660	if (len < 0)
661		return -EINVAL;
662
663	switch(optname) {
664	case SO_DEBUG:
665		v.val = sock_flag(sk, SOCK_DBG);
666		break;
667
668	case SO_DONTROUTE:
669		v.val = sock_flag(sk, SOCK_LOCALROUTE);
670		break;
671
672	case SO_BROADCAST:
673		v.val = !!sock_flag(sk, SOCK_BROADCAST);
674		break;
675
676	case SO_SNDBUF:
677		v.val = sk->sk_sndbuf;
678		break;
679
680	case SO_RCVBUF:
681		v.val = sk->sk_rcvbuf;
682		break;
683
684	case SO_REUSEADDR:
685		v.val = sk->sk_reuse;
686		break;
687
688	case SO_KEEPALIVE:
689		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
690		break;
691
692	case SO_TYPE:
693		v.val = sk->sk_type;
694		break;
695
696	case SO_ERROR:
697		v.val = -sock_error(sk);
698		if (v.val==0)
699			v.val = xchg(&sk->sk_err_soft, 0);
700		break;
701
702	case SO_OOBINLINE:
703		v.val = !!sock_flag(sk, SOCK_URGINLINE);
704		break;
705
706	case SO_NO_CHECK:
707		v.val = sk->sk_no_check;
708		break;
709
710	case SO_PRIORITY:
711		v.val = sk->sk_priority;
712		break;
713
714	case SO_LINGER:
715		lv		= sizeof(v.ling);
716		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
717		v.ling.l_linger	= sk->sk_lingertime / HZ;
718		break;
719
720	case SO_BSDCOMPAT:
721		sock_warn_obsolete_bsdism("getsockopt");
722		break;
723
724	case SO_TIMESTAMP:
725		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
726				!sock_flag(sk, SOCK_RCVTSTAMPNS);
727		break;
728
729	case SO_TIMESTAMPNS:
730		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
731		break;
732
733	case SO_RCVTIMEO:
734		lv=sizeof(struct timeval);
735		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
736			v.tm.tv_sec = 0;
737			v.tm.tv_usec = 0;
738		} else {
739			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
740			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
741		}
742		break;
743
744	case SO_SNDTIMEO:
745		lv=sizeof(struct timeval);
746		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
747			v.tm.tv_sec = 0;
748			v.tm.tv_usec = 0;
749		} else {
750			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
751			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
752		}
753		break;
754
755	case SO_RCVLOWAT:
756		v.val = sk->sk_rcvlowat;
757		break;
758
759	case SO_SNDLOWAT:
760		v.val=1;
761		break;
762
763	case SO_PASSCRED:
764		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
765		break;
766
767	case SO_PEERCRED:
768		if (len > sizeof(sk->sk_peercred))
769			len = sizeof(sk->sk_peercred);
770		if (copy_to_user(optval, &sk->sk_peercred, len))
771			return -EFAULT;
772		goto lenout;
773
774	case SO_PEERNAME:
775	{
776		char address[128];
777
778		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
779			return -ENOTCONN;
780		if (lv < len)
781			return -EINVAL;
782		if (copy_to_user(optval, address, len))
783			return -EFAULT;
784		goto lenout;
785	}
786
787	/* Dubious BSD thing... Probably nobody even uses it, but
788	 * the UNIX standard wants it for whatever reason... -DaveM
789	 */
790	case SO_ACCEPTCONN:
791		v.val = sk->sk_state == TCP_LISTEN;
792		break;
793
794	case SO_PASSSEC:
795		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
796		break;
797
798	case SO_PEERSEC:
799		return security_socket_getpeersec_stream(sock, optval, optlen, len);
800
801	default:
802		return -ENOPROTOOPT;
803	}
804
805	if (len > lv)
806		len = lv;
807	if (copy_to_user(optval, &v, len))
808		return -EFAULT;
809lenout:
810	if (put_user(len, optlen))
811		return -EFAULT;
812	return 0;
813}
814
815/*
816 * Initialize an sk_lock.
817 *
818 * (We also register the sk_lock with the lock validator.)
819 */
820static inline void sock_lock_init(struct sock *sk)
821{
822	sock_lock_init_class_and_name(sk,
823			af_family_slock_key_strings[sk->sk_family],
824			af_family_slock_keys + sk->sk_family,
825			af_family_key_strings[sk->sk_family],
826			af_family_keys + sk->sk_family);
827}
828
829/**
830 *	sk_alloc - All socket objects are allocated here
831 *	@family: protocol family
832 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
833 *	@prot: struct proto associated with this new sock instance
834 *	@zero_it: if we should zero the newly allocated sock
835 */
836struct sock *sk_alloc(int family, gfp_t priority,
837		      struct proto *prot, int zero_it)
838{
839	struct sock *sk = NULL;
840	struct kmem_cache *slab = prot->slab;
841
842	if (slab != NULL)
843		sk = kmem_cache_alloc(slab, priority);
844	else
845		sk = kmalloc(prot->obj_size, priority);
846
847	if (sk) {
848		if (zero_it) {
849			memset(sk, 0, prot->obj_size);
850			sk->sk_family = family;
851			/*
852			 * See comment in struct sock definition to understand
853			 * why we need sk_prot_creator -acme
854			 */
855			sk->sk_prot = sk->sk_prot_creator = prot;
856			sock_lock_init(sk);
857		}
858
859		if (security_sk_alloc(sk, family, priority))
860			goto out_free;
861
862		if (!try_module_get(prot->owner))
863			goto out_free;
864	}
865	return sk;
866
867out_free:
868	if (slab != NULL)
869		kmem_cache_free(slab, sk);
870	else
871		kfree(sk);
872	return NULL;
873}
874
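/*
 * Final destructor for a sock: invoke the protocol's sk_destruct hook,
 * drop any attached filter, disable timestamping, warn about leaked
 * option memory, and return the object to its slab cache (or kfree it),
 * releasing the owning module's reference.
 */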
875void sk_free(struct sock *sk)
876{
877	struct sk_filter *filter;
878	struct module *owner = sk->sk_prot_creator->owner;
879
880	if (sk->sk_destruct)
881		sk->sk_destruct(sk);
882
883	filter = rcu_dereference(sk->sk_filter);
884	if (filter) {
885		sk_filter_release(sk, filter);
886		rcu_assign_pointer(sk->sk_filter, NULL);
887	}
888
889	sock_disable_timestamp(sk);
890
891	if (atomic_read(&sk->sk_omem_alloc))
892		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
893		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
894
895	security_sk_free(sk);
896	if (sk->sk_prot_creator->slab != NULL)
897		kmem_cache_free(sk->sk_prot_creator->slab, sk);
898	else
899		kfree(sk);
900	module_put(owner);
901}
902
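/*
 * Duplicate a sock, as is done when creating a child socket for an
 * accepted connection: copy the parent, then reinitialize everything
 * that must be private to the child (locks, queues, memory accounting,
 * dst cache) and charge the cloned filter and xfrm policies.  Returns
 * the new sock with a reference count of 2, or NULL on failure.
 */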
903struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
904{
905	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
906
907	if (newsk != NULL) {
908		struct sk_filter *filter;
909
910		sock_copy(newsk, sk);
911
912		/* SANITY */
913		sk_node_init(&newsk->sk_node);
914		sock_lock_init(newsk);
915		bh_lock_sock(newsk);
916		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
917
918		atomic_set(&newsk->sk_rmem_alloc, 0);
919		atomic_set(&newsk->sk_wmem_alloc, 0);
920		atomic_set(&newsk->sk_omem_alloc, 0);
921		skb_queue_head_init(&newsk->sk_receive_queue);
922		skb_queue_head_init(&newsk->sk_write_queue);
923#ifdef CONFIG_NET_DMA
924		skb_queue_head_init(&newsk->sk_async_wait_queue);
925#endif
926
927		rwlock_init(&newsk->sk_dst_lock);
928		rwlock_init(&newsk->sk_callback_lock);
929		lockdep_set_class(&newsk->sk_callback_lock,
930				   af_callback_keys + newsk->sk_family);
931
932		newsk->sk_dst_cache	= NULL;
933		newsk->sk_wmem_queued	= 0;
934		newsk->sk_forward_alloc = 0;
935		newsk->sk_send_head	= NULL;
936		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
937
938		sock_reset_flag(newsk, SOCK_DONE);
939		skb_queue_head_init(&newsk->sk_error_queue);
940
941		filter = newsk->sk_filter;
942		if (filter != NULL)
943			sk_filter_charge(newsk, filter);
944
945		if (unlikely(xfrm_sk_clone_policy(newsk))) {
946			/* It is still a raw copy of the parent, so invalidate
947			 * the destructor and do a plain sk_free() */
948			newsk->sk_destruct = NULL;
949			sk_free(newsk);
950			newsk = NULL;
951			goto out;
952		}
953
954		newsk->sk_err	   = 0;
955		newsk->sk_priority = 0;
956		atomic_set(&newsk->sk_refcnt, 2);
957
958		/*
959		 * Increment the counter in the same struct proto as the master
960		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
961		 * is the same as sk->sk_prot->socks, as this field was copied
962		 * with memcpy).
963		 *
964		 * This _changes_ the previous behaviour, where
965		 * tcp_create_openreq_child always was incrementing the
966		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
967		 * to be taken into account in all callers. -acme
968		 */
969		sk_refcnt_debug_inc(newsk);
970		newsk->sk_socket = NULL;
971		newsk->sk_sleep	 = NULL;
972
973		if (newsk->sk_prot->sockets_allocated)
974			atomic_inc(newsk->sk_prot->sockets_allocated);
975	}
976out:
977	return newsk;
978}
979
980EXPORT_SYMBOL_GPL(sk_clone);
981
982void __init sk_init(void)
983{
984	if (num_physpages <= 4096) {
985		sysctl_wmem_max = 32767;
986		sysctl_rmem_max = 32767;
987		sysctl_wmem_default = 32767;
988		sysctl_rmem_default = 32767;
989	} else if (num_physpages >= 131072) {
990		sysctl_wmem_max = 131071;
991		sysctl_rmem_max = 131071;
992	}
993}
994
995/*
996 *	Simple resource managers for sockets.
997 */
998
999
1000/*
1001 * Write buffer destructor automatically called from kfree_skb.
1002 */
1003void sock_wfree(struct sk_buff *skb)
1004{
1005	struct sock *sk = skb->sk;
1006
1007	/* In case it might be waiting for more memory. */
1008	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1009	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1010		sk->sk_write_space(sk);
1011	sock_put(sk);
1012}
1013
1014/*
1015 * Read buffer destructor automatically called from kfree_skb.
1016 */
1017void sock_rfree(struct sk_buff *skb)
1018{
1019	struct sock *sk = skb->sk;
1020
1021	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1022}
1023
1024
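/*
 * sock_i_uid/sock_i_ino report the owning uid and inode number of the
 * struct socket backing this sock, taking sk_callback_lock so that
 * sk->sk_socket cannot change underneath us.  Orphaned sockets report 0.
 */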
1025int sock_i_uid(struct sock *sk)
1026{
1027	int uid;
1028
1029	read_lock(&sk->sk_callback_lock);
1030	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1031	read_unlock(&sk->sk_callback_lock);
1032	return uid;
1033}
1034
1035unsigned long sock_i_ino(struct sock *sk)
1036{
1037	unsigned long ino;
1038
1039	read_lock(&sk->sk_callback_lock);
1040	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1041	read_unlock(&sk->sk_callback_lock);
1042	return ino;
1043}
1044
1045/*
1046 * Allocate a skb from the socket's send buffer.
1047 */
1048struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1049			     gfp_t priority)
1050{
1051	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1052		struct sk_buff * skb = alloc_skb(size, priority);
1053		if (skb) {
1054			skb_set_owner_w(skb, sk);
1055			return skb;
1056		}
1057	}
1058	return NULL;
1059}
1060
1061/*
1062 * Allocate a skb from the socket's receive buffer.
1063 */
1064struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1065			     gfp_t priority)
1066{
1067	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1068		struct sk_buff *skb = alloc_skb(size, priority);
1069		if (skb) {
1070			skb_set_owner_r(skb, sk);
1071			return skb;
1072		}
1073	}
1074	return NULL;
1075}
1076
1077/*
1078 * Allocate a memory block from the socket's option memory buffer.
1079 */
1080void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1081{
1082	if ((unsigned)size <= sysctl_optmem_max &&
1083	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1084		void *mem;
1085		/* First do the add, to avoid the race if kmalloc
1086		 * might sleep.
1087		 */
1088		atomic_add(size, &sk->sk_omem_alloc);
1089		mem = kmalloc(size, priority);
1090		if (mem)
1091			return mem;
1092		atomic_sub(size, &sk->sk_omem_alloc);
1093	}
1094	return NULL;
1095}
1096
1097/*
1098 * Free an option memory block.
1099 */
1100void sock_kfree_s(struct sock *sk, void *mem, int size)
1101{
1102	kfree(mem);
1103	atomic_sub(size, &sk->sk_omem_alloc);
1104}
1105
1106/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1107   I think these locks should be removed for datagram sockets.
1108 */
1109static long sock_wait_for_wmem(struct sock * sk, long timeo)
1110{
1111	DEFINE_WAIT(wait);
1112
1113	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1114	for (;;) {
1115		if (!timeo)
1116			break;
1117		if (signal_pending(current))
1118			break;
1119		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1120		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1121		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1122			break;
1123		if (sk->sk_shutdown & SEND_SHUTDOWN)
1124			break;
1125		if (sk->sk_err)
1126			break;
1127		timeo = schedule_timeout(timeo);
1128	}
1129	finish_wait(sk->sk_sleep, &wait);
1130	return timeo;
1131}
1132
1133
1134/*
1135 *	Generic send/receive buffer handlers
1136 */
1137
1138static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1139					    unsigned long header_len,
1140					    unsigned long data_len,
1141					    int noblock, int *errcode)
1142{
1143	struct sk_buff *skb;
1144	gfp_t gfp_mask;
1145	long timeo;
1146	int err;
1147
1148	gfp_mask = sk->sk_allocation;
1149	if (gfp_mask & __GFP_WAIT)
1150		gfp_mask |= __GFP_REPEAT;
1151
1152	timeo = sock_sndtimeo(sk, noblock);
1153	while (1) {
1154		err = sock_error(sk);
1155		if (err != 0)
1156			goto failure;
1157
1158		err = -EPIPE;
1159		if (sk->sk_shutdown & SEND_SHUTDOWN)
1160			goto failure;
1161
1162		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1163			skb = alloc_skb(header_len, gfp_mask);
1164			if (skb) {
1165				int npages;
1166				int i;
1167
1168				/* No pages, we're done... */
1169				if (!data_len)
1170					break;
1171
1172				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1173				skb->truesize += data_len;
1174				skb_shinfo(skb)->nr_frags = npages;
1175				for (i = 0; i < npages; i++) {
1176					struct page *page;
1177					skb_frag_t *frag;
1178
1179					page = alloc_pages(sk->sk_allocation, 0);
1180					if (!page) {
1181						err = -ENOBUFS;
1182						skb_shinfo(skb)->nr_frags = i;
1183						kfree_skb(skb);
1184						goto failure;
1185					}
1186
1187					frag = &skb_shinfo(skb)->frags[i];
1188					frag->page = page;
1189					frag->page_offset = 0;
1190					frag->size = (data_len >= PAGE_SIZE ?
1191						      PAGE_SIZE :
1192						      data_len);
1193					data_len -= PAGE_SIZE;
1194				}
1195
1196				/* Full success... */
1197				break;
1198			}
1199			err = -ENOBUFS;
1200			goto failure;
1201		}
1202		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1203		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1204		err = -EAGAIN;
1205		if (!timeo)
1206			goto failure;
1207		if (signal_pending(current))
1208			goto interrupted;
1209		timeo = sock_wait_for_wmem(sk, timeo);
1210	}
1211
1212	skb_set_owner_w(skb, sk);
1213	return skb;
1214
1215interrupted:
1216	err = sock_intr_errno(timeo);
1217failure:
1218	*errcode = err;
1219	return NULL;
1220}
1221
1222struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1223				    int noblock, int *errcode)
1224{
1225	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1226}
1227
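/*
 * Sleep until the current socket lock owner releases the sock.  Called
 * with the socket spinlock held; the spinlock is dropped around
 * schedule() and re-taken before ownership is checked again.
 */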
1228static void __lock_sock(struct sock *sk)
1229{
1230	DEFINE_WAIT(wait);
1231
1232	for (;;) {
1233		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1234					TASK_UNINTERRUPTIBLE);
1235		spin_unlock_bh(&sk->sk_lock.slock);
1236		schedule();
1237		spin_lock_bh(&sk->sk_lock.slock);
1238		if (!sock_owned_by_user(sk))
1239			break;
1240	}
1241	finish_wait(&sk->sk_lock.wq, &wait);
1242}
1243
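/*
 * Process every skb that was queued on the backlog while the socket was
 * owned by a user context.  The backlog is detached in one go so the
 * socket spinlock can be dropped while each skb is fed to
 * sk_backlog_rcv(); the loop repeats until no new skbs have been queued.
 */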
1244static void __release_sock(struct sock *sk)
1245{
1246	struct sk_buff *skb = sk->sk_backlog.head;
1247
1248	do {
1249		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1250		bh_unlock_sock(sk);
1251
1252		do {
1253			struct sk_buff *next = skb->next;
1254
1255			skb->next = NULL;
1256			sk->sk_backlog_rcv(sk, skb);
1257
1258			/*
1259			 * We are in process context here with softirqs
1260			 * disabled, use cond_resched_softirq() to preempt.
1261			 * This is safe to do because we've taken the backlog
1262			 * queue private:
1263			 */
1264			cond_resched_softirq();
1265
1266			skb = next;
1267		} while (skb != NULL);
1268
1269		bh_lock_sock(sk);
1270	} while ((skb = sk->sk_backlog.head) != NULL);
1271}
1272
1273/**
1274 * sk_wait_data - wait for data to arrive at sk_receive_queue
1275 * @sk:    sock to wait on
1276 * @timeo: for how long
1277 *
1278 * Socket state, including sk->sk_err, is changed only under the lock,
1279 * hence we may omit checks after joining the wait queue.
1280 * We check the receive queue before schedule() only as an optimization;
1281 * it is very likely that release_sock() added new data.
1282 */
1283int sk_wait_data(struct sock *sk, long *timeo)
1284{
1285	int rc;
1286	DEFINE_WAIT(wait);
1287
1288	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1289	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1290	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1291	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1292	finish_wait(sk->sk_sleep, &wait);
1293	return rc;
1294}
1295
1296EXPORT_SYMBOL(sk_wait_data);
1297
1298/*
1299 * Set of default routines for initialising struct proto_ops when
1300 * the protocol does not support a particular function. In certain
1301 * cases where it makes no sense for a protocol to have a "do nothing"
1302 * function, some default processing is provided.
1303 */
1304
1305int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1306{
1307	return -EOPNOTSUPP;
1308}
1309
1310int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1311		    int len, int flags)
1312{
1313	return -EOPNOTSUPP;
1314}
1315
1316int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1317{
1318	return -EOPNOTSUPP;
1319}
1320
1321int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1322{
1323	return -EOPNOTSUPP;
1324}
1325
1326int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1327		    int *len, int peer)
1328{
1329	return -EOPNOTSUPP;
1330}
1331
1332unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1333{
1334	return 0;
1335}
1336
1337int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1338{
1339	return -EOPNOTSUPP;
1340}
1341
1342int sock_no_listen(struct socket *sock, int backlog)
1343{
1344	return -EOPNOTSUPP;
1345}
1346
1347int sock_no_shutdown(struct socket *sock, int how)
1348{
1349	return -EOPNOTSUPP;
1350}
1351
1352int sock_no_setsockopt(struct socket *sock, int level, int optname,
1353		    char __user *optval, int optlen)
1354{
1355	return -EOPNOTSUPP;
1356}
1357
1358int sock_no_getsockopt(struct socket *sock, int level, int optname,
1359		    char __user *optval, int __user *optlen)
1360{
1361	return -EOPNOTSUPP;
1362}
1363
1364int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1365		    size_t len)
1366{
1367	return -EOPNOTSUPP;
1368}
1369
1370int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1371		    size_t len, int flags)
1372{
1373	return -EOPNOTSUPP;
1374}
1375
1376int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1377{
1378	/* Mirror missing mmap method error code */
1379	return -ENODEV;
1380}
1381
1382ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1383{
1384	ssize_t res;
1385	struct msghdr msg = {.msg_flags = flags};
1386	struct kvec iov;
1387	char *kaddr = kmap(page);
1388	iov.iov_base = kaddr + offset;
1389	iov.iov_len = size;
1390	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1391	kunmap(page);
1392	return res;
1393}
1394
1395/*
1396 *	Default Socket Callbacks
1397 */
1398
1399static void sock_def_wakeup(struct sock *sk)
1400{
1401	read_lock(&sk->sk_callback_lock);
1402	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1403		wake_up_interruptible_all(sk->sk_sleep);
1404	read_unlock(&sk->sk_callback_lock);
1405}
1406
1407static void sock_def_error_report(struct sock *sk)
1408{
1409	read_lock(&sk->sk_callback_lock);
1410	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1411		wake_up_interruptible(sk->sk_sleep);
1412	sk_wake_async(sk,0,POLL_ERR);
1413	read_unlock(&sk->sk_callback_lock);
1414}
1415
1416static void sock_def_readable(struct sock *sk, int len)
1417{
1418	read_lock(&sk->sk_callback_lock);
1419	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1420		wake_up_interruptible(sk->sk_sleep);
1421	sk_wake_async(sk,1,POLL_IN);
1422	read_unlock(&sk->sk_callback_lock);
1423}
1424
1425static void sock_def_write_space(struct sock *sk)
1426{
1427	read_lock(&sk->sk_callback_lock);
1428
1429	/* Do not wake up a writer until he can make "significant"
1430	 * progress.  --DaveM
1431	 */
1432	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1433		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1434			wake_up_interruptible(sk->sk_sleep);
1435
1436		/* Should agree with poll, otherwise some programs break */
1437		if (sock_writeable(sk))
1438			sk_wake_async(sk, 2, POLL_OUT);
1439	}
1440
1441	read_unlock(&sk->sk_callback_lock);
1442}
1443
1444static void sock_def_destruct(struct sock *sk)
1445{
1446	kfree(sk->sk_protinfo);
1447}
1448
1449void sk_send_sigurg(struct sock *sk)
1450{
1451	if (sk->sk_socket && sk->sk_socket->file)
1452		if (send_sigurg(&sk->sk_socket->file->f_owner))
1453			sk_wake_async(sk, 3, POLL_PRI);
1454}
1455
1456void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1457		    unsigned long expires)
1458{
1459	if (!mod_timer(timer, expires))
1460		sock_hold(sk);
1461}
1462
1463EXPORT_SYMBOL(sk_reset_timer);
1464
1465void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1466{
1467	if (timer_pending(timer) && del_timer(timer))
1468		__sock_put(sk);
1469}
1470
1471EXPORT_SYMBOL(sk_stop_timer);
1472
1473void sock_init_data(struct socket *sock, struct sock *sk)
1474{
1475	skb_queue_head_init(&sk->sk_receive_queue);
1476	skb_queue_head_init(&sk->sk_write_queue);
1477	skb_queue_head_init(&sk->sk_error_queue);
1478#ifdef CONFIG_NET_DMA
1479	skb_queue_head_init(&sk->sk_async_wait_queue);
1480#endif
1481
1482	sk->sk_send_head	=	NULL;
1483
1484	init_timer(&sk->sk_timer);
1485
1486	sk->sk_allocation	=	GFP_KERNEL;
1487	sk->sk_rcvbuf		=	sysctl_rmem_default;
1488	sk->sk_sndbuf		=	sysctl_wmem_default;
1489	sk->sk_state		=	TCP_CLOSE;
1490	sk->sk_socket		=	sock;
1491
1492	sock_set_flag(sk, SOCK_ZAPPED);
1493
1494	if (sock) {
1495		sk->sk_type	=	sock->type;
1496		sk->sk_sleep	=	&sock->wait;
1497		sock->sk	=	sk;
1498	} else
1499		sk->sk_sleep	=	NULL;
1500
1501	rwlock_init(&sk->sk_dst_lock);
1502	rwlock_init(&sk->sk_callback_lock);
1503	lockdep_set_class(&sk->sk_callback_lock,
1504			   af_callback_keys + sk->sk_family);
1505
1506	sk->sk_state_change	=	sock_def_wakeup;
1507	sk->sk_data_ready	=	sock_def_readable;
1508	sk->sk_write_space	=	sock_def_write_space;
1509	sk->sk_error_report	=	sock_def_error_report;
1510	sk->sk_destruct		=	sock_def_destruct;
1511
1512	sk->sk_sndmsg_page	=	NULL;
1513	sk->sk_sndmsg_off	=	0;
1514
1515	sk->sk_peercred.pid 	=	0;
1516	sk->sk_peercred.uid	=	-1;
1517	sk->sk_peercred.gid	=	-1;
1518	sk->sk_write_pending	=	0;
1519	sk->sk_rcvlowat		=	1;
1520	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1521	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1522
1523	sk->sk_stamp = ktime_set(-1L, -1L);
1524
1525	atomic_set(&sk->sk_refcnt, 1);
1526}
1527
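/*
 * Acquire the socket lock in process context.  The spinlock is taken,
 * ownership is claimed (sleeping in __lock_sock() if another process
 * context already owns the sock), and the acquisition is recorded with
 * lockdep using mutex semantics before BHs are re-enabled.
 */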
1528void fastcall lock_sock_nested(struct sock *sk, int subclass)
1529{
1530	might_sleep();
1531	spin_lock_bh(&sk->sk_lock.slock);
1532	if (sk->sk_lock.owner)
1533		__lock_sock(sk);
1534	sk->sk_lock.owner = (void *)1;
1535	spin_unlock(&sk->sk_lock.slock);
1536	/*
1537	 * The sk_lock has mutex_lock() semantics here:
1538	 */
1539	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1540	local_bh_enable();
1541}
1542
1543EXPORT_SYMBOL(lock_sock_nested);
1544
1545void fastcall release_sock(struct sock *sk)
1546{
1547	/*
1548	 * The sk_lock has mutex_unlock() semantics:
1549	 */
1550	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1551
1552	spin_lock_bh(&sk->sk_lock.slock);
1553	if (sk->sk_backlog.tail)
1554		__release_sock(sk);
1555	sk->sk_lock.owner = NULL;
1556	if (waitqueue_active(&sk->sk_lock.wq))
1557		wake_up(&sk->sk_lock.wq);
1558	spin_unlock_bh(&sk->sk_lock.slock);
1559}
1560EXPORT_SYMBOL(release_sock);
1561
1562int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1563{
1564	struct timeval tv;
1565	if (!sock_flag(sk, SOCK_TIMESTAMP))
1566		sock_enable_timestamp(sk);
1567	tv = ktime_to_timeval(sk->sk_stamp);
1568	if (tv.tv_sec == -1)
1569		return -ENOENT;
1570	if (tv.tv_sec == 0) {
1571		sk->sk_stamp = ktime_get_real();
1572		tv = ktime_to_timeval(sk->sk_stamp);
1573	}
1574	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1575}
1576EXPORT_SYMBOL(sock_get_timestamp);
1577
1578int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1579{
1580	struct timespec ts;
1581	if (!sock_flag(sk, SOCK_TIMESTAMP))
1582		sock_enable_timestamp(sk);
1583	ts = ktime_to_timespec(sk->sk_stamp);
1584	if (ts.tv_sec == -1)
1585		return -ENOENT;
1586	if (ts.tv_sec == 0) {
1587		sk->sk_stamp = ktime_get_real();
1588		ts = ktime_to_timespec(sk->sk_stamp);
1589	}
1590	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1591}
1592EXPORT_SYMBOL(sock_get_timestampns);
1593
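/*
 * Turn on packet timestamping for this socket, bumping the global
 * net_enable_timestamp() count only the first time it is enabled.
 */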
1594void sock_enable_timestamp(struct sock *sk)
1595{
1596	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1597		sock_set_flag(sk, SOCK_TIMESTAMP);
1598		net_enable_timestamp();
1599	}
1600}
1601EXPORT_SYMBOL(sock_enable_timestamp);
1602
1603/*
1604 *	Get a socket option on a socket.
1605 *
1606 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1607 *	asynchronous errors should be reported by getsockopt. We assume
1608 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1609 */
1610int sock_common_getsockopt(struct socket *sock, int level, int optname,
1611			   char __user *optval, int __user *optlen)
1612{
1613	struct sock *sk = sock->sk;
1614
1615	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1616}
1617
1618EXPORT_SYMBOL(sock_common_getsockopt);
1619
1620#ifdef CONFIG_COMPAT
1621int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1622				  char __user *optval, int __user *optlen)
1623{
1624	struct sock *sk = sock->sk;
1625
1626	if (sk->sk_prot->compat_getsockopt != NULL)
1627		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1628						      optval, optlen);
1629	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1630}
1631EXPORT_SYMBOL(compat_sock_common_getsockopt);
1632#endif
1633
1634int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1635			struct msghdr *msg, size_t size, int flags)
1636{
1637	struct sock *sk = sock->sk;
1638	int addr_len = 0;
1639	int err;
1640
1641	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1642				   flags & ~MSG_DONTWAIT, &addr_len);
1643	if (err >= 0)
1644		msg->msg_namelen = addr_len;
1645	return err;
1646}
1647
1648EXPORT_SYMBOL(sock_common_recvmsg);
1649
1650/*
1651 *	Set socket options on an inet socket.
1652 */
1653int sock_common_setsockopt(struct socket *sock, int level, int optname,
1654			   char __user *optval, int optlen)
1655{
1656	struct sock *sk = sock->sk;
1657
1658	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1659}
1660
1661EXPORT_SYMBOL(sock_common_setsockopt);
1662
1663#ifdef CONFIG_COMPAT
1664int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1665				  char __user *optval, int optlen)
1666{
1667	struct sock *sk = sock->sk;
1668
1669	if (sk->sk_prot->compat_setsockopt != NULL)
1670		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1671						      optval, optlen);
1672	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1673}
1674EXPORT_SYMBOL(compat_sock_common_setsockopt);
1675#endif
1676
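/*
 * Common teardown path shared by protocols: run the protocol's destroy
 * hook, unhash the socket, detach it from its struct socket, free any
 * xfrm policies and drop the caller's reference.
 */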
1677void sk_common_release(struct sock *sk)
1678{
1679	if (sk->sk_prot->destroy)
1680		sk->sk_prot->destroy(sk);
1681
1682	/*
1683	 * Observation: when sock_common_release is called, processes have
1684	 * no access to the socket, but the network stack still does.
1685	 * Step one, detach it from networking:
1686	 *
1687	 * A. Remove from hash tables.
1688	 */
1689
1690	sk->sk_prot->unhash(sk);
1691
1692	/*
1693	 * At this point the socket cannot receive new packets, but some may
1694	 * still be in flight because some CPU is running the receiver and did
1695	 * the hash table lookup before we unhashed the socket. They will reach
1696	 * the receive queue and be purged by the socket destructor.
1697	 *
1698	 * We also still have packets pending on the receive queue and, probably,
1699	 * our own packets waiting in device queues. sock_destroy will drain the
1700	 * receive queue, but transmitted packets will delay socket destruction
1701	 * until the last reference is released.
1702	 */
1703
1704	sock_orphan(sk);
1705
1706	xfrm_sk_free_policy(sk);
1707
1708	sk_refcnt_debug_release(sk);
1709	sock_put(sk);
1710}
1711
1712EXPORT_SYMBOL(sk_common_release);
1713
1714static DEFINE_RWLOCK(proto_list_lock);
1715static LIST_HEAD(proto_list);
1716
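/*
 * Register a protocol with the core.  When alloc_slab is set, dedicated
 * slab caches are created for the protocol's socks and, if provided, its
 * request_sock and timewait_sock objects; failure to create any of them
 * unwinds the ones already created.  The proto is then added to the
 * global proto_list used by /proc/net/protocols.
 */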
1717int proto_register(struct proto *prot, int alloc_slab)
1718{
1719	char *request_sock_slab_name = NULL;
1720	char *timewait_sock_slab_name;
1721	int rc = -ENOBUFS;
1722
1723	if (alloc_slab) {
1724		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1725					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1726
1727		if (prot->slab == NULL) {
1728			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1729			       prot->name);
1730			goto out;
1731		}
1732
1733		if (prot->rsk_prot != NULL) {
1734			static const char mask[] = "request_sock_%s";
1735
1736			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1737			if (request_sock_slab_name == NULL)
1738				goto out_free_sock_slab;
1739
1740			sprintf(request_sock_slab_name, mask, prot->name);
1741			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1742								 prot->rsk_prot->obj_size, 0,
1743								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1744
1745			if (prot->rsk_prot->slab == NULL) {
1746				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1747				       prot->name);
1748				goto out_free_request_sock_slab_name;
1749			}
1750		}
1751
1752		if (prot->twsk_prot != NULL) {
1753			static const char mask[] = "tw_sock_%s";
1754
1755			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1756
1757			if (timewait_sock_slab_name == NULL)
1758				goto out_free_request_sock_slab;
1759
1760			sprintf(timewait_sock_slab_name, mask, prot->name);
1761			prot->twsk_prot->twsk_slab =
1762				kmem_cache_create(timewait_sock_slab_name,
1763						  prot->twsk_prot->twsk_obj_size,
1764						  0, SLAB_HWCACHE_ALIGN,
1765						  NULL, NULL);
1766			if (prot->twsk_prot->twsk_slab == NULL)
1767				goto out_free_timewait_sock_slab_name;
1768		}
1769	}
1770
1771	write_lock(&proto_list_lock);
1772	list_add(&prot->node, &proto_list);
1773	write_unlock(&proto_list_lock);
1774	rc = 0;
1775out:
1776	return rc;
1777out_free_timewait_sock_slab_name:
1778	kfree(timewait_sock_slab_name);
1779out_free_request_sock_slab:
1780	if (prot->rsk_prot && prot->rsk_prot->slab) {
1781		kmem_cache_destroy(prot->rsk_prot->slab);
1782		prot->rsk_prot->slab = NULL;
1783	}
1784out_free_request_sock_slab_name:
1785	kfree(request_sock_slab_name);
1786out_free_sock_slab:
1787	kmem_cache_destroy(prot->slab);
1788	prot->slab = NULL;
1789	goto out;
1790}
1791
1792EXPORT_SYMBOL(proto_register);
1793
1794void proto_unregister(struct proto *prot)
1795{
1796	write_lock(&proto_list_lock);
1797	list_del(&prot->node);
1798	write_unlock(&proto_list_lock);
1799
1800	if (prot->slab != NULL) {
1801		kmem_cache_destroy(prot->slab);
1802		prot->slab = NULL;
1803	}
1804
1805	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1806		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1807
1808		kmem_cache_destroy(prot->rsk_prot->slab);
1809		kfree(name);
1810		prot->rsk_prot->slab = NULL;
1811	}
1812
1813	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1814		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1815
1816		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1817		kfree(name);
1818		prot->twsk_prot->twsk_slab = NULL;
1819	}
1820}
1821
1822EXPORT_SYMBOL(proto_unregister);
1823
1824#ifdef CONFIG_PROC_FS
1825static inline struct proto *__proto_head(void)
1826{
1827	return list_entry(proto_list.next, struct proto, node);
1828}
1829
1830static inline struct proto *proto_head(void)
1831{
1832	return list_empty(&proto_list) ? NULL : __proto_head();
1833}
1834
1835static inline struct proto *proto_next(struct proto *proto)
1836{
1837	return proto->node.next == &proto_list ? NULL :
1838		list_entry(proto->node.next, struct proto, node);
1839}
1840
1841static inline struct proto *proto_get_idx(loff_t pos)
1842{
1843	struct proto *proto;
1844	loff_t i = 0;
1845
1846	list_for_each_entry(proto, &proto_list, node)
1847		if (i++ == pos)
1848			goto out;
1849
1850	proto = NULL;
1851out:
1852	return proto;
1853}
1854
1855static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1856{
1857	read_lock(&proto_list_lock);
1858	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1859}
1860
1861static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1862{
1863	++*pos;
1864	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1865}
1866
1867static void proto_seq_stop(struct seq_file *seq, void *v)
1868{
1869	read_unlock(&proto_list_lock);
1870}
1871
1872static char proto_method_implemented(const void *method)
1873{
1874	return method == NULL ? 'n' : 'y';
1875}
1876
1877static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1878{
1879	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1880			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1881		   proto->name,
1882		   proto->obj_size,
1883		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1884		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1885		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1886		   proto->max_header,
1887		   proto->slab == NULL ? "no" : "yes",
1888		   module_name(proto->owner),
1889		   proto_method_implemented(proto->close),
1890		   proto_method_implemented(proto->connect),
1891		   proto_method_implemented(proto->disconnect),
1892		   proto_method_implemented(proto->accept),
1893		   proto_method_implemented(proto->ioctl),
1894		   proto_method_implemented(proto->init),
1895		   proto_method_implemented(proto->destroy),
1896		   proto_method_implemented(proto->shutdown),
1897		   proto_method_implemented(proto->setsockopt),
1898		   proto_method_implemented(proto->getsockopt),
1899		   proto_method_implemented(proto->sendmsg),
1900		   proto_method_implemented(proto->recvmsg),
1901		   proto_method_implemented(proto->sendpage),
1902		   proto_method_implemented(proto->bind),
1903		   proto_method_implemented(proto->backlog_rcv),
1904		   proto_method_implemented(proto->hash),
1905		   proto_method_implemented(proto->unhash),
1906		   proto_method_implemented(proto->get_port),
1907		   proto_method_implemented(proto->enter_memory_pressure));
1908}
1909
1910static int proto_seq_show(struct seq_file *seq, void *v)
1911{
1912	if (v == SEQ_START_TOKEN)
1913		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1914			   "protocol",
1915			   "size",
1916			   "sockets",
1917			   "memory",
1918			   "press",
1919			   "maxhdr",
1920			   "slab",
1921			   "module",
1922			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1923	else
1924		proto_seq_printf(seq, v);
1925	return 0;
1926}
1927
1928static const struct seq_operations proto_seq_ops = {
1929	.start  = proto_seq_start,
1930	.next   = proto_seq_next,
1931	.stop   = proto_seq_stop,
1932	.show   = proto_seq_show,
1933};
1934
1935static int proto_seq_open(struct inode *inode, struct file *file)
1936{
1937	return seq_open(file, &proto_seq_ops);
1938}
1939
1940static const struct file_operations proto_seq_fops = {
1941	.owner		= THIS_MODULE,
1942	.open		= proto_seq_open,
1943	.read		= seq_read,
1944	.llseek		= seq_lseek,
1945	.release	= seq_release,
1946};
1947
1948static int __init proto_init(void)
1949{
1950	/* register /proc/net/protocols */
1951	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1952}
1953
1954subsys_initcall(proto_init);
1955
1956#endif /* PROC_FS */
1957
1958EXPORT_SYMBOL(sk_alloc);
1959EXPORT_SYMBOL(sk_free);
1960EXPORT_SYMBOL(sk_send_sigurg);
1961EXPORT_SYMBOL(sock_alloc_send_skb);
1962EXPORT_SYMBOL(sock_init_data);
1963EXPORT_SYMBOL(sock_kfree_s);
1964EXPORT_SYMBOL(sock_kmalloc);
1965EXPORT_SYMBOL(sock_no_accept);
1966EXPORT_SYMBOL(sock_no_bind);
1967EXPORT_SYMBOL(sock_no_connect);
1968EXPORT_SYMBOL(sock_no_getname);
1969EXPORT_SYMBOL(sock_no_getsockopt);
1970EXPORT_SYMBOL(sock_no_ioctl);
1971EXPORT_SYMBOL(sock_no_listen);
1972EXPORT_SYMBOL(sock_no_mmap);
1973EXPORT_SYMBOL(sock_no_poll);
1974EXPORT_SYMBOL(sock_no_recvmsg);
1975EXPORT_SYMBOL(sock_no_sendmsg);
1976EXPORT_SYMBOL(sock_no_sendpage);
1977EXPORT_SYMBOL(sock_no_setsockopt);
1978EXPORT_SYMBOL(sock_no_shutdown);
1979EXPORT_SYMBOL(sock_no_socketpair);
1980EXPORT_SYMBOL(sock_rfree);
1981EXPORT_SYMBOL(sock_setsockopt);
1982EXPORT_SYMBOL(sock_wfree);
1983EXPORT_SYMBOL(sock_wmalloc);
1984EXPORT_SYMBOL(sock_i_uid);
1985EXPORT_SYMBOL(sock_i_ino);
1986EXPORT_SYMBOL(sysctl_optmem_max);
1987#ifdef CONFIG_SYSCTL
1988EXPORT_SYMBOL(sysctl_rmem_max);
1989EXPORT_SYMBOL(sysctl_wmem_max);
1990#endif
1991