sock.c revision 2b85a34e911bf483c27cfdd124aeb1605145dc80
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly,
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
120#include <net/net_namespace.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <linux/net_tstamp.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140/*
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158  "sk_lock-AF_IEEE802154",
159  "sk_lock-AF_MAX"
160};
161static const char *af_family_slock_key_strings[AF_MAX+1] = {
162  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174  "slock-AF_IEEE802154",
175  "slock-AF_MAX"
176};
177static const char *af_family_clock_key_strings[AF_MAX+1] = {
178  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190  "clock-AF_IEEE802154",
191  "clock-AF_MAX"
192};
193
194/*
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
197 */
198static struct lock_class_key af_callback_keys[AF_MAX];
199
200/* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms.  This makes socket queueing behavior and performance
203 * not depend upon such differences.
204 */
205#define _SK_MEM_PACKETS		256
206#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
207#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
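/*
 * Example: if sizeof(struct sk_buff) were 256 bytes, _SK_MEM_OVERHEAD would
 * be 512 bytes and SK_WMEM_MAX/SK_RMEM_MAX would default to
 * 512 * 256 = 131072 bytes (128 KiB) per socket; the exact figure varies
 * with the architecture and configuration, which is why it is computed from
 * sizeof(struct sk_buff) rather than hard-coded.
 */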
209
210/* Run time adjustable parameters. */
211__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216/* Maximal space eaten by iovec or ancillary data plus some space */
217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218EXPORT_SYMBOL(sysctl_optmem_max);
219
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{
222	struct timeval tv;
223
224	if (optlen < sizeof(tv))
225		return -EINVAL;
226	if (copy_from_user(&tv, optval, sizeof(tv)))
227		return -EFAULT;
228	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229		return -EDOM;
230
231	if (tv.tv_sec < 0) {
232		static int warned __read_mostly;
233
234		*timeo_p = 0;
235		if (warned < 10 && net_ratelimit()) {
236			warned++;
237			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238			       "tries to set negative timeout\n",
239				current->comm, task_pid_nr(current));
240		}
241		return 0;
242	}
243	*timeo_p = MAX_SCHEDULE_TIMEOUT;
244	if (tv.tv_sec == 0 && tv.tv_usec == 0)
245		return 0;
246	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248	return 0;
249}
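/*
 * Example: with HZ == 1000, a request of { .tv_sec = 2, .tv_usec = 500000 }
 * is rounded up to 2500 jiffies; { 0, 0 } selects MAX_SCHEDULE_TIMEOUT
 * ("wait forever"), and a negative tv_sec is clamped to a zero timeout
 * with a rate-limited warning.
 */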
250
251static void sock_warn_obsolete_bsdism(const char *name)
252{
253	static int warned;
254	static char warncomm[TASK_COMM_LEN];
255	if (strcmp(warncomm, current->comm) && warned < 5) {
256		strcpy(warncomm,  current->comm);
257		printk(KERN_WARNING "process `%s' is using obsolete "
258		       "%s SO_BSDCOMPAT\n", warncomm, name);
259		warned++;
260	}
261}
262
263static void sock_disable_timestamp(struct sock *sk, int flag)
264{
265	if (sock_flag(sk, flag)) {
266		sock_reset_flag(sk, flag);
267		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269			net_disable_timestamp();
270		}
271	}
272}
273
274
275int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276{
277	int err = 0;
278	int skb_len;
279
280	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
281	   number of warnings when compiling with -W --ANK
282	 */
283	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
284	    (unsigned)sk->sk_rcvbuf) {
285		err = -ENOMEM;
286		goto out;
287	}
288
289	err = sk_filter(sk, skb);
290	if (err)
291		goto out;
292
293	if (!sk_rmem_schedule(sk, skb->truesize)) {
294		err = -ENOBUFS;
295		goto out;
296	}
297
298	skb->dev = NULL;
299	skb_set_owner_r(skb, sk);
300
301	/* Cache the SKB length before we tack it onto the receive
302	 * queue.  Once it is added it no longer belongs to us and
303	 * may be freed by other threads of control pulling packets
304	 * from the queue.
305	 */
306	skb_len = skb->len;
307
308	skb_queue_tail(&sk->sk_receive_queue, skb);
309
310	if (!sock_flag(sk, SOCK_DEAD))
311		sk->sk_data_ready(sk, skb_len);
312out:
313	return err;
314}
315EXPORT_SYMBOL(sock_queue_rcv_skb);
316
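/*
 * sk_receive_skb runs an incoming skb through the socket filter and, with
 * the socket bh-locked, hands it to the protocol's backlog receive handler
 * (or queues it on the socket backlog if the socket is currently owned by
 * a process).  Filtered-out skbs are freed.  One reference on the socket
 * is consumed (sock_put()) before returning.
 */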
317int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
318{
319	int rc = NET_RX_SUCCESS;
320
321	if (sk_filter(sk, skb))
322		goto discard_and_relse;
323
324	skb->dev = NULL;
325
326	if (nested)
327		bh_lock_sock_nested(sk);
328	else
329		bh_lock_sock(sk);
330	if (!sock_owned_by_user(sk)) {
331		/*
332		 * trylock + unlock semantics:
333		 */
334		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
335
336		rc = sk_backlog_rcv(sk, skb);
337
338		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
339	} else
340		sk_add_backlog(sk, skb);
341	bh_unlock_sock(sk);
342out:
343	sock_put(sk);
344	return rc;
345discard_and_relse:
346	kfree_skb(skb);
347	goto out;
348}
349EXPORT_SYMBOL(sk_receive_skb);
350
351struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
352{
353	struct dst_entry *dst = sk->sk_dst_cache;
354
355	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356		sk->sk_dst_cache = NULL;
357		dst_release(dst);
358		return NULL;
359	}
360
361	return dst;
362}
363EXPORT_SYMBOL(__sk_dst_check);
364
365struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
366{
367	struct dst_entry *dst = sk_dst_get(sk);
368
369	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370		sk_dst_reset(sk);
371		dst_release(dst);
372		return NULL;
373	}
374
375	return dst;
376}
377EXPORT_SYMBOL(sk_dst_check);
378
379static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
380{
381	int ret = -ENOPROTOOPT;
382#ifdef CONFIG_NETDEVICES
383	struct net *net = sock_net(sk);
384	char devname[IFNAMSIZ];
385	int index;
386
387	/* Sorry... */
388	ret = -EPERM;
389	if (!capable(CAP_NET_RAW))
390		goto out;
391
392	ret = -EINVAL;
393	if (optlen < 0)
394		goto out;
395
396	/* Bind this socket to a particular device like "eth0",
397	 * as specified in the passed interface name. If the
398	 * name is "" or the option length is zero the socket
399	 * is not bound.
400	 */
401	if (optlen > IFNAMSIZ - 1)
402		optlen = IFNAMSIZ - 1;
403	memset(devname, 0, sizeof(devname));
404
405	ret = -EFAULT;
406	if (copy_from_user(devname, optval, optlen))
407		goto out;
408
409	if (devname[0] == '\0') {
410		index = 0;
411	} else {
412		struct net_device *dev = dev_get_by_name(net, devname);
413
414		ret = -ENODEV;
415		if (!dev)
416			goto out;
417
418		index = dev->ifindex;
419		dev_put(dev);
420	}
421
422	lock_sock(sk);
423	sk->sk_bound_dev_if = index;
424	sk_dst_reset(sk);
425	release_sock(sk);
426
427	ret = 0;
428
429out:
430#endif
431
432	return ret;
433}
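/*
 * For example, a user-space call such as
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5);
 * (which requires CAP_NET_RAW) restricts the socket to eth0, while passing
 * an empty name or a zero option length clears the binding again.
 */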
434
435static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
436{
437	if (valbool)
438		sock_set_flag(sk, bit);
439	else
440		sock_reset_flag(sk, bit);
441}
442
443/*
444 *	This is meant for all protocols to use and covers goings on
445 *	at the socket level. Everything here is generic.
446 */
447
448int sock_setsockopt(struct socket *sock, int level, int optname,
449		    char __user *optval, int optlen)
450{
451	struct sock *sk = sock->sk;
452	int val;
453	int valbool;
454	struct linger ling;
455	int ret = 0;
456
457	/*
458	 *	Options without arguments
459	 */
460
461	if (optname == SO_BINDTODEVICE)
462		return sock_bindtodevice(sk, optval, optlen);
463
464	if (optlen < sizeof(int))
465		return -EINVAL;
466
467	if (get_user(val, (int __user *)optval))
468		return -EFAULT;
469
470	valbool = val ? 1 : 0;
471
472	lock_sock(sk);
473
474	switch (optname) {
475	case SO_DEBUG:
476		if (val && !capable(CAP_NET_ADMIN))
477			ret = -EACCES;
478		else
479			sock_valbool_flag(sk, SOCK_DBG, valbool);
480		break;
481	case SO_REUSEADDR:
482		sk->sk_reuse = valbool;
483		break;
484	case SO_TYPE:
485	case SO_ERROR:
486		ret = -ENOPROTOOPT;
487		break;
488	case SO_DONTROUTE:
489		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
490		break;
491	case SO_BROADCAST:
492		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
493		break;
494	case SO_SNDBUF:
495		/* Don't error on this. BSD doesn't, and if you think
496		   about it this is right. Otherwise apps have to
497		   play 'guess the biggest size' games. RCVBUF/SNDBUF
498		   are treated in BSD as hints */
499
500		if (val > sysctl_wmem_max)
501			val = sysctl_wmem_max;
502set_sndbuf:
503		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
504		if ((val * 2) < SOCK_MIN_SNDBUF)
505			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
506		else
507			sk->sk_sndbuf = val * 2;
508
509		/*
510		 *	Wake up sending tasks if we
511		 *	upped the value.
512		 */
513		sk->sk_write_space(sk);
514		break;
515
516	case SO_SNDBUFFORCE:
517		if (!capable(CAP_NET_ADMIN)) {
518			ret = -EPERM;
519			break;
520		}
521		goto set_sndbuf;
522
523	case SO_RCVBUF:
524		/* Don't error on this. BSD doesn't, and if you think
525		   about it this is right. Otherwise apps have to
526		   play 'guess the biggest size' games. RCVBUF/SNDBUF
527		   are treated in BSD as hints */
528
529		if (val > sysctl_rmem_max)
530			val = sysctl_rmem_max;
531set_rcvbuf:
532		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
533		/*
534		 * We double it on the way in to account for
535		 * "struct sk_buff" etc. overhead.   Applications
536		 * assume that the SO_RCVBUF setting they make will
537		 * allow that much actual data to be received on that
538		 * socket.
539		 *
540		 * Applications are unaware that "struct sk_buff" and
541		 * other overheads allocate from the receive buffer
542		 * during socket buffer allocation.
543		 *
544		 * And after considering the possible alternatives,
545		 * returning the value we actually used in getsockopt
546		 * is the most desirable behavior.
547		 */
548		if ((val * 2) < SOCK_MIN_RCVBUF)
549			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
550		else
551			sk->sk_rcvbuf = val * 2;
552		break;
553
554	case SO_RCVBUFFORCE:
555		if (!capable(CAP_NET_ADMIN)) {
556			ret = -EPERM;
557			break;
558		}
559		goto set_rcvbuf;
560
561	case SO_KEEPALIVE:
562#ifdef CONFIG_INET
563		if (sk->sk_protocol == IPPROTO_TCP)
564			tcp_set_keepalive(sk, valbool);
565#endif
566		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
567		break;
568
569	case SO_OOBINLINE:
570		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
571		break;
572
573	case SO_NO_CHECK:
574		sk->sk_no_check = valbool;
575		break;
576
577	case SO_PRIORITY:
578		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
579			sk->sk_priority = val;
580		else
581			ret = -EPERM;
582		break;
583
584	case SO_LINGER:
585		if (optlen < sizeof(ling)) {
586			ret = -EINVAL;	/* 1003.1g */
587			break;
588		}
589		if (copy_from_user(&ling, optval, sizeof(ling))) {
590			ret = -EFAULT;
591			break;
592		}
593		if (!ling.l_onoff)
594			sock_reset_flag(sk, SOCK_LINGER);
595		else {
596#if (BITS_PER_LONG == 32)
597			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
598				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
599			else
600#endif
601				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
602			sock_set_flag(sk, SOCK_LINGER);
603		}
604		break;
605
606	case SO_BSDCOMPAT:
607		sock_warn_obsolete_bsdism("setsockopt");
608		break;
609
610	case SO_PASSCRED:
611		if (valbool)
612			set_bit(SOCK_PASSCRED, &sock->flags);
613		else
614			clear_bit(SOCK_PASSCRED, &sock->flags);
615		break;
616
617	case SO_TIMESTAMP:
618	case SO_TIMESTAMPNS:
619		if (valbool)  {
620			if (optname == SO_TIMESTAMP)
621				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
622			else
623				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
624			sock_set_flag(sk, SOCK_RCVTSTAMP);
625			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
626		} else {
627			sock_reset_flag(sk, SOCK_RCVTSTAMP);
628			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
629		}
630		break;
631
632	case SO_TIMESTAMPING:
633		if (val & ~SOF_TIMESTAMPING_MASK) {
634			ret = -EINVAL;
635			break;
636		}
637		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
638				  val & SOF_TIMESTAMPING_TX_HARDWARE);
639		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
640				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
641		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
642				  val & SOF_TIMESTAMPING_RX_HARDWARE);
643		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
644			sock_enable_timestamp(sk,
645					      SOCK_TIMESTAMPING_RX_SOFTWARE);
646		else
647			sock_disable_timestamp(sk,
648					       SOCK_TIMESTAMPING_RX_SOFTWARE);
649		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
650				  val & SOF_TIMESTAMPING_SOFTWARE);
651		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
652				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
653		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
654				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
655		break;
656
657	case SO_RCVLOWAT:
658		if (val < 0)
659			val = INT_MAX;
660		sk->sk_rcvlowat = val ? : 1;
661		break;
662
663	case SO_RCVTIMEO:
664		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
665		break;
666
667	case SO_SNDTIMEO:
668		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
669		break;
670
671	case SO_ATTACH_FILTER:
672		ret = -EINVAL;
673		if (optlen == sizeof(struct sock_fprog)) {
674			struct sock_fprog fprog;
675
676			ret = -EFAULT;
677			if (copy_from_user(&fprog, optval, sizeof(fprog)))
678				break;
679
680			ret = sk_attach_filter(&fprog, sk);
681		}
682		break;
683
684	case SO_DETACH_FILTER:
685		ret = sk_detach_filter(sk);
686		break;
687
688	case SO_PASSSEC:
689		if (valbool)
690			set_bit(SOCK_PASSSEC, &sock->flags);
691		else
692			clear_bit(SOCK_PASSSEC, &sock->flags);
693		break;
694	case SO_MARK:
695		if (!capable(CAP_NET_ADMIN))
696			ret = -EPERM;
697		else
698			sk->sk_mark = val;
699		break;
700
701		/* We implement the SO_SNDLOWAT etc to
702		   not be settable (1003.1g 5.3) */
703	default:
704		ret = -ENOPROTOOPT;
705		break;
706	}
707	release_sock(sk);
708	return ret;
709}
710EXPORT_SYMBOL(sock_setsockopt);
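/*
 * For example, a user-space call such as
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * reaches sock_setsockopt() above with optname == SO_RCVBUF and, provided
 * val does not exceed sysctl_rmem_max, stores 131072 in sk->sk_rcvbuf;
 * a later getsockopt(SO_RCVBUF) reports the doubled value, per the comment
 * in the SO_RCVBUF case above.
 */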
711
712
713int sock_getsockopt(struct socket *sock, int level, int optname,
714		    char __user *optval, int __user *optlen)
715{
716	struct sock *sk = sock->sk;
717
718	union {
719		int val;
720		struct linger ling;
721		struct timeval tm;
722	} v;
723
724	unsigned int lv = sizeof(int);
725	int len;
726
727	if (get_user(len, optlen))
728		return -EFAULT;
729	if (len < 0)
730		return -EINVAL;
731
732	memset(&v, 0, sizeof(v));
733
734	switch (optname) {
735	case SO_DEBUG:
736		v.val = sock_flag(sk, SOCK_DBG);
737		break;
738
739	case SO_DONTROUTE:
740		v.val = sock_flag(sk, SOCK_LOCALROUTE);
741		break;
742
743	case SO_BROADCAST:
744		v.val = !!sock_flag(sk, SOCK_BROADCAST);
745		break;
746
747	case SO_SNDBUF:
748		v.val = sk->sk_sndbuf;
749		break;
750
751	case SO_RCVBUF:
752		v.val = sk->sk_rcvbuf;
753		break;
754
755	case SO_REUSEADDR:
756		v.val = sk->sk_reuse;
757		break;
758
759	case SO_KEEPALIVE:
760		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
761		break;
762
763	case SO_TYPE:
764		v.val = sk->sk_type;
765		break;
766
767	case SO_ERROR:
768		v.val = -sock_error(sk);
769		if (v.val == 0)
770			v.val = xchg(&sk->sk_err_soft, 0);
771		break;
772
773	case SO_OOBINLINE:
774		v.val = !!sock_flag(sk, SOCK_URGINLINE);
775		break;
776
777	case SO_NO_CHECK:
778		v.val = sk->sk_no_check;
779		break;
780
781	case SO_PRIORITY:
782		v.val = sk->sk_priority;
783		break;
784
785	case SO_LINGER:
786		lv		= sizeof(v.ling);
787		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
788		v.ling.l_linger	= sk->sk_lingertime / HZ;
789		break;
790
791	case SO_BSDCOMPAT:
792		sock_warn_obsolete_bsdism("getsockopt");
793		break;
794
795	case SO_TIMESTAMP:
796		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
797				!sock_flag(sk, SOCK_RCVTSTAMPNS);
798		break;
799
800	case SO_TIMESTAMPNS:
801		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
802		break;
803
804	case SO_TIMESTAMPING:
805		v.val = 0;
806		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
807			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
808		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
809			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
810		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
811			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
812		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
813			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
814		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
815			v.val |= SOF_TIMESTAMPING_SOFTWARE;
816		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
817			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
818		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
819			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
820		break;
821
822	case SO_RCVTIMEO:
823		lv = sizeof(struct timeval);
824		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
825			v.tm.tv_sec = 0;
826			v.tm.tv_usec = 0;
827		} else {
828			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
829			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
830		}
831		break;
832
833	case SO_SNDTIMEO:
834		lv = sizeof(struct timeval);
835		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
836			v.tm.tv_sec = 0;
837			v.tm.tv_usec = 0;
838		} else {
839			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
840			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
841		}
842		break;
843
844	case SO_RCVLOWAT:
845		v.val = sk->sk_rcvlowat;
846		break;
847
848	case SO_SNDLOWAT:
849		v.val = 1;
850		break;
851
852	case SO_PASSCRED:
853		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
854		break;
855
856	case SO_PEERCRED:
857		if (len > sizeof(sk->sk_peercred))
858			len = sizeof(sk->sk_peercred);
859		if (copy_to_user(optval, &sk->sk_peercred, len))
860			return -EFAULT;
861		goto lenout;
862
863	case SO_PEERNAME:
864	{
865		char address[128];
866
867		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
868			return -ENOTCONN;
869		if (lv < len)
870			return -EINVAL;
871		if (copy_to_user(optval, address, len))
872			return -EFAULT;
873		goto lenout;
874	}
875
876	/* Dubious BSD thing... Probably nobody even uses it, but
877	 * the UNIX standard wants it for whatever reason... -DaveM
878	 */
879	case SO_ACCEPTCONN:
880		v.val = sk->sk_state == TCP_LISTEN;
881		break;
882
883	case SO_PASSSEC:
884		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
885		break;
886
887	case SO_PEERSEC:
888		return security_socket_getpeersec_stream(sock, optval, optlen, len);
889
890	case SO_MARK:
891		v.val = sk->sk_mark;
892		break;
893
894	default:
895		return -ENOPROTOOPT;
896	}
897
898	if (len > lv)
899		len = lv;
900	if (copy_to_user(optval, &v, len))
901		return -EFAULT;
902lenout:
903	if (put_user(len, optlen))
904		return -EFAULT;
905	return 0;
906}
907
908/*
909 * Initialize an sk_lock.
910 *
911 * (We also register the sk_lock with the lock validator.)
912 */
913static inline void sock_lock_init(struct sock *sk)
914{
915	sock_lock_init_class_and_name(sk,
916			af_family_slock_key_strings[sk->sk_family],
917			af_family_slock_keys + sk->sk_family,
918			af_family_key_strings[sk->sk_family],
919			af_family_keys + sk->sk_family);
920}
921
922static void sock_copy(struct sock *nsk, const struct sock *osk)
923{
924#ifdef CONFIG_SECURITY_NETWORK
925	void *sptr = nsk->sk_security;
926#endif
927
928	memcpy(nsk, osk, osk->sk_prot->obj_size);
929#ifdef CONFIG_SECURITY_NETWORK
930	nsk->sk_security = sptr;
931	security_sk_clone(osk, nsk);
932#endif
933}
934
935static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
936		int family)
937{
938	struct sock *sk;
939	struct kmem_cache *slab;
940
941	slab = prot->slab;
942	if (slab != NULL)
943		sk = kmem_cache_alloc(slab, priority);
944	else
945		sk = kmalloc(prot->obj_size, priority);
946
947	if (sk != NULL) {
948		if (security_sk_alloc(sk, family, priority))
949			goto out_free;
950
951		if (!try_module_get(prot->owner))
952			goto out_free_sec;
953	}
954
955	return sk;
956
957out_free_sec:
958	security_sk_free(sk);
959out_free:
960	if (slab != NULL)
961		kmem_cache_free(slab, sk);
962	else
963		kfree(sk);
964	return NULL;
965}
966
967static void sk_prot_free(struct proto *prot, struct sock *sk)
968{
969	struct kmem_cache *slab;
970	struct module *owner;
971
972	owner = prot->owner;
973	slab = prot->slab;
974
975	security_sk_free(sk);
976	if (slab != NULL)
977		kmem_cache_free(slab, sk);
978	else
979		kfree(sk);
980	module_put(owner);
981}
982
983/**
984 *	sk_alloc - All socket objects are allocated here
985 *	@net: the applicable net namespace
986 *	@family: protocol family
987 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
988 *	@prot: struct proto associated with this new sock instance
989 */
990struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
991		      struct proto *prot)
992{
993	struct sock *sk;
994
995	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
996	if (sk) {
997		sk->sk_family = family;
998		/*
999		 * See comment in struct sock definition to understand
1000		 * why we need sk_prot_creator -acme
1001		 */
1002		sk->sk_prot = sk->sk_prot_creator = prot;
1003		sock_lock_init(sk);
1004		sock_net_set(sk, get_net(net));
1005	}
1006
1007	return sk;
1008}
1009EXPORT_SYMBOL(sk_alloc);
1010
1011static void __sk_free(struct sock *sk)
1012{
1013	struct sk_filter *filter;
1014
1015	if (sk->sk_destruct)
1016		sk->sk_destruct(sk);
1017
1018	filter = rcu_dereference(sk->sk_filter);
1019	if (filter) {
1020		sk_filter_uncharge(sk, filter);
1021		rcu_assign_pointer(sk->sk_filter, NULL);
1022	}
1023
1024	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1025	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1026
1027	if (atomic_read(&sk->sk_omem_alloc))
1028		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1029		       __func__, atomic_read(&sk->sk_omem_alloc));
1030
1031	put_net(sock_net(sk));
1032	sk_prot_free(sk->sk_prot_creator, sk);
1033}
1034
1035void sk_free(struct sock *sk)
1036{
1037	/*
1038	 * We subtract one from sk_wmem_alloc so we can tell whether
1039	 * some packets are still in a tx queue.
1040	 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later.
1041	 */
1042	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1043		__sk_free(sk);
1044}
1045EXPORT_SYMBOL(sk_free);
1046
1047/*
1048 * The last sock_put should drop the reference to sk->sk_net. It has already
1049 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1050 * is not an option.
1051 * Take a reference to the socket to remove it from the hash while it is still
1052 * _alive_, and after that destroy it in the context of init_net.
1053 */
1054void sk_release_kernel(struct sock *sk)
1055{
1056	if (sk == NULL || sk->sk_socket == NULL)
1057		return;
1058
1059	sock_hold(sk);
1060	sock_release(sk->sk_socket);
1061	release_net(sock_net(sk));
1062	sock_net_set(sk, get_net(&init_net));
1063	sock_put(sk);
1064}
1065EXPORT_SYMBOL(sk_release_kernel);
1066
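/**
 *	sk_clone - clone a socket, typically for a passively opened connection
 *	@sk: the socket to copy
 *	@priority: GFP mask for the allocation
 *
 *	The clone starts out bh-locked, with empty queues, sk_refcnt set to 2,
 *	sk_wmem_alloc set to 1 and no struct socket attached; the caller is
 *	expected to finish protocol-specific setup before unlocking it.
 */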
1067struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1068{
1069	struct sock *newsk;
1070
1071	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1072	if (newsk != NULL) {
1073		struct sk_filter *filter;
1074
1075		sock_copy(newsk, sk);
1076
1077		/* SANITY */
1078		get_net(sock_net(newsk));
1079		sk_node_init(&newsk->sk_node);
1080		sock_lock_init(newsk);
1081		bh_lock_sock(newsk);
1082		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1083
1084		atomic_set(&newsk->sk_rmem_alloc, 0);
1085		/*
1086		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1087		 */
1088		atomic_set(&newsk->sk_wmem_alloc, 1);
1089		atomic_set(&newsk->sk_omem_alloc, 0);
1090		skb_queue_head_init(&newsk->sk_receive_queue);
1091		skb_queue_head_init(&newsk->sk_write_queue);
1092#ifdef CONFIG_NET_DMA
1093		skb_queue_head_init(&newsk->sk_async_wait_queue);
1094#endif
1095
1096		rwlock_init(&newsk->sk_dst_lock);
1097		rwlock_init(&newsk->sk_callback_lock);
1098		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1099				af_callback_keys + newsk->sk_family,
1100				af_family_clock_key_strings[newsk->sk_family]);
1101
1102		newsk->sk_dst_cache	= NULL;
1103		newsk->sk_wmem_queued	= 0;
1104		newsk->sk_forward_alloc = 0;
1105		newsk->sk_send_head	= NULL;
1106		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1107
1108		sock_reset_flag(newsk, SOCK_DONE);
1109		skb_queue_head_init(&newsk->sk_error_queue);
1110
1111		filter = newsk->sk_filter;
1112		if (filter != NULL)
1113			sk_filter_charge(newsk, filter);
1114
1115		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1116			/* It is still a raw copy of the parent, so invalidate
1117			 * its destructor and do a plain sk_free() */
1118			newsk->sk_destruct = NULL;
1119			sk_free(newsk);
1120			newsk = NULL;
1121			goto out;
1122		}
1123
1124		newsk->sk_err	   = 0;
1125		newsk->sk_priority = 0;
1126		atomic_set(&newsk->sk_refcnt, 2);
1127
1128		/*
1129		 * Increment the counter in the same struct proto as the master
1130		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1131		 * is the same as sk->sk_prot->socks, as this field was copied
1132		 * with memcpy).
1133		 *
1134		 * This _changes_ the previous behaviour, where
1135		 * tcp_create_openreq_child always incremented the
1136		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1137		 * to be taken into account in all callers. -acme
1138		 */
1139		sk_refcnt_debug_inc(newsk);
1140		sk_set_socket(newsk, NULL);
1141		newsk->sk_sleep	 = NULL;
1142
1143		if (newsk->sk_prot->sockets_allocated)
1144			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1145	}
1146out:
1147	return newsk;
1148}
1149EXPORT_SYMBOL_GPL(sk_clone);
1150
1151void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1152{
1153	__sk_dst_set(sk, dst);
1154	sk->sk_route_caps = dst->dev->features;
1155	if (sk->sk_route_caps & NETIF_F_GSO)
1156		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1157	if (sk_can_gso(sk)) {
1158		if (dst->header_len) {
1159			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1160		} else {
1161			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1162			sk->sk_gso_max_size = dst->dev->gso_max_size;
1163		}
1164	}
1165}
1166EXPORT_SYMBOL_GPL(sk_setup_caps);
1167
1168void __init sk_init(void)
1169{
1170	if (num_physpages <= 4096) {
1171		sysctl_wmem_max = 32767;
1172		sysctl_rmem_max = 32767;
1173		sysctl_wmem_default = 32767;
1174		sysctl_rmem_default = 32767;
1175	} else if (num_physpages >= 131072) {
1176		sysctl_wmem_max = 131071;
1177		sysctl_rmem_max = 131071;
1178	}
1179}
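/*
 * With 4 KiB pages the thresholds above correspond to machines with at most
 * about 16 MiB of memory (all four defaults shrink to 32767 bytes) and with
 * at least about 512 MiB (only the two maxima grow to 131071 bytes);
 * anything in between keeps the compile-time SK_WMEM_MAX/SK_RMEM_MAX
 * defaults.
 */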
1180
1181/*
1182 *	Simple resource managers for sockets.
1183 */
1184
1185
1186/*
1187 * Write buffer destructor automatically called from kfree_skb.
1188 */
1189void sock_wfree(struct sk_buff *skb)
1190{
1191	struct sock *sk = skb->sk;
1192	int res;
1193
1194	/* In case it might be waiting for more memory. */
1195	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1196	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1197		sk->sk_write_space(sk);
1198	/*
1199	 * if sk_wmem_alloc reached 0, we are last user and should
1200	 * free this sock, as sk_free() call could not do it.
1201	 */
1202	if (res == 0)
1203		__sk_free(sk);
1204}
1205EXPORT_SYMBOL(sock_wfree);
1206
1207/*
1208 * Read buffer destructor automatically called from kfree_skb.
1209 */
1210void sock_rfree(struct sk_buff *skb)
1211{
1212	struct sock *sk = skb->sk;
1213
1214	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1215	sk_mem_uncharge(skb->sk, skb->truesize);
1216}
1217EXPORT_SYMBOL(sock_rfree);
1218
1219
1220int sock_i_uid(struct sock *sk)
1221{
1222	int uid;
1223
1224	read_lock(&sk->sk_callback_lock);
1225	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1226	read_unlock(&sk->sk_callback_lock);
1227	return uid;
1228}
1229EXPORT_SYMBOL(sock_i_uid);
1230
1231unsigned long sock_i_ino(struct sock *sk)
1232{
1233	unsigned long ino;
1234
1235	read_lock(&sk->sk_callback_lock);
1236	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1237	read_unlock(&sk->sk_callback_lock);
1238	return ino;
1239}
1240EXPORT_SYMBOL(sock_i_ino);
1241
1242/*
1243 * Allocate a skb from the socket's send buffer.
1244 */
1245struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1246			     gfp_t priority)
1247{
1248	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1249		struct sk_buff *skb = alloc_skb(size, priority);
1250		if (skb) {
1251			skb_set_owner_w(skb, sk);
1252			return skb;
1253		}
1254	}
1255	return NULL;
1256}
1257EXPORT_SYMBOL(sock_wmalloc);
1258
1259/*
1260 * Allocate a skb from the socket's receive buffer.
1261 */
1262struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1263			     gfp_t priority)
1264{
1265	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1266		struct sk_buff *skb = alloc_skb(size, priority);
1267		if (skb) {
1268			skb_set_owner_r(skb, sk);
1269			return skb;
1270		}
1271	}
1272	return NULL;
1273}
1274
1275/*
1276 * Allocate a memory block from the socket's option memory buffer.
1277 */
1278void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1279{
1280	if ((unsigned)size <= sysctl_optmem_max &&
1281	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1282		void *mem;
1283		/* First do the add, to avoid the race if kmalloc
1284		 * might sleep.
1285		 */
1286		atomic_add(size, &sk->sk_omem_alloc);
1287		mem = kmalloc(size, priority);
1288		if (mem)
1289			return mem;
1290		atomic_sub(size, &sk->sk_omem_alloc);
1291	}
1292	return NULL;
1293}
1294EXPORT_SYMBOL(sock_kmalloc);
1295
1296/*
1297 * Free an option memory block.
1298 */
1299void sock_kfree_s(struct sock *sk, void *mem, int size)
1300{
1301	kfree(mem);
1302	atomic_sub(size, &sk->sk_omem_alloc);
1303}
1304EXPORT_SYMBOL(sock_kfree_s);
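/*
 * Callers are expected to pass the same size to sock_kfree_s() that they
 * passed to sock_kmalloc(), e.g.
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * so that sk_omem_alloc is charged and uncharged by matching amounts.
 */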
1305
1306/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1307   I think these locks should be removed for datagram sockets.
1308 */
1309static long sock_wait_for_wmem(struct sock *sk, long timeo)
1310{
1311	DEFINE_WAIT(wait);
1312
1313	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1314	for (;;) {
1315		if (!timeo)
1316			break;
1317		if (signal_pending(current))
1318			break;
1319		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1320		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1321		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1322			break;
1323		if (sk->sk_shutdown & SEND_SHUTDOWN)
1324			break;
1325		if (sk->sk_err)
1326			break;
1327		timeo = schedule_timeout(timeo);
1328	}
1329	finish_wait(sk->sk_sleep, &wait);
1330	return timeo;
1331}
1332
1333
1334/*
1335 *	Generic send/receive buffer handlers
1336 */
1337
1338struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1339				     unsigned long data_len, int noblock,
1340				     int *errcode)
1341{
1342	struct sk_buff *skb;
1343	gfp_t gfp_mask;
1344	long timeo;
1345	int err;
1346
1347	gfp_mask = sk->sk_allocation;
1348	if (gfp_mask & __GFP_WAIT)
1349		gfp_mask |= __GFP_REPEAT;
1350
1351	timeo = sock_sndtimeo(sk, noblock);
1352	while (1) {
1353		err = sock_error(sk);
1354		if (err != 0)
1355			goto failure;
1356
1357		err = -EPIPE;
1358		if (sk->sk_shutdown & SEND_SHUTDOWN)
1359			goto failure;
1360
1361		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1362			skb = alloc_skb(header_len, gfp_mask);
1363			if (skb) {
1364				int npages;
1365				int i;
1366
1367				/* No pages, we're done... */
1368				if (!data_len)
1369					break;
1370
1371				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1372				skb->truesize += data_len;
1373				skb_shinfo(skb)->nr_frags = npages;
1374				for (i = 0; i < npages; i++) {
1375					struct page *page;
1376					skb_frag_t *frag;
1377
1378					page = alloc_pages(sk->sk_allocation, 0);
1379					if (!page) {
1380						err = -ENOBUFS;
1381						skb_shinfo(skb)->nr_frags = i;
1382						kfree_skb(skb);
1383						goto failure;
1384					}
1385
1386					frag = &skb_shinfo(skb)->frags[i];
1387					frag->page = page;
1388					frag->page_offset = 0;
1389					frag->size = (data_len >= PAGE_SIZE ?
1390						      PAGE_SIZE :
1391						      data_len);
1392					data_len -= PAGE_SIZE;
1393				}
1394
1395				/* Full success... */
1396				break;
1397			}
1398			err = -ENOBUFS;
1399			goto failure;
1400		}
1401		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1402		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1403		err = -EAGAIN;
1404		if (!timeo)
1405			goto failure;
1406		if (signal_pending(current))
1407			goto interrupted;
1408		timeo = sock_wait_for_wmem(sk, timeo);
1409	}
1410
1411	skb_set_owner_w(skb, sk);
1412	return skb;
1413
1414interrupted:
1415	err = sock_intr_errno(timeo);
1416failure:
1417	*errcode = err;
1418	return NULL;
1419}
1420EXPORT_SYMBOL(sock_alloc_send_pskb);
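/*
 * Example: assuming PAGE_SIZE == 4096, a call with header_len == 128 and
 * data_len == 5000 allocates a 128-byte linear area plus two page frags of
 * 4096 and 904 bytes; with data_len == 0 only the linear part is allocated,
 * which is what sock_alloc_send_skb() below relies on.
 */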
1421
1422struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1423				    int noblock, int *errcode)
1424{
1425	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1426}
1427EXPORT_SYMBOL(sock_alloc_send_skb);
1428
1429static void __lock_sock(struct sock *sk)
1430{
1431	DEFINE_WAIT(wait);
1432
1433	for (;;) {
1434		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1435					TASK_UNINTERRUPTIBLE);
1436		spin_unlock_bh(&sk->sk_lock.slock);
1437		schedule();
1438		spin_lock_bh(&sk->sk_lock.slock);
1439		if (!sock_owned_by_user(sk))
1440			break;
1441	}
1442	finish_wait(&sk->sk_lock.wq, &wait);
1443}
1444
1445static void __release_sock(struct sock *sk)
1446{
1447	struct sk_buff *skb = sk->sk_backlog.head;
1448
1449	do {
1450		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1451		bh_unlock_sock(sk);
1452
1453		do {
1454			struct sk_buff *next = skb->next;
1455
1456			skb->next = NULL;
1457			sk_backlog_rcv(sk, skb);
1458
1459			/*
1460			 * We are in process context here with softirqs
1461			 * disabled; use cond_resched_softirq() to preempt.
1462			 * This is safe to do because we've taken the backlog
1463			 * queue private:
1464			 */
1465			cond_resched_softirq();
1466
1467			skb = next;
1468		} while (skb != NULL);
1469
1470		bh_lock_sock(sk);
1471	} while ((skb = sk->sk_backlog.head) != NULL);
1472}
1473
1474/**
1475 * sk_wait_data - wait for data to arrive at sk_receive_queue
1476 * @sk:    sock to wait on
1477 * @timeo: for how long
1478 *
1479 * Now socket state including sk->sk_err is changed only under lock,
1480 * hence we may omit checks after joining wait queue.
1481 * We check receive queue before schedule() only as optimization;
1482 * it is very likely that release_sock() added new data.
1483 */
1484int sk_wait_data(struct sock *sk, long *timeo)
1485{
1486	int rc;
1487	DEFINE_WAIT(wait);
1488
1489	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1490	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1491	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1492	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1493	finish_wait(sk->sk_sleep, &wait);
1494	return rc;
1495}
1496EXPORT_SYMBOL(sk_wait_data);
1497
1498/**
1499 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1500 *	@sk: socket
1501 *	@size: memory size to allocate
1502 *	@kind: allocation type
1503 *
1504 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1505 *	rmem allocation. This function assumes that protocols which have
1506 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1507 */
1508int __sk_mem_schedule(struct sock *sk, int size, int kind)
1509{
1510	struct proto *prot = sk->sk_prot;
1511	int amt = sk_mem_pages(size);
1512	int allocated;
1513
1514	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1515	allocated = atomic_add_return(amt, prot->memory_allocated);
1516
1517	/* Under limit. */
1518	if (allocated <= prot->sysctl_mem[0]) {
1519		if (prot->memory_pressure && *prot->memory_pressure)
1520			*prot->memory_pressure = 0;
1521		return 1;
1522	}
1523
1524	/* Under pressure. */
1525	if (allocated > prot->sysctl_mem[1])
1526		if (prot->enter_memory_pressure)
1527			prot->enter_memory_pressure(sk);
1528
1529	/* Over hard limit. */
1530	if (allocated > prot->sysctl_mem[2])
1531		goto suppress_allocation;
1532
1533	/* guarantee minimum buffer size under pressure */
1534	if (kind == SK_MEM_RECV) {
1535		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1536			return 1;
1537	} else { /* SK_MEM_SEND */
1538		if (sk->sk_type == SOCK_STREAM) {
1539			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1540				return 1;
1541		} else if (atomic_read(&sk->sk_wmem_alloc) <
1542			   prot->sysctl_wmem[0])
1543				return 1;
1544	}
1545
1546	if (prot->memory_pressure) {
1547		int alloc;
1548
1549		if (!*prot->memory_pressure)
1550			return 1;
1551		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1552		if (prot->sysctl_mem[2] > alloc *
1553		    sk_mem_pages(sk->sk_wmem_queued +
1554				 atomic_read(&sk->sk_rmem_alloc) +
1555				 sk->sk_forward_alloc))
1556			return 1;
1557	}
1558
1559suppress_allocation:
1560
1561	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1562		sk_stream_moderate_sndbuf(sk);
1563
1564		/* Fail only if socket is _under_ its sndbuf.
1565		 * In this case we cannot block, so we have to fail.
1566		 */
1567		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1568			return 1;
1569	}
1570
1571	/* Alas. Undo changes. */
1572	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1573	atomic_sub(amt, prot->memory_allocated);
1574	return 0;
1575}
1576EXPORT_SYMBOL(__sk_mem_schedule);
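/*
 * Example: assuming SK_MEM_QUANTUM equals PAGE_SIZE (4096), scheduling
 * 3000 bytes charges one quantum: sk_forward_alloc grows by 4096 and
 * prot->memory_allocated by one unit; the surplus stays in sk_forward_alloc
 * for later allocations until __sk_mem_reclaim() below gives it back.
 */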
1577
1578/**
1579 *	__sk_mem_reclaim - reclaim memory_allocated
1580 *	@sk: socket
1581 */
1582void __sk_mem_reclaim(struct sock *sk)
1583{
1584	struct proto *prot = sk->sk_prot;
1585
1586	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1587		   prot->memory_allocated);
1588	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1589
1590	if (prot->memory_pressure && *prot->memory_pressure &&
1591	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1592		*prot->memory_pressure = 0;
1593}
1594EXPORT_SYMBOL(__sk_mem_reclaim);
1595
1596
1597/*
1598 * Set of default routines for initialising struct proto_ops when
1599 * the protocol does not support a particular function. In certain
1600 * cases where it makes no sense for a protocol to have a "do nothing"
1601 * function, some default processing is provided.
1602 */
1603
1604int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1605{
1606	return -EOPNOTSUPP;
1607}
1608EXPORT_SYMBOL(sock_no_bind);
1609
1610int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1611		    int len, int flags)
1612{
1613	return -EOPNOTSUPP;
1614}
1615EXPORT_SYMBOL(sock_no_connect);
1616
1617int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1618{
1619	return -EOPNOTSUPP;
1620}
1621EXPORT_SYMBOL(sock_no_socketpair);
1622
1623int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1624{
1625	return -EOPNOTSUPP;
1626}
1627EXPORT_SYMBOL(sock_no_accept);
1628
1629int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1630		    int *len, int peer)
1631{
1632	return -EOPNOTSUPP;
1633}
1634EXPORT_SYMBOL(sock_no_getname);
1635
1636unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1637{
1638	return 0;
1639}
1640EXPORT_SYMBOL(sock_no_poll);
1641
1642int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1643{
1644	return -EOPNOTSUPP;
1645}
1646EXPORT_SYMBOL(sock_no_ioctl);
1647
1648int sock_no_listen(struct socket *sock, int backlog)
1649{
1650	return -EOPNOTSUPP;
1651}
1652EXPORT_SYMBOL(sock_no_listen);
1653
1654int sock_no_shutdown(struct socket *sock, int how)
1655{
1656	return -EOPNOTSUPP;
1657}
1658EXPORT_SYMBOL(sock_no_shutdown);
1659
1660int sock_no_setsockopt(struct socket *sock, int level, int optname,
1661		    char __user *optval, int optlen)
1662{
1663	return -EOPNOTSUPP;
1664}
1665EXPORT_SYMBOL(sock_no_setsockopt);
1666
1667int sock_no_getsockopt(struct socket *sock, int level, int optname,
1668		    char __user *optval, int __user *optlen)
1669{
1670	return -EOPNOTSUPP;
1671}
1672EXPORT_SYMBOL(sock_no_getsockopt);
1673
1674int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1675		    size_t len)
1676{
1677	return -EOPNOTSUPP;
1678}
1679EXPORT_SYMBOL(sock_no_sendmsg);
1680
1681int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1682		    size_t len, int flags)
1683{
1684	return -EOPNOTSUPP;
1685}
1686EXPORT_SYMBOL(sock_no_recvmsg);
1687
1688int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1689{
1690	/* Mirror missing mmap method error code */
1691	return -ENODEV;
1692}
1693EXPORT_SYMBOL(sock_no_mmap);
1694
1695ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1696{
1697	ssize_t res;
1698	struct msghdr msg = {.msg_flags = flags};
1699	struct kvec iov;
1700	char *kaddr = kmap(page);
1701	iov.iov_base = kaddr + offset;
1702	iov.iov_len = size;
1703	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1704	kunmap(page);
1705	return res;
1706}
1707EXPORT_SYMBOL(sock_no_sendpage);
1708
1709/*
1710 *	Default Socket Callbacks
1711 */
1712
1713static void sock_def_wakeup(struct sock *sk)
1714{
1715	read_lock(&sk->sk_callback_lock);
1716	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1717		wake_up_interruptible_all(sk->sk_sleep);
1718	read_unlock(&sk->sk_callback_lock);
1719}
1720
1721static void sock_def_error_report(struct sock *sk)
1722{
1723	read_lock(&sk->sk_callback_lock);
1724	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1725		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1726	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1727	read_unlock(&sk->sk_callback_lock);
1728}
1729
1730static void sock_def_readable(struct sock *sk, int len)
1731{
1732	read_lock(&sk->sk_callback_lock);
1733	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1734		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1735						POLLRDNORM | POLLRDBAND);
1736	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1737	read_unlock(&sk->sk_callback_lock);
1738}
1739
1740static void sock_def_write_space(struct sock *sk)
1741{
1742	read_lock(&sk->sk_callback_lock);
1743
1744	/* Do not wake up a writer until he can make "significant"
1745	 * progress.  --DaveM
1746	 */
1747	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1748		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1749			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1750						POLLWRNORM | POLLWRBAND);
1751
1752		/* Should agree with poll, otherwise some programs break */
1753		if (sock_writeable(sk))
1754			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1755	}
1756
1757	read_unlock(&sk->sk_callback_lock);
1758}
1759
1760static void sock_def_destruct(struct sock *sk)
1761{
1762	kfree(sk->sk_protinfo);
1763}
1764
1765void sk_send_sigurg(struct sock *sk)
1766{
1767	if (sk->sk_socket && sk->sk_socket->file)
1768		if (send_sigurg(&sk->sk_socket->file->f_owner))
1769			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1770}
1771EXPORT_SYMBOL(sk_send_sigurg);
1772
1773void sk_reset_timer(struct sock *sk, struct timer_list* timer,
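/*
 * mod_timer() returns 0 when the timer was not already pending, so
 * sk_reset_timer() takes a socket reference only when it newly arms the
 * timer; sk_stop_timer() drops that reference only if it actually deleted
 * a pending timer.
 */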
1774		    unsigned long expires)
1775{
1776	if (!mod_timer(timer, expires))
1777		sock_hold(sk);
1778}
1779EXPORT_SYMBOL(sk_reset_timer);
1780
1781void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1782{
1783	if (timer_pending(timer) && del_timer(timer))
1784		__sock_put(sk);
1785}
1786EXPORT_SYMBOL(sk_stop_timer);
1787
1788void sock_init_data(struct socket *sock, struct sock *sk)
1789{
1790	skb_queue_head_init(&sk->sk_receive_queue);
1791	skb_queue_head_init(&sk->sk_write_queue);
1792	skb_queue_head_init(&sk->sk_error_queue);
1793#ifdef CONFIG_NET_DMA
1794	skb_queue_head_init(&sk->sk_async_wait_queue);
1795#endif
1796
1797	sk->sk_send_head	=	NULL;
1798
1799	init_timer(&sk->sk_timer);
1800
1801	sk->sk_allocation	=	GFP_KERNEL;
1802	sk->sk_rcvbuf		=	sysctl_rmem_default;
1803	sk->sk_sndbuf		=	sysctl_wmem_default;
1804	sk->sk_state		=	TCP_CLOSE;
1805	sk_set_socket(sk, sock);
1806
1807	sock_set_flag(sk, SOCK_ZAPPED);
1808
1809	if (sock) {
1810		sk->sk_type	=	sock->type;
1811		sk->sk_sleep	=	&sock->wait;
1812		sock->sk	=	sk;
1813	} else
1814		sk->sk_sleep	=	NULL;
1815
1816	rwlock_init(&sk->sk_dst_lock);
1817	rwlock_init(&sk->sk_callback_lock);
1818	lockdep_set_class_and_name(&sk->sk_callback_lock,
1819			af_callback_keys + sk->sk_family,
1820			af_family_clock_key_strings[sk->sk_family]);
1821
1822	sk->sk_state_change	=	sock_def_wakeup;
1823	sk->sk_data_ready	=	sock_def_readable;
1824	sk->sk_write_space	=	sock_def_write_space;
1825	sk->sk_error_report	=	sock_def_error_report;
1826	sk->sk_destruct		=	sock_def_destruct;
1827
1828	sk->sk_sndmsg_page	=	NULL;
1829	sk->sk_sndmsg_off	=	0;
1830
1831	sk->sk_peercred.pid 	=	0;
1832	sk->sk_peercred.uid	=	-1;
1833	sk->sk_peercred.gid	=	-1;
1834	sk->sk_write_pending	=	0;
1835	sk->sk_rcvlowat		=	1;
1836	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1837	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1838
1839	sk->sk_stamp = ktime_set(-1L, 0);
1840
1841	atomic_set(&sk->sk_refcnt, 1);
1842	atomic_set(&sk->sk_wmem_alloc, 1);
1843	atomic_set(&sk->sk_drops, 0);
1844}
1845EXPORT_SYMBOL(sock_init_data);
1846
1847void lock_sock_nested(struct sock *sk, int subclass)
1848{
1849	might_sleep();
1850	spin_lock_bh(&sk->sk_lock.slock);
1851	if (sk->sk_lock.owned)
1852		__lock_sock(sk);
1853	sk->sk_lock.owned = 1;
1854	spin_unlock(&sk->sk_lock.slock);
1855	/*
1856	 * The sk_lock has mutex_lock() semantics here:
1857	 */
1858	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1859	local_bh_enable();
1860}
1861EXPORT_SYMBOL(lock_sock_nested);
1862
1863void release_sock(struct sock *sk)
1864{
1865	/*
1866	 * The sk_lock has mutex_unlock() semantics:
1867	 */
1868	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1869
1870	spin_lock_bh(&sk->sk_lock.slock);
1871	if (sk->sk_backlog.tail)
1872		__release_sock(sk);
1873	sk->sk_lock.owned = 0;
1874	if (waitqueue_active(&sk->sk_lock.wq))
1875		wake_up(&sk->sk_lock.wq);
1876	spin_unlock_bh(&sk->sk_lock.slock);
1877}
1878EXPORT_SYMBOL(release_sock);
1879
1880int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1881{
1882	struct timeval tv;
1883	if (!sock_flag(sk, SOCK_TIMESTAMP))
1884		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1885	tv = ktime_to_timeval(sk->sk_stamp);
1886	if (tv.tv_sec == -1)
1887		return -ENOENT;
1888	if (tv.tv_sec == 0) {
1889		sk->sk_stamp = ktime_get_real();
1890		tv = ktime_to_timeval(sk->sk_stamp);
1891	}
1892	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1893}
1894EXPORT_SYMBOL(sock_get_timestamp);
1895
1896int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1897{
1898	struct timespec ts;
1899	if (!sock_flag(sk, SOCK_TIMESTAMP))
1900		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1901	ts = ktime_to_timespec(sk->sk_stamp);
1902	if (ts.tv_sec == -1)
1903		return -ENOENT;
1904	if (ts.tv_sec == 0) {
1905		sk->sk_stamp = ktime_get_real();
1906		ts = ktime_to_timespec(sk->sk_stamp);
1907	}
1908	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1909}
1910EXPORT_SYMBOL(sock_get_timestampns);
1911
1912void sock_enable_timestamp(struct sock *sk, int flag)
1913{
1914	if (!sock_flag(sk, flag)) {
1915		sock_set_flag(sk, flag);
1916		/*
1917		 * we just set one of the two flags which require net
1918		 * time stamping, but time stamping might have been on
1919		 * already because of the other one
1920		 */
1921		if (!sock_flag(sk,
1922				flag == SOCK_TIMESTAMP ?
1923				SOCK_TIMESTAMPING_RX_SOFTWARE :
1924				SOCK_TIMESTAMP))
1925			net_enable_timestamp();
1926	}
1927}
1928
1929/*
1930 *	Get a socket option on a socket.
1931 *
1932 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1933 *	asynchronous errors should be reported by getsockopt. We assume
1934 *	this means if you specify SO_ERROR (otherwise what's the point of it).
1935 */
1936int sock_common_getsockopt(struct socket *sock, int level, int optname,
1937			   char __user *optval, int __user *optlen)
1938{
1939	struct sock *sk = sock->sk;
1940
1941	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1942}
1943EXPORT_SYMBOL(sock_common_getsockopt);
1944
1945#ifdef CONFIG_COMPAT
1946int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1947				  char __user *optval, int __user *optlen)
1948{
1949	struct sock *sk = sock->sk;
1950
1951	if (sk->sk_prot->compat_getsockopt != NULL)
1952		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1953						      optval, optlen);
1954	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1955}
1956EXPORT_SYMBOL(compat_sock_common_getsockopt);
1957#endif
1958
1959int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1960			struct msghdr *msg, size_t size, int flags)
1961{
1962	struct sock *sk = sock->sk;
1963	int addr_len = 0;
1964	int err;
1965
1966	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1967				   flags & ~MSG_DONTWAIT, &addr_len);
1968	if (err >= 0)
1969		msg->msg_namelen = addr_len;
1970	return err;
1971}
1972EXPORT_SYMBOL(sock_common_recvmsg);
1973
1974/*
1975 *	Set socket options on a socket.
1976 */
1977int sock_common_setsockopt(struct socket *sock, int level, int optname,
1978			   char __user *optval, int optlen)
1979{
1980	struct sock *sk = sock->sk;
1981
1982	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1983}
1984EXPORT_SYMBOL(sock_common_setsockopt);
1985
1986#ifdef CONFIG_COMPAT
1987int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1988				  char __user *optval, int optlen)
1989{
1990	struct sock *sk = sock->sk;
1991
1992	if (sk->sk_prot->compat_setsockopt != NULL)
1993		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1994						      optval, optlen);
1995	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1996}
1997EXPORT_SYMBOL(compat_sock_common_setsockopt);
1998#endif
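
/*
 * The sock_common_* helpers above are thin wrappers around the struct proto
 * callbacks, so an address family that needs no extra per-call work can plug
 * them straight into its struct proto_ops.  A sketch (hypothetical ops
 * table, field subset only):
 *
 *	static const struct proto_ops my_stream_ops = {
 *		...
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 */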
1999
2000void sk_common_release(struct sock *sk)
2001{
2002	if (sk->sk_prot->destroy)
2003		sk->sk_prot->destroy(sk);
2004
2005	/*
2006	 * Observation: when sk_common_release() is called, processes no
2007	 * longer have access to the socket, but the network stack still does.
2008	 * Step one: detach it from networking:
2009	 *
2010	 * A. Remove from hash tables.
2011	 */
2012
2013	sk->sk_prot->unhash(sk);
2014
2015	/*
2016	 * At this point the socket cannot receive new packets, but some may
2017	 * still be in flight, because another CPU ran the receive path and did
2018	 * its hash table lookup before we unhashed the socket. Those packets
2019	 * will reach the receive queue and be purged by the socket destructor.
2020	 *
2021	 * We also still have packets pending on the receive queue and, probably,
2022	 * our own packets waiting in device queues. The socket destructor will
2023	 * drain the receive queue, but transmitted packets will delay socket
2024	 * destruction until the last reference is released.
2025	 */
2026
2027	sock_orphan(sk);
2028
2029	xfrm_sk_free_policy(sk);
2030
2031	sk_refcnt_debug_release(sk);
2032	sock_put(sk);
2033}
2034EXPORT_SYMBOL(sk_common_release);
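
/*
 * A protocol whose sockets need no teardown beyond the generic steps above
 * can use sk_common_release() directly as the body of its ->close handler.
 * Sketch (hypothetical protocol):
 *
 *	static void my_proto_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */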
2035
2036static DEFINE_RWLOCK(proto_list_lock);
2037static LIST_HEAD(proto_list);
2038
2039#ifdef CONFIG_PROC_FS
2040#define PROTO_INUSE_NR	64	/* should be enough for now */
2041struct prot_inuse {
2042	int val[PROTO_INUSE_NR];
2043};
2044
2045static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2046
2047#ifdef CONFIG_NET_NS
2048void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2049{
2050	int cpu = smp_processor_id();
2051	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2052}
2053EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2054
2055int sock_prot_inuse_get(struct net *net, struct proto *prot)
2056{
2057	int cpu, idx = prot->inuse_idx;
2058	int res = 0;
2059
2060	for_each_possible_cpu(cpu)
2061		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2062
2063	return res >= 0 ? res : 0;
2064}
2065EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2066
2067static int sock_inuse_init_net(struct net *net)
2068{
2069	net->core.inuse = alloc_percpu(struct prot_inuse);
2070	return net->core.inuse ? 0 : -ENOMEM;
2071}
2072
2073static void sock_inuse_exit_net(struct net *net)
2074{
2075	free_percpu(net->core.inuse);
2076}
2077
2078static struct pernet_operations net_inuse_ops = {
2079	.init = sock_inuse_init_net,
2080	.exit = sock_inuse_exit_net,
2081};
2082
2083static __init int net_inuse_init(void)
2084{
2085	if (register_pernet_subsys(&net_inuse_ops))
2086		panic("Cannot initialize net inuse counters");
2087
2088	return 0;
2089}
2090
2091core_initcall(net_inuse_init);
2092#else
2093static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2094
2095void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2096{
2097	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2098}
2099EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2100
2101int sock_prot_inuse_get(struct net *net, struct proto *prot)
2102{
2103	int cpu, idx = prot->inuse_idx;
2104	int res = 0;
2105
2106	for_each_possible_cpu(cpu)
2107		res += per_cpu(prot_inuse, cpu).val[idx];
2108
2109	return res >= 0 ? res : 0;
2110}
2111EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2112#endif
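
/*
 * Protocols feed these counters from their hash/unhash paths: +1 when a
 * socket enters the protocol's lookup tables, -1 when it leaves, roughly:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *
 * The increments and decrements may land on different CPUs, so an
 * individual per-cpu counter can go negative; sock_prot_inuse_get()
 * therefore clamps the summed result at zero.
 */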
2113
2114static void assign_proto_idx(struct proto *prot)
2115{
2116	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2117
2118	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2119		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2120		return;
2121	}
2122
2123	set_bit(prot->inuse_idx, proto_inuse_idx);
2124}
2125
2126static void release_proto_idx(struct proto *prot)
2127{
2128	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2129		clear_bit(prot->inuse_idx, proto_inuse_idx);
2130}
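
/*
 * Note on the index scheme: slot PROTO_INUSE_NR - 1 is never claimed in
 * proto_inuse_idx (assign_proto_idx() returns before set_bit() when it
 * lands there), so any protocols registered after the bitmap fills up
 * effectively share that last counter instead of clobbering a registered
 * protocol's slot, and release_proto_idx() deliberately leaves it alone.
 */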
2131#else
2132static inline void assign_proto_idx(struct proto *prot)
2133{
2134}
2135
2136static inline void release_proto_idx(struct proto *prot)
2137{
2138}
2139#endif
2140
2141int proto_register(struct proto *prot, int alloc_slab)
2142{
2143	if (alloc_slab) {
2144		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2145					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2146					NULL);
2147
2148		if (prot->slab == NULL) {
2149			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2150			       prot->name);
2151			goto out;
2152		}
2153
2154		if (prot->rsk_prot != NULL) {
2155			static const char mask[] = "request_sock_%s";
2156
2157			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2158			if (prot->rsk_prot->slab_name == NULL)
2159				goto out_free_sock_slab;
2160
2161			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2162			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2163								 prot->rsk_prot->obj_size, 0,
2164								 SLAB_HWCACHE_ALIGN, NULL);
2165
2166			if (prot->rsk_prot->slab == NULL) {
2167				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2168				       prot->name);
2169				goto out_free_request_sock_slab_name;
2170			}
2171		}
2172
2173		if (prot->twsk_prot != NULL) {
2174			static const char mask[] = "tw_sock_%s";
2175
2176			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2177
2178			if (prot->twsk_prot->twsk_slab_name == NULL)
2179				goto out_free_request_sock_slab;
2180
2181			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2182			prot->twsk_prot->twsk_slab =
2183				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2184						  prot->twsk_prot->twsk_obj_size,
2185						  0,
2186						  SLAB_HWCACHE_ALIGN |
2187							prot->slab_flags,
2188						  NULL);
2189			if (prot->twsk_prot->twsk_slab == NULL)
2190				goto out_free_timewait_sock_slab_name;
2191		}
2192	}
2193
2194	write_lock(&proto_list_lock);
2195	list_add(&prot->node, &proto_list);
2196	assign_proto_idx(prot);
2197	write_unlock(&proto_list_lock);
2198	return 0;
2199
2200out_free_timewait_sock_slab_name:
2201	kfree(prot->twsk_prot->twsk_slab_name);
2202out_free_request_sock_slab:
2203	if (prot->rsk_prot && prot->rsk_prot->slab) {
2204		kmem_cache_destroy(prot->rsk_prot->slab);
2205		prot->rsk_prot->slab = NULL;
2206	}
2207out_free_request_sock_slab_name:
2208	kfree(prot->rsk_prot->slab_name);
2209out_free_sock_slab:
2210	kmem_cache_destroy(prot->slab);
2211	prot->slab = NULL;
2212out:
2213	return -ENOBUFS;
2214}
2215EXPORT_SYMBOL(proto_register);
2216
2217void proto_unregister(struct proto *prot)
2218{
2219	write_lock(&proto_list_lock);
2220	release_proto_idx(prot);
2221	list_del(&prot->node);
2222	write_unlock(&proto_list_lock);
2223
2224	if (prot->slab != NULL) {
2225		kmem_cache_destroy(prot->slab);
2226		prot->slab = NULL;
2227	}
2228
2229	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2230		kmem_cache_destroy(prot->rsk_prot->slab);
2231		kfree(prot->rsk_prot->slab_name);
2232		prot->rsk_prot->slab = NULL;
2233	}
2234
2235	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2236		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2237		kfree(prot->twsk_prot->twsk_slab_name);
2238		prot->twsk_prot->twsk_slab = NULL;
2239	}
2240}
2241EXPORT_SYMBOL(proto_unregister);
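
/*
 * Typical registration pattern for a protocol module (names illustrative,
 * not from this file): describe the socket type in a struct proto, register
 * it at init time and unregister it on exit:
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_proto_sock),
 *		...
 *	};
 *
 *	err = proto_register(&my_proto, 1);	(1 => also create a slab cache)
 *	...
 *	proto_unregister(&my_proto);
 */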
2242
2243#ifdef CONFIG_PROC_FS
2244static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2245	__acquires(proto_list_lock)
2246{
2247	read_lock(&proto_list_lock);
2248	return seq_list_start_head(&proto_list, *pos);
2249}
2250
2251static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2252{
2253	return seq_list_next(v, &proto_list, pos);
2254}
2255
2256static void proto_seq_stop(struct seq_file *seq, void *v)
2257	__releases(proto_list_lock)
2258{
2259	read_unlock(&proto_list_lock);
2260}
2261
2262static char proto_method_implemented(const void *method)
2263{
2264	return method == NULL ? 'n' : 'y';
2265}
2266
2267static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2268{
2269	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2270			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2271		   proto->name,
2272		   proto->obj_size,
2273		   sock_prot_inuse_get(seq_file_net(seq), proto),
2274		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2275		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2276		   proto->max_header,
2277		   proto->slab == NULL ? "no" : "yes",
2278		   module_name(proto->owner),
2279		   proto_method_implemented(proto->close),
2280		   proto_method_implemented(proto->connect),
2281		   proto_method_implemented(proto->disconnect),
2282		   proto_method_implemented(proto->accept),
2283		   proto_method_implemented(proto->ioctl),
2284		   proto_method_implemented(proto->init),
2285		   proto_method_implemented(proto->destroy),
2286		   proto_method_implemented(proto->shutdown),
2287		   proto_method_implemented(proto->setsockopt),
2288		   proto_method_implemented(proto->getsockopt),
2289		   proto_method_implemented(proto->sendmsg),
2290		   proto_method_implemented(proto->recvmsg),
2291		   proto_method_implemented(proto->sendpage),
2292		   proto_method_implemented(proto->bind),
2293		   proto_method_implemented(proto->backlog_rcv),
2294		   proto_method_implemented(proto->hash),
2295		   proto_method_implemented(proto->unhash),
2296		   proto_method_implemented(proto->get_port),
2297		   proto_method_implemented(proto->enter_memory_pressure));
2298}
2299
2300static int proto_seq_show(struct seq_file *seq, void *v)
2301{
2302	if (v == &proto_list)
2303		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2304			   "protocol",
2305			   "size",
2306			   "sockets",
2307			   "memory",
2308			   "press",
2309			   "maxhdr",
2310			   "slab",
2311			   "module",
2312			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2313	else
2314		proto_seq_printf(seq, list_entry(v, struct proto, node));
2315	return 0;
2316}
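
/*
 * The resulting /proc/net/protocols table has one row per registered
 * protocol; a row might look roughly like this (values illustrative only):
 *
 *	protocol  size sockets  memory press maxhdr slab module  cl co di ...
 *	TCP       1680       3       0   no     320  yes  kernel  y  y  y ...
 */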
2317
2318static const struct seq_operations proto_seq_ops = {
2319	.start  = proto_seq_start,
2320	.next   = proto_seq_next,
2321	.stop   = proto_seq_stop,
2322	.show   = proto_seq_show,
2323};
2324
2325static int proto_seq_open(struct inode *inode, struct file *file)
2326{
2327	return seq_open_net(inode, file, &proto_seq_ops,
2328			    sizeof(struct seq_net_private));
2329}
2330
2331static const struct file_operations proto_seq_fops = {
2332	.owner		= THIS_MODULE,
2333	.open		= proto_seq_open,
2334	.read		= seq_read,
2335	.llseek		= seq_lseek,
2336	.release	= seq_release_net,
2337};
2338
2339static __net_init int proto_init_net(struct net *net)
2340{
2341	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2342		return -ENOMEM;
2343
2344	return 0;
2345}
2346
2347static __net_exit void proto_exit_net(struct net *net)
2348{
2349	proc_net_remove(net, "protocols");
2350}
2351
2352
2353static __net_initdata struct pernet_operations proto_net_ops = {
2354	.init = proto_init_net,
2355	.exit = proto_exit_net,
2356};
2357
2358static int __init proto_init(void)
2359{
2360	return register_pernet_subsys(&proto_net_ops);
2361}
2362
2363subsys_initcall(proto_init);
2364
2365#endif /* PROC_FS */
2366