sock.c revision 36cbd3dcc10384f813ec0814255f576c84f2bcd4
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
120#include <net/net_namespace.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <linux/net_tstamp.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140/*
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *const af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
157  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158  "sk_lock-AF_IEEE802154",
159  "sk_lock-AF_MAX"
160};
161static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174  "slock-AF_IEEE802154",
175  "slock-AF_MAX"
176};
177static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190  "clock-AF_IEEE802154",
191  "clock-AF_MAX"
192};
193
194/*
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
197 */
198static struct lock_class_key af_callback_keys[AF_MAX];
199
200/* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms.  This makes socket queueing behavior and performance
203 * not depend upon such differences.
204 */
205#define _SK_MEM_PACKETS		256
206#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
207#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
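/* Illustrative sizing (a sketch only; sizeof(struct sk_buff) is architecture
 * and configuration dependent, roughly 200-250 bytes on a 64-bit build of
 * this era): _SK_MEM_OVERHEAD then comes to ~450-500 bytes, so the default
 * SK_WMEM_MAX/SK_RMEM_MAX work out to roughly 110-125 KiB per direction.
 */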
209
210/* Run time adjustable parameters. */
211__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216/* Maximal space eaten by iovec or ancillary data plus some space */
217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218EXPORT_SYMBOL(sysctl_optmem_max);
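/* Worked number (architecture dependent): with 8-byte longs and
 * UIO_MAXIOV == 1024 this default comes to 8 * (2*1024 + 512) = 20480 bytes
 * of option memory per socket.
 */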
219
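/* Convert a user-supplied struct timeval into a jiffies timeout for
 * SO_RCVTIMEO/SO_SNDTIMEO.  A zero timeval selects MAX_SCHEDULE_TIMEOUT
 * ("wait forever"); negative seconds are clamped to a zero timeout with a
 * rate-limited warning.  As a worked example (HZ is configuration
 * dependent), with HZ=1000 a timeval of { 2, 500000 } becomes
 * 2*1000 + (500000 + 999)/1000 = 2500 jiffies.
 */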
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{
222	struct timeval tv;
223
224	if (optlen < sizeof(tv))
225		return -EINVAL;
226	if (copy_from_user(&tv, optval, sizeof(tv)))
227		return -EFAULT;
228	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229		return -EDOM;
230
231	if (tv.tv_sec < 0) {
232		static int warned __read_mostly;
233
234		*timeo_p = 0;
235		if (warned < 10 && net_ratelimit()) {
236			warned++;
237			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238			       "tries to set negative timeout\n",
239				current->comm, task_pid_nr(current));
240		}
241		return 0;
242	}
243	*timeo_p = MAX_SCHEDULE_TIMEOUT;
244	if (tv.tv_sec == 0 && tv.tv_usec == 0)
245		return 0;
246	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248	return 0;
249}
250
251static void sock_warn_obsolete_bsdism(const char *name)
252{
253	static int warned;
254	static char warncomm[TASK_COMM_LEN];
255	if (strcmp(warncomm, current->comm) && warned < 5) {
256		strcpy(warncomm,  current->comm);
257		printk(KERN_WARNING "process `%s' is using obsolete "
258		       "%s SO_BSDCOMPAT\n", warncomm, name);
259		warned++;
260	}
261}
262
263static void sock_disable_timestamp(struct sock *sk, int flag)
264{
265	if (sock_flag(sk, flag)) {
266		sock_reset_flag(sk, flag);
267		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269			net_disable_timestamp();
270		}
271	}
272}
273
274
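/* Charge @skb to @sk's receive buffer and append it to the receive queue,
 * running the socket filter first.  Returns -ENOMEM when sk_rcvbuf would be
 * exceeded, the filter's error if it rejects the packet, or -ENOBUFS when
 * receive memory cannot be scheduled; on success ownership passes to the
 * socket and sk_data_ready() is invoked.
 */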
275int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276{
277	int err = 0;
278	int skb_len;
279
280	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces the
281	   number of warnings when compiling with -W. --ANK
282	 */
283	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
284	    (unsigned)sk->sk_rcvbuf) {
285		err = -ENOMEM;
286		goto out;
287	}
288
289	err = sk_filter(sk, skb);
290	if (err)
291		goto out;
292
293	if (!sk_rmem_schedule(sk, skb->truesize)) {
294		err = -ENOBUFS;
295		goto out;
296	}
297
298	skb->dev = NULL;
299	skb_set_owner_r(skb, sk);
300
301	/* Cache the SKB length before we tack it onto the receive
302	 * queue.  Once it is added it no longer belongs to us and
303	 * may be freed by other threads of control pulling packets
304	 * from the queue.
305	 */
306	skb_len = skb->len;
307
308	skb_queue_tail(&sk->sk_receive_queue, skb);
309
310	if (!sock_flag(sk, SOCK_DEAD))
311		sk->sk_data_ready(sk, skb_len);
312out:
313	return err;
314}
315EXPORT_SYMBOL(sock_queue_rcv_skb);
316
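/* Hand @skb to @sk for protocol processing: after the socket filter, the
 * packet is either processed immediately via sk_backlog_rcv() when no user
 * context owns the socket, or queued on the backlog to be handled at
 * release_sock() time.  Consumes the caller's reference on @sk.
 */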
317int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
318{
319	int rc = NET_RX_SUCCESS;
320
321	if (sk_filter(sk, skb))
322		goto discard_and_relse;
323
324	skb->dev = NULL;
325
326	if (nested)
327		bh_lock_sock_nested(sk);
328	else
329		bh_lock_sock(sk);
330	if (!sock_owned_by_user(sk)) {
331		/*
332		 * trylock + unlock semantics:
333		 */
334		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
335
336		rc = sk_backlog_rcv(sk, skb);
337
338		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
339	} else
340		sk_add_backlog(sk, skb);
341	bh_unlock_sock(sk);
342out:
343	sock_put(sk);
344	return rc;
345discard_and_relse:
346	kfree_skb(skb);
347	goto out;
348}
349EXPORT_SYMBOL(sk_receive_skb);
350
351struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
352{
353	struct dst_entry *dst = sk->sk_dst_cache;
354
355	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356		sk->sk_dst_cache = NULL;
357		dst_release(dst);
358		return NULL;
359	}
360
361	return dst;
362}
363EXPORT_SYMBOL(__sk_dst_check);
364
365struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
366{
367	struct dst_entry *dst = sk_dst_get(sk);
368
369	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370		sk_dst_reset(sk);
371		dst_release(dst);
372		return NULL;
373	}
374
375	return dst;
376}
377EXPORT_SYMBOL(sk_dst_check);
378
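/* SO_BINDTODEVICE helper: translate the user-supplied interface name into an
 * ifindex (an empty name, or a zero option length, unbinds) and record it in
 * sk_bound_dev_if.  Requires CAP_NET_RAW.
 */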
379static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
380{
381	int ret = -ENOPROTOOPT;
382#ifdef CONFIG_NETDEVICES
383	struct net *net = sock_net(sk);
384	char devname[IFNAMSIZ];
385	int index;
386
387	/* Sorry... */
388	ret = -EPERM;
389	if (!capable(CAP_NET_RAW))
390		goto out;
391
392	ret = -EINVAL;
393	if (optlen < 0)
394		goto out;
395
396	/* Bind this socket to a particular device like "eth0",
397	 * as specified in the passed interface name. If the
398	 * name is "" or the option length is zero the socket
399	 * is not bound.
400	 */
401	if (optlen > IFNAMSIZ - 1)
402		optlen = IFNAMSIZ - 1;
403	memset(devname, 0, sizeof(devname));
404
405	ret = -EFAULT;
406	if (copy_from_user(devname, optval, optlen))
407		goto out;
408
409	if (devname[0] == '\0') {
410		index = 0;
411	} else {
412		struct net_device *dev = dev_get_by_name(net, devname);
413
414		ret = -ENODEV;
415		if (!dev)
416			goto out;
417
418		index = dev->ifindex;
419		dev_put(dev);
420	}
421
422	lock_sock(sk);
423	sk->sk_bound_dev_if = index;
424	sk_dst_reset(sk);
425	release_sock(sk);
426
427	ret = 0;
428
429out:
430#endif
431
432	return ret;
433}
434
435static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
436{
437	if (valbool)
438		sock_set_flag(sk, bit);
439	else
440		sock_reset_flag(sk, bit);
441}
442
443/*
444 *	This is meant for all protocols to use and covers goings on
445 *	at the socket level. Everything here is generic.
446 */
447
448int sock_setsockopt(struct socket *sock, int level, int optname,
449		    char __user *optval, int optlen)
450{
451	struct sock *sk = sock->sk;
452	int val;
453	int valbool;
454	struct linger ling;
455	int ret = 0;
456
457	/*
458	 *	Options without arguments
459	 */
460
461	if (optname == SO_BINDTODEVICE)
462		return sock_bindtodevice(sk, optval, optlen);
463
464	if (optlen < sizeof(int))
465		return -EINVAL;
466
467	if (get_user(val, (int __user *)optval))
468		return -EFAULT;
469
470	valbool = val ? 1 : 0;
471
472	lock_sock(sk);
473
474	switch (optname) {
475	case SO_DEBUG:
476		if (val && !capable(CAP_NET_ADMIN))
477			ret = -EACCES;
478		else
479			sock_valbool_flag(sk, SOCK_DBG, valbool);
480		break;
481	case SO_REUSEADDR:
482		sk->sk_reuse = valbool;
483		break;
484	case SO_TYPE:
485	case SO_ERROR:
486		ret = -ENOPROTOOPT;
487		break;
488	case SO_DONTROUTE:
489		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
490		break;
491	case SO_BROADCAST:
492		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
493		break;
494	case SO_SNDBUF:
495		/* Don't error on this; BSD doesn't, and if you think
496		   about it, this is right. Otherwise apps have to
497		   play 'guess the biggest size' games. RCVBUF/SNDBUF
498		   are treated in BSD as hints */
499
500		if (val > sysctl_wmem_max)
501			val = sysctl_wmem_max;
502set_sndbuf:
503		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
504		if ((val * 2) < SOCK_MIN_SNDBUF)
505			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
506		else
507			sk->sk_sndbuf = val * 2;
508
509		/*
510		 *	Wake up sending tasks if we
511		 *	upped the value.
512		 */
513		sk->sk_write_space(sk);
514		break;
515
516	case SO_SNDBUFFORCE:
517		if (!capable(CAP_NET_ADMIN)) {
518			ret = -EPERM;
519			break;
520		}
521		goto set_sndbuf;
522
523	case SO_RCVBUF:
524		/* Don't error on this; BSD doesn't, and if you think
525		   about it, this is right. Otherwise apps have to
526		   play 'guess the biggest size' games. RCVBUF/SNDBUF
527		   are treated in BSD as hints */
528
529		if (val > sysctl_rmem_max)
530			val = sysctl_rmem_max;
531set_rcvbuf:
532		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
533		/*
534		 * We double it on the way in to account for
535		 * "struct sk_buff" etc. overhead.   Applications
536		 * assume that the SO_RCVBUF setting they make will
537		 * allow that much actual data to be received on that
538		 * socket.
539		 *
540		 * Applications are unaware that "struct sk_buff" and
541		 * other overheads allocate from the receive buffer
542		 * during socket buffer allocation.
543		 *
544		 * And after considering the possible alternatives,
545		 * returning the value we actually used in getsockopt
546		 * is the most desirable behavior.
547		 */
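		/* Illustrative userspace view (a sketch, assuming the request
		 * is within sysctl_rmem_max):
		 *
		 *	int val = 65536;
		 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
		 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
		 *
		 * leaves sk->sk_rcvbuf at 131072, and that doubled value is
		 * what getsockopt() reports back.
		 */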
548		if ((val * 2) < SOCK_MIN_RCVBUF)
549			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
550		else
551			sk->sk_rcvbuf = val * 2;
552		break;
553
554	case SO_RCVBUFFORCE:
555		if (!capable(CAP_NET_ADMIN)) {
556			ret = -EPERM;
557			break;
558		}
559		goto set_rcvbuf;
560
561	case SO_KEEPALIVE:
562#ifdef CONFIG_INET
563		if (sk->sk_protocol == IPPROTO_TCP)
564			tcp_set_keepalive(sk, valbool);
565#endif
566		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
567		break;
568
569	case SO_OOBINLINE:
570		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
571		break;
572
573	case SO_NO_CHECK:
574		sk->sk_no_check = valbool;
575		break;
576
577	case SO_PRIORITY:
578		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
579			sk->sk_priority = val;
580		else
581			ret = -EPERM;
582		break;
583
584	case SO_LINGER:
585		if (optlen < sizeof(ling)) {
586			ret = -EINVAL;	/* 1003.1g */
587			break;
588		}
589		if (copy_from_user(&ling, optval, sizeof(ling))) {
590			ret = -EFAULT;
591			break;
592		}
593		if (!ling.l_onoff)
594			sock_reset_flag(sk, SOCK_LINGER);
595		else {
596#if (BITS_PER_LONG == 32)
597			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
598				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
599			else
600#endif
601				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
602			sock_set_flag(sk, SOCK_LINGER);
603		}
604		break;
605
606	case SO_BSDCOMPAT:
607		sock_warn_obsolete_bsdism("setsockopt");
608		break;
609
610	case SO_PASSCRED:
611		if (valbool)
612			set_bit(SOCK_PASSCRED, &sock->flags);
613		else
614			clear_bit(SOCK_PASSCRED, &sock->flags);
615		break;
616
617	case SO_TIMESTAMP:
618	case SO_TIMESTAMPNS:
619		if (valbool)  {
620			if (optname == SO_TIMESTAMP)
621				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
622			else
623				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
624			sock_set_flag(sk, SOCK_RCVTSTAMP);
625			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
626		} else {
627			sock_reset_flag(sk, SOCK_RCVTSTAMP);
628			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
629		}
630		break;
631
632	case SO_TIMESTAMPING:
633		if (val & ~SOF_TIMESTAMPING_MASK) {
634			ret = -EINVAL;
635			break;
636		}
637		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
638				  val & SOF_TIMESTAMPING_TX_HARDWARE);
639		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
640				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
641		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
642				  val & SOF_TIMESTAMPING_RX_HARDWARE);
643		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
644			sock_enable_timestamp(sk,
645					      SOCK_TIMESTAMPING_RX_SOFTWARE);
646		else
647			sock_disable_timestamp(sk,
648					       SOCK_TIMESTAMPING_RX_SOFTWARE);
649		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
650				  val & SOF_TIMESTAMPING_SOFTWARE);
651		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
652				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
653		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
654				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
655		break;
656
657	case SO_RCVLOWAT:
658		if (val < 0)
659			val = INT_MAX;
660		sk->sk_rcvlowat = val ? : 1;
661		break;
662
663	case SO_RCVTIMEO:
664		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
665		break;
666
667	case SO_SNDTIMEO:
668		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
669		break;
670
671	case SO_ATTACH_FILTER:
672		ret = -EINVAL;
673		if (optlen == sizeof(struct sock_fprog)) {
674			struct sock_fprog fprog;
675
676			ret = -EFAULT;
677			if (copy_from_user(&fprog, optval, sizeof(fprog)))
678				break;
679
680			ret = sk_attach_filter(&fprog, sk);
681		}
682		break;
683
684	case SO_DETACH_FILTER:
685		ret = sk_detach_filter(sk);
686		break;
687
688	case SO_PASSSEC:
689		if (valbool)
690			set_bit(SOCK_PASSSEC, &sock->flags);
691		else
692			clear_bit(SOCK_PASSSEC, &sock->flags);
693		break;
694	case SO_MARK:
695		if (!capable(CAP_NET_ADMIN))
696			ret = -EPERM;
697		else
698			sk->sk_mark = val;
699		break;
700
701		/* We implement SO_SNDLOWAT etc. as
702		   not settable (1003.1g 5.3) */
703	default:
704		ret = -ENOPROTOOPT;
705		break;
706	}
707	release_sock(sk);
708	return ret;
709}
710EXPORT_SYMBOL(sock_setsockopt);
711
712
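/* Generic SOL_SOCKET getsockopt(): copy the requested option value to
 * userspace, truncated to the smaller of the user-supplied length and the
 * natural size of the option, and report the length actually used through
 * @optlen.
 */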
713int sock_getsockopt(struct socket *sock, int level, int optname,
714		    char __user *optval, int __user *optlen)
715{
716	struct sock *sk = sock->sk;
717
718	union {
719		int val;
720		struct linger ling;
721		struct timeval tm;
722	} v;
723
724	unsigned int lv = sizeof(int);
725	int len;
726
727	if (get_user(len, optlen))
728		return -EFAULT;
729	if (len < 0)
730		return -EINVAL;
731
732	memset(&v, 0, sizeof(v));
733
734	switch (optname) {
735	case SO_DEBUG:
736		v.val = sock_flag(sk, SOCK_DBG);
737		break;
738
739	case SO_DONTROUTE:
740		v.val = sock_flag(sk, SOCK_LOCALROUTE);
741		break;
742
743	case SO_BROADCAST:
744		v.val = !!sock_flag(sk, SOCK_BROADCAST);
745		break;
746
747	case SO_SNDBUF:
748		v.val = sk->sk_sndbuf;
749		break;
750
751	case SO_RCVBUF:
752		v.val = sk->sk_rcvbuf;
753		break;
754
755	case SO_REUSEADDR:
756		v.val = sk->sk_reuse;
757		break;
758
759	case SO_KEEPALIVE:
760		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
761		break;
762
763	case SO_TYPE:
764		v.val = sk->sk_type;
765		break;
766
767	case SO_ERROR:
768		v.val = -sock_error(sk);
769		if (v.val == 0)
770			v.val = xchg(&sk->sk_err_soft, 0);
771		break;
772
773	case SO_OOBINLINE:
774		v.val = !!sock_flag(sk, SOCK_URGINLINE);
775		break;
776
777	case SO_NO_CHECK:
778		v.val = sk->sk_no_check;
779		break;
780
781	case SO_PRIORITY:
782		v.val = sk->sk_priority;
783		break;
784
785	case SO_LINGER:
786		lv		= sizeof(v.ling);
787		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
788		v.ling.l_linger	= sk->sk_lingertime / HZ;
789		break;
790
791	case SO_BSDCOMPAT:
792		sock_warn_obsolete_bsdism("getsockopt");
793		break;
794
795	case SO_TIMESTAMP:
796		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
797				!sock_flag(sk, SOCK_RCVTSTAMPNS);
798		break;
799
800	case SO_TIMESTAMPNS:
801		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
802		break;
803
804	case SO_TIMESTAMPING:
805		v.val = 0;
806		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
807			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
808		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
809			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
810		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
811			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
812		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
813			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
814		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
815			v.val |= SOF_TIMESTAMPING_SOFTWARE;
816		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
817			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
818		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
819			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
820		break;
821
822	case SO_RCVTIMEO:
823		lv = sizeof(struct timeval);
824		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
825			v.tm.tv_sec = 0;
826			v.tm.tv_usec = 0;
827		} else {
828			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
829			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
830		}
831		break;
832
833	case SO_SNDTIMEO:
834		lv = sizeof(struct timeval);
835		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
836			v.tm.tv_sec = 0;
837			v.tm.tv_usec = 0;
838		} else {
839			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
840			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
841		}
842		break;
843
844	case SO_RCVLOWAT:
845		v.val = sk->sk_rcvlowat;
846		break;
847
848	case SO_SNDLOWAT:
849		v.val = 1;
850		break;
851
852	case SO_PASSCRED:
853		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
854		break;
855
856	case SO_PEERCRED:
857		if (len > sizeof(sk->sk_peercred))
858			len = sizeof(sk->sk_peercred);
859		if (copy_to_user(optval, &sk->sk_peercred, len))
860			return -EFAULT;
861		goto lenout;
862
863	case SO_PEERNAME:
864	{
865		char address[128];
866
867		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
868			return -ENOTCONN;
869		if (lv < len)
870			return -EINVAL;
871		if (copy_to_user(optval, address, len))
872			return -EFAULT;
873		goto lenout;
874	}
875
876	/* Dubious BSD thing... Probably nobody even uses it, but
877	 * the UNIX standard wants it for whatever reason... -DaveM
878	 */
879	case SO_ACCEPTCONN:
880		v.val = sk->sk_state == TCP_LISTEN;
881		break;
882
883	case SO_PASSSEC:
884		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
885		break;
886
887	case SO_PEERSEC:
888		return security_socket_getpeersec_stream(sock, optval, optlen, len);
889
890	case SO_MARK:
891		v.val = sk->sk_mark;
892		break;
893
894	default:
895		return -ENOPROTOOPT;
896	}
897
898	if (len > lv)
899		len = lv;
900	if (copy_to_user(optval, &v, len))
901		return -EFAULT;
902lenout:
903	if (put_user(len, optlen))
904		return -EFAULT;
905	return 0;
906}
907
908/*
909 * Initialize an sk_lock.
910 *
911 * (We also register the sk_lock with the lock validator.)
912 */
913static inline void sock_lock_init(struct sock *sk)
914{
915	sock_lock_init_class_and_name(sk,
916			af_family_slock_key_strings[sk->sk_family],
917			af_family_slock_keys + sk->sk_family,
918			af_family_key_strings[sk->sk_family],
919			af_family_keys + sk->sk_family);
920}
921
922/*
923 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
924 * even temporarily, because of RCU lookups. sk_node should also be left as is.
925 */
926static void sock_copy(struct sock *nsk, const struct sock *osk)
927{
928#ifdef CONFIG_SECURITY_NETWORK
929	void *sptr = nsk->sk_security;
930#endif
931	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
932		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
933	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
934	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
935#ifdef CONFIG_SECURITY_NETWORK
936	nsk->sk_security = sptr;
937	security_sk_clone(osk, nsk);
938#endif
939}
940
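/* Allocate the protocol-private struct sock, from the protocol's dedicated
 * slab cache when it has one, otherwise with kmalloc().  When __GFP_ZERO is
 * requested the object is cleared by hand so that sk_node.next is preserved
 * for SLAB_DESTROY_BY_RCU caches; LSM state is then allocated and a module
 * reference taken on the owning protocol.
 */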
941static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
942		int family)
943{
944	struct sock *sk;
945	struct kmem_cache *slab;
946
947	slab = prot->slab;
948	if (slab != NULL) {
949		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
950		if (!sk)
951			return sk;
952		if (priority & __GFP_ZERO) {
953			/*
954			 * caches using SLAB_DESTROY_BY_RCU should leave
955			 * sk_node.next unmodified. Special care is taken
956			 * when initializing the object to zero.
957			 */
958			if (offsetof(struct sock, sk_node.next) != 0)
959				memset(sk, 0, offsetof(struct sock, sk_node.next));
960			memset(&sk->sk_node.pprev, 0,
961			       prot->obj_size - offsetof(struct sock,
962							 sk_node.pprev));
963		}
964	}
965	else
966		sk = kmalloc(prot->obj_size, priority);
967
968	if (sk != NULL) {
969		kmemcheck_annotate_bitfield(sk, flags);
970
971		if (security_sk_alloc(sk, family, priority))
972			goto out_free;
973
974		if (!try_module_get(prot->owner))
975			goto out_free_sec;
976	}
977
978	return sk;
979
980out_free_sec:
981	security_sk_free(sk);
982out_free:
983	if (slab != NULL)
984		kmem_cache_free(slab, sk);
985	else
986		kfree(sk);
987	return NULL;
988}
989
990static void sk_prot_free(struct proto *prot, struct sock *sk)
991{
992	struct kmem_cache *slab;
993	struct module *owner;
994
995	owner = prot->owner;
996	slab = prot->slab;
997
998	security_sk_free(sk);
999	if (slab != NULL)
1000		kmem_cache_free(slab, sk);
1001	else
1002		kfree(sk);
1003	module_put(owner);
1004}
1005
1006/**
1007 *	sk_alloc - All socket objects are allocated here
1008 *	@net: the applicable net namespace
1009 *	@family: protocol family
1010 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1011 *	@prot: struct proto associated with this new sock instance
1012 */
1013struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1014		      struct proto *prot)
1015{
1016	struct sock *sk;
1017
1018	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1019	if (sk) {
1020		sk->sk_family = family;
1021		/*
1022		 * See comment in struct sock definition to understand
1023		 * why we need sk_prot_creator -acme
1024		 */
1025		sk->sk_prot = sk->sk_prot_creator = prot;
1026		sock_lock_init(sk);
1027		sock_net_set(sk, get_net(net));
1028	}
1029
1030	return sk;
1031}
1032EXPORT_SYMBOL(sk_alloc);
1033
1034static void __sk_free(struct sock *sk)
1035{
1036	struct sk_filter *filter;
1037
1038	if (sk->sk_destruct)
1039		sk->sk_destruct(sk);
1040
1041	filter = rcu_dereference(sk->sk_filter);
1042	if (filter) {
1043		sk_filter_uncharge(sk, filter);
1044		rcu_assign_pointer(sk->sk_filter, NULL);
1045	}
1046
1047	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1048	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1049
1050	if (atomic_read(&sk->sk_omem_alloc))
1051		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1052		       __func__, atomic_read(&sk->sk_omem_alloc));
1053
1054	put_net(sock_net(sk));
1055	sk_prot_free(sk->sk_prot_creator, sk);
1056}
1057
1058void sk_free(struct sock *sk)
1059{
1060	/*
1061	 * We subtract one from sk_wmem_alloc so we can tell whether
1062	 * some packets are still in some tx queue.
1063	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1064	 */
1065	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1066		__sk_free(sk);
1067}
1068EXPORT_SYMBOL(sk_free);
1069
1070/*
1071 * The last sock_put should drop the reference to sk->sk_net. It has already
1072 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1073 * is not an option.
1074 * Take a reference to the socket to remove it from the hash while _alive_ and
1075 * after that destroy it in the context of init_net.
1076 */
1077void sk_release_kernel(struct sock *sk)
1078{
1079	if (sk == NULL || sk->sk_socket == NULL)
1080		return;
1081
1082	sock_hold(sk);
1083	sock_release(sk->sk_socket);
1084	release_net(sock_net(sk));
1085	sock_net_set(sk, get_net(&init_net));
1086	sock_put(sk);
1087}
1088EXPORT_SYMBOL(sk_release_kernel);
1089
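/* Duplicate @sk for a passively opened connection: the parent is copied
 * wholesale, then per-instance state (queues, locks, memory accounting,
 * destination cache) is reinitialised, the inherited socket filter is
 * charged to the child, the xfrm policy is cloned, and the child starts
 * with a reference count of two.
 */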
1090struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1091{
1092	struct sock *newsk;
1093
1094	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1095	if (newsk != NULL) {
1096		struct sk_filter *filter;
1097
1098		sock_copy(newsk, sk);
1099
1100		/* SANITY */
1101		get_net(sock_net(newsk));
1102		sk_node_init(&newsk->sk_node);
1103		sock_lock_init(newsk);
1104		bh_lock_sock(newsk);
1105		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1106
1107		atomic_set(&newsk->sk_rmem_alloc, 0);
1108		/*
1109		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1110		 */
1111		atomic_set(&newsk->sk_wmem_alloc, 1);
1112		atomic_set(&newsk->sk_omem_alloc, 0);
1113		skb_queue_head_init(&newsk->sk_receive_queue);
1114		skb_queue_head_init(&newsk->sk_write_queue);
1115#ifdef CONFIG_NET_DMA
1116		skb_queue_head_init(&newsk->sk_async_wait_queue);
1117#endif
1118
1119		rwlock_init(&newsk->sk_dst_lock);
1120		rwlock_init(&newsk->sk_callback_lock);
1121		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1122				af_callback_keys + newsk->sk_family,
1123				af_family_clock_key_strings[newsk->sk_family]);
1124
1125		newsk->sk_dst_cache	= NULL;
1126		newsk->sk_wmem_queued	= 0;
1127		newsk->sk_forward_alloc = 0;
1128		newsk->sk_send_head	= NULL;
1129		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1130
1131		sock_reset_flag(newsk, SOCK_DONE);
1132		skb_queue_head_init(&newsk->sk_error_queue);
1133
1134		filter = newsk->sk_filter;
1135		if (filter != NULL)
1136			sk_filter_charge(newsk, filter);
1137
1138		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1139			/* It is still a raw copy of the parent, so invalidate
1140			 * the destructor and make a plain sk_free() */
1141			newsk->sk_destruct = NULL;
1142			sk_free(newsk);
1143			newsk = NULL;
1144			goto out;
1145		}
1146
1147		newsk->sk_err	   = 0;
1148		newsk->sk_priority = 0;
1149		/*
1150		 * Before updating sk_refcnt, we must commit prior changes to memory
1151		 * (Documentation/RCU/rculist_nulls.txt for details)
1152		 */
1153		smp_wmb();
1154		atomic_set(&newsk->sk_refcnt, 2);
1155
1156		/*
1157		 * Increment the counter in the same struct proto as the master
1158		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1159		 * is the same as sk->sk_prot->socks, as this field was copied
1160		 * with memcpy).
1161		 *
1162		 * This _changes_ the previous behaviour, where
1163		 * tcp_create_openreq_child was always incrementing the
1164		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1165		 * to be taken into account in all callers. -acme
1166		 */
1167		sk_refcnt_debug_inc(newsk);
1168		sk_set_socket(newsk, NULL);
1169		newsk->sk_sleep	 = NULL;
1170
1171		if (newsk->sk_prot->sockets_allocated)
1172			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1173	}
1174out:
1175	return newsk;
1176}
1177EXPORT_SYMBOL_GPL(sk_clone);
1178
1179void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1180{
1181	__sk_dst_set(sk, dst);
1182	sk->sk_route_caps = dst->dev->features;
1183	if (sk->sk_route_caps & NETIF_F_GSO)
1184		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1185	if (sk_can_gso(sk)) {
1186		if (dst->header_len) {
1187			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1188		} else {
1189			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1190			sk->sk_gso_max_size = dst->dev->gso_max_size;
1191		}
1192	}
1193}
1194EXPORT_SYMBOL_GPL(sk_setup_caps);
1195
1196void __init sk_init(void)
1197{
1198	if (num_physpages <= 4096) {
1199		sysctl_wmem_max = 32767;
1200		sysctl_rmem_max = 32767;
1201		sysctl_wmem_default = 32767;
1202		sysctl_rmem_default = 32767;
1203	} else if (num_physpages >= 131072) {
1204		sysctl_wmem_max = 131071;
1205		sysctl_rmem_max = 131071;
1206	}
1207}
1208
1209/*
1210 *	Simple resource managers for sockets.
1211 */
1212
1213
1214/*
1215 * Write buffer destructor automatically called from kfree_skb.
1216 */
1217void sock_wfree(struct sk_buff *skb)
1218{
1219	struct sock *sk = skb->sk;
1220	int res;
1221
1222	/* In case it might be waiting for more memory. */
1223	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1224	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1225		sk->sk_write_space(sk);
1226	/*
1227	 * if sk_wmem_alloc reached 0, we are the last user and should
1228	 * free this sock, as the sk_free() call could not do it.
1229	 */
1230	if (res == 0)
1231		__sk_free(sk);
1232}
1233EXPORT_SYMBOL(sock_wfree);
1234
1235/*
1236 * Read buffer destructor automatically called from kfree_skb.
1237 */
1238void sock_rfree(struct sk_buff *skb)
1239{
1240	struct sock *sk = skb->sk;
1241
1242	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1243	sk_mem_uncharge(skb->sk, skb->truesize);
1244}
1245EXPORT_SYMBOL(sock_rfree);
1246
1247
1248int sock_i_uid(struct sock *sk)
1249{
1250	int uid;
1251
1252	read_lock(&sk->sk_callback_lock);
1253	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1254	read_unlock(&sk->sk_callback_lock);
1255	return uid;
1256}
1257EXPORT_SYMBOL(sock_i_uid);
1258
1259unsigned long sock_i_ino(struct sock *sk)
1260{
1261	unsigned long ino;
1262
1263	read_lock(&sk->sk_callback_lock);
1264	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1265	read_unlock(&sk->sk_callback_lock);
1266	return ino;
1267}
1268EXPORT_SYMBOL(sock_i_ino);
1269
1270/*
1271 * Allocate a skb from the socket's send buffer.
1272 */
1273struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1274			     gfp_t priority)
1275{
1276	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1277		struct sk_buff *skb = alloc_skb(size, priority);
1278		if (skb) {
1279			skb_set_owner_w(skb, sk);
1280			return skb;
1281		}
1282	}
1283	return NULL;
1284}
1285EXPORT_SYMBOL(sock_wmalloc);
1286
1287/*
1288 * Allocate a skb from the socket's receive buffer.
1289 */
1290struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1291			     gfp_t priority)
1292{
1293	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1294		struct sk_buff *skb = alloc_skb(size, priority);
1295		if (skb) {
1296			skb_set_owner_r(skb, sk);
1297			return skb;
1298		}
1299	}
1300	return NULL;
1301}
1302
1303/*
1304 * Allocate a memory block from the socket's option memory buffer.
1305 */
1306void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1307{
1308	if ((unsigned)size <= sysctl_optmem_max &&
1309	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1310		void *mem;
1311		/* First do the add, to avoid the race if kmalloc
1312		 * might sleep.
1313		 */
1314		atomic_add(size, &sk->sk_omem_alloc);
1315		mem = kmalloc(size, priority);
1316		if (mem)
1317			return mem;
1318		atomic_sub(size, &sk->sk_omem_alloc);
1319	}
1320	return NULL;
1321}
1322EXPORT_SYMBOL(sock_kmalloc);
1323
1324/*
1325 * Free an option memory block.
1326 */
1327void sock_kfree_s(struct sock *sk, void *mem, int size)
1328{
1329	kfree(mem);
1330	atomic_sub(size, &sk->sk_omem_alloc);
1331}
1332EXPORT_SYMBOL(sock_kfree_s);
1333
1334/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1335   I think these locks should be removed for datagram sockets.
1336 */
1337static long sock_wait_for_wmem(struct sock *sk, long timeo)
1338{
1339	DEFINE_WAIT(wait);
1340
1341	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1342	for (;;) {
1343		if (!timeo)
1344			break;
1345		if (signal_pending(current))
1346			break;
1347		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1348		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1349		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1350			break;
1351		if (sk->sk_shutdown & SEND_SHUTDOWN)
1352			break;
1353		if (sk->sk_err)
1354			break;
1355		timeo = schedule_timeout(timeo);
1356	}
1357	finish_wait(sk->sk_sleep, &wait);
1358	return timeo;
1359}
1360
1361
1362/*
1363 *	Generic send/receive buffer handlers
1364 */
1365
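/* Allocate a send skb with @header_len bytes of linear space and @data_len
 * bytes spread across page fragments, charged against @sk's send buffer.
 * Unless @noblock is set, waits (up to the socket's send timeout) for write
 * memory to become available; failures are reported through *@errcode.
 */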
1366struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1367				     unsigned long data_len, int noblock,
1368				     int *errcode)
1369{
1370	struct sk_buff *skb;
1371	gfp_t gfp_mask;
1372	long timeo;
1373	int err;
1374
1375	gfp_mask = sk->sk_allocation;
1376	if (gfp_mask & __GFP_WAIT)
1377		gfp_mask |= __GFP_REPEAT;
1378
1379	timeo = sock_sndtimeo(sk, noblock);
1380	while (1) {
1381		err = sock_error(sk);
1382		if (err != 0)
1383			goto failure;
1384
1385		err = -EPIPE;
1386		if (sk->sk_shutdown & SEND_SHUTDOWN)
1387			goto failure;
1388
1389		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1390			skb = alloc_skb(header_len, gfp_mask);
1391			if (skb) {
1392				int npages;
1393				int i;
1394
1395				/* No pages, we're done... */
1396				if (!data_len)
1397					break;
1398
1399				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1400				skb->truesize += data_len;
1401				skb_shinfo(skb)->nr_frags = npages;
1402				for (i = 0; i < npages; i++) {
1403					struct page *page;
1404					skb_frag_t *frag;
1405
1406					page = alloc_pages(sk->sk_allocation, 0);
1407					if (!page) {
1408						err = -ENOBUFS;
1409						skb_shinfo(skb)->nr_frags = i;
1410						kfree_skb(skb);
1411						goto failure;
1412					}
1413
1414					frag = &skb_shinfo(skb)->frags[i];
1415					frag->page = page;
1416					frag->page_offset = 0;
1417					frag->size = (data_len >= PAGE_SIZE ?
1418						      PAGE_SIZE :
1419						      data_len);
1420					data_len -= PAGE_SIZE;
1421				}
1422
1423				/* Full success... */
1424				break;
1425			}
1426			err = -ENOBUFS;
1427			goto failure;
1428		}
1429		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1430		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1431		err = -EAGAIN;
1432		if (!timeo)
1433			goto failure;
1434		if (signal_pending(current))
1435			goto interrupted;
1436		timeo = sock_wait_for_wmem(sk, timeo);
1437	}
1438
1439	skb_set_owner_w(skb, sk);
1440	return skb;
1441
1442interrupted:
1443	err = sock_intr_errno(timeo);
1444failure:
1445	*errcode = err;
1446	return NULL;
1447}
1448EXPORT_SYMBOL(sock_alloc_send_pskb);
1449
1450struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1451				    int noblock, int *errcode)
1452{
1453	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1454}
1455EXPORT_SYMBOL(sock_alloc_send_skb);
1456
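/* Sleep until the current socket lock owner releases ownership.  Called
 * with sk_lock.slock held and bottom halves disabled; both are dropped
 * around schedule() and re-acquired before returning.
 */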
1457static void __lock_sock(struct sock *sk)
1458{
1459	DEFINE_WAIT(wait);
1460
1461	for (;;) {
1462		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1463					TASK_UNINTERRUPTIBLE);
1464		spin_unlock_bh(&sk->sk_lock.slock);
1465		schedule();
1466		spin_lock_bh(&sk->sk_lock.slock);
1467		if (!sock_owned_by_user(sk))
1468			break;
1469	}
1470	finish_wait(&sk->sk_lock.wq, &wait);
1471}
1472
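/* Run every skb that accumulated on the backlog while a user context owned
 * the socket.  The backlog head/tail are detached first so that new packets
 * can keep queueing while sk_backlog_rcv() works through the private list,
 * and the outer loop repeats until the backlog stays empty.
 */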
1473static void __release_sock(struct sock *sk)
1474{
1475	struct sk_buff *skb = sk->sk_backlog.head;
1476
1477	do {
1478		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1479		bh_unlock_sock(sk);
1480
1481		do {
1482			struct sk_buff *next = skb->next;
1483
1484			skb->next = NULL;
1485			sk_backlog_rcv(sk, skb);
1486
1487			/*
1488			 * We are in process context here with softirqs
1489			 * disabled, use cond_resched_softirq() to preempt.
1490			 * This is safe to do because we've taken the backlog
1491			 * queue private:
1492			 */
1493			cond_resched_softirq();
1494
1495			skb = next;
1496		} while (skb != NULL);
1497
1498		bh_lock_sock(sk);
1499	} while ((skb = sk->sk_backlog.head) != NULL);
1500}
1501
1502/**
1503 * sk_wait_data - wait for data to arrive at sk_receive_queue
1504 * @sk:    sock to wait on
1505 * @timeo: for how long
1506 *
1507 * Now socket state including sk->sk_err is changed only under lock,
1508 * hence we may omit checks after joining the wait queue.
1509 * We check the receive queue before schedule() only as an optimization;
1510 * it is very likely that release_sock() added new data.
1511 */
1512int sk_wait_data(struct sock *sk, long *timeo)
1513{
1514	int rc;
1515	DEFINE_WAIT(wait);
1516
1517	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1518	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1519	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1520	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1521	finish_wait(sk->sk_sleep, &wait);
1522	return rc;
1523}
1524EXPORT_SYMBOL(sk_wait_data);
1525
1526/**
1527 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1528 *	@sk: socket
1529 *	@size: memory size to allocate
1530 *	@kind: allocation type
1531 *
1532 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1533 *	rmem allocation. This function assumes that protocols which have
1534 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1535 */
1536int __sk_mem_schedule(struct sock *sk, int size, int kind)
1537{
1538	struct proto *prot = sk->sk_prot;
1539	int amt = sk_mem_pages(size);
1540	int allocated;
1541
1542	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1543	allocated = atomic_add_return(amt, prot->memory_allocated);
1544
1545	/* Under limit. */
1546	if (allocated <= prot->sysctl_mem[0]) {
1547		if (prot->memory_pressure && *prot->memory_pressure)
1548			*prot->memory_pressure = 0;
1549		return 1;
1550	}
1551
1552	/* Under pressure. */
1553	if (allocated > prot->sysctl_mem[1])
1554		if (prot->enter_memory_pressure)
1555			prot->enter_memory_pressure(sk);
1556
1557	/* Over hard limit. */
1558	if (allocated > prot->sysctl_mem[2])
1559		goto suppress_allocation;
1560
1561	/* guarantee minimum buffer size under pressure */
1562	if (kind == SK_MEM_RECV) {
1563		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1564			return 1;
1565	} else { /* SK_MEM_SEND */
1566		if (sk->sk_type == SOCK_STREAM) {
1567			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1568				return 1;
1569		} else if (atomic_read(&sk->sk_wmem_alloc) <
1570			   prot->sysctl_wmem[0])
1571				return 1;
1572	}
1573
1574	if (prot->memory_pressure) {
1575		int alloc;
1576
1577		if (!*prot->memory_pressure)
1578			return 1;
1579		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1580		if (prot->sysctl_mem[2] > alloc *
1581		    sk_mem_pages(sk->sk_wmem_queued +
1582				 atomic_read(&sk->sk_rmem_alloc) +
1583				 sk->sk_forward_alloc))
1584			return 1;
1585	}
1586
1587suppress_allocation:
1588
1589	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1590		sk_stream_moderate_sndbuf(sk);
1591
1592		/* Fail only if socket is _under_ its sndbuf.
1593		 * In this case we cannot block, so we have to fail.
1594		 */
1595		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1596			return 1;
1597	}
1598
1599	/* Alas. Undo changes. */
1600	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1601	atomic_sub(amt, prot->memory_allocated);
1602	return 0;
1603}
1604EXPORT_SYMBOL(__sk_mem_schedule);
1605
1606/**
1607 *	__sk_mem_reclaim - reclaim memory_allocated
1608 *	@sk: socket
1609 */
1610void __sk_mem_reclaim(struct sock *sk)
1611{
1612	struct proto *prot = sk->sk_prot;
1613
1614	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1615		   prot->memory_allocated);
1616	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1617
1618	if (prot->memory_pressure && *prot->memory_pressure &&
1619	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1620		*prot->memory_pressure = 0;
1621}
1622EXPORT_SYMBOL(__sk_mem_reclaim);
1623
1624
1625/*
1626 * Set of default routines for initialising struct proto_ops when
1627 * the protocol does not support a particular function. In certain
1628 * cases where it makes no sense for a protocol to have a "do nothing"
1629 * function, some default processing is provided.
1630 */
1631
1632int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1633{
1634	return -EOPNOTSUPP;
1635}
1636EXPORT_SYMBOL(sock_no_bind);
1637
1638int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1639		    int len, int flags)
1640{
1641	return -EOPNOTSUPP;
1642}
1643EXPORT_SYMBOL(sock_no_connect);
1644
1645int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1646{
1647	return -EOPNOTSUPP;
1648}
1649EXPORT_SYMBOL(sock_no_socketpair);
1650
1651int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1652{
1653	return -EOPNOTSUPP;
1654}
1655EXPORT_SYMBOL(sock_no_accept);
1656
1657int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1658		    int *len, int peer)
1659{
1660	return -EOPNOTSUPP;
1661}
1662EXPORT_SYMBOL(sock_no_getname);
1663
1664unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1665{
1666	return 0;
1667}
1668EXPORT_SYMBOL(sock_no_poll);
1669
1670int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1671{
1672	return -EOPNOTSUPP;
1673}
1674EXPORT_SYMBOL(sock_no_ioctl);
1675
1676int sock_no_listen(struct socket *sock, int backlog)
1677{
1678	return -EOPNOTSUPP;
1679}
1680EXPORT_SYMBOL(sock_no_listen);
1681
1682int sock_no_shutdown(struct socket *sock, int how)
1683{
1684	return -EOPNOTSUPP;
1685}
1686EXPORT_SYMBOL(sock_no_shutdown);
1687
1688int sock_no_setsockopt(struct socket *sock, int level, int optname,
1689		    char __user *optval, int optlen)
1690{
1691	return -EOPNOTSUPP;
1692}
1693EXPORT_SYMBOL(sock_no_setsockopt);
1694
1695int sock_no_getsockopt(struct socket *sock, int level, int optname,
1696		    char __user *optval, int __user *optlen)
1697{
1698	return -EOPNOTSUPP;
1699}
1700EXPORT_SYMBOL(sock_no_getsockopt);
1701
1702int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1703		    size_t len)
1704{
1705	return -EOPNOTSUPP;
1706}
1707EXPORT_SYMBOL(sock_no_sendmsg);
1708
1709int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1710		    size_t len, int flags)
1711{
1712	return -EOPNOTSUPP;
1713}
1714EXPORT_SYMBOL(sock_no_recvmsg);
1715
1716int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1717{
1718	/* Mirror missing mmap method error code */
1719	return -ENODEV;
1720}
1721EXPORT_SYMBOL(sock_no_mmap);
1722
1723ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1724{
1725	ssize_t res;
1726	struct msghdr msg = {.msg_flags = flags};
1727	struct kvec iov;
1728	char *kaddr = kmap(page);
1729	iov.iov_base = kaddr + offset;
1730	iov.iov_len = size;
1731	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1732	kunmap(page);
1733	return res;
1734}
1735EXPORT_SYMBOL(sock_no_sendpage);
1736
1737/*
1738 *	Default Socket Callbacks
1739 */
1740
1741static void sock_def_wakeup(struct sock *sk)
1742{
1743	read_lock(&sk->sk_callback_lock);
1744	if (sk_has_sleeper(sk))
1745		wake_up_interruptible_all(sk->sk_sleep);
1746	read_unlock(&sk->sk_callback_lock);
1747}
1748
1749static void sock_def_error_report(struct sock *sk)
1750{
1751	read_lock(&sk->sk_callback_lock);
1752	if (sk_has_sleeper(sk))
1753		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1754	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1755	read_unlock(&sk->sk_callback_lock);
1756}
1757
1758static void sock_def_readable(struct sock *sk, int len)
1759{
1760	read_lock(&sk->sk_callback_lock);
1761	if (sk_has_sleeper(sk))
1762		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1763						POLLRDNORM | POLLRDBAND);
1764	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1765	read_unlock(&sk->sk_callback_lock);
1766}
1767
1768static void sock_def_write_space(struct sock *sk)
1769{
1770	read_lock(&sk->sk_callback_lock);
1771
1772	/* Do not wake up a writer until he can make "significant"
1773	 * progress.  --DaveM
1774	 */
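	/* i.e. wake writers only once at least half of sk_sndbuf is free again */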
1775	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1776		if (sk_has_sleeper(sk))
1777			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1778						POLLWRNORM | POLLWRBAND);
1779
1780		/* Should agree with poll, otherwise some programs break */
1781		if (sock_writeable(sk))
1782			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1783	}
1784
1785	read_unlock(&sk->sk_callback_lock);
1786}
1787
1788static void sock_def_destruct(struct sock *sk)
1789{
1790	kfree(sk->sk_protinfo);
1791}
1792
1793void sk_send_sigurg(struct sock *sk)
1794{
1795	if (sk->sk_socket && sk->sk_socket->file)
1796		if (send_sigurg(&sk->sk_socket->file->f_owner))
1797			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1798}
1799EXPORT_SYMBOL(sk_send_sigurg);
1800
1801void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1802		    unsigned long expires)
1803{
1804	if (!mod_timer(timer, expires))
1805		sock_hold(sk);
1806}
1807EXPORT_SYMBOL(sk_reset_timer);
1808
1809void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1810{
1811	if (timer_pending(timer) && del_timer(timer))
1812		__sock_put(sk);
1813}
1814EXPORT_SYMBOL(sk_stop_timer);
1815
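/* Initialise the generic part of a newly allocated struct sock: queues,
 * default buffer sizes from the rmem/wmem default sysctls, default
 * callbacks, TCP_CLOSE state and the initial sk_refcnt/sk_wmem_alloc
 * references, and the link to @sock when one is supplied.
 */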
1816void sock_init_data(struct socket *sock, struct sock *sk)
1817{
1818	skb_queue_head_init(&sk->sk_receive_queue);
1819	skb_queue_head_init(&sk->sk_write_queue);
1820	skb_queue_head_init(&sk->sk_error_queue);
1821#ifdef CONFIG_NET_DMA
1822	skb_queue_head_init(&sk->sk_async_wait_queue);
1823#endif
1824
1825	sk->sk_send_head	=	NULL;
1826
1827	init_timer(&sk->sk_timer);
1828
1829	sk->sk_allocation	=	GFP_KERNEL;
1830	sk->sk_rcvbuf		=	sysctl_rmem_default;
1831	sk->sk_sndbuf		=	sysctl_wmem_default;
1832	sk->sk_state		=	TCP_CLOSE;
1833	sk_set_socket(sk, sock);
1834
1835	sock_set_flag(sk, SOCK_ZAPPED);
1836
1837	if (sock) {
1838		sk->sk_type	=	sock->type;
1839		sk->sk_sleep	=	&sock->wait;
1840		sock->sk	=	sk;
1841	} else
1842		sk->sk_sleep	=	NULL;
1843
1844	rwlock_init(&sk->sk_dst_lock);
1845	rwlock_init(&sk->sk_callback_lock);
1846	lockdep_set_class_and_name(&sk->sk_callback_lock,
1847			af_callback_keys + sk->sk_family,
1848			af_family_clock_key_strings[sk->sk_family]);
1849
1850	sk->sk_state_change	=	sock_def_wakeup;
1851	sk->sk_data_ready	=	sock_def_readable;
1852	sk->sk_write_space	=	sock_def_write_space;
1853	sk->sk_error_report	=	sock_def_error_report;
1854	sk->sk_destruct		=	sock_def_destruct;
1855
1856	sk->sk_sndmsg_page	=	NULL;
1857	sk->sk_sndmsg_off	=	0;
1858
1859	sk->sk_peercred.pid 	=	0;
1860	sk->sk_peercred.uid	=	-1;
1861	sk->sk_peercred.gid	=	-1;
1862	sk->sk_write_pending	=	0;
1863	sk->sk_rcvlowat		=	1;
1864	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1865	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1866
1867	sk->sk_stamp = ktime_set(-1L, 0);
1868
1869	/*
1870	 * Before updating sk_refcnt, we must commit prior changes to memory
1871	 * (Documentation/RCU/rculist_nulls.txt for details)
1872	 */
1873	smp_wmb();
1874	atomic_set(&sk->sk_refcnt, 1);
1875	atomic_set(&sk->sk_wmem_alloc, 1);
1876	atomic_set(&sk->sk_drops, 0);
1877}
1878EXPORT_SYMBOL(sock_init_data);
1879
1880void lock_sock_nested(struct sock *sk, int subclass)
1881{
1882	might_sleep();
1883	spin_lock_bh(&sk->sk_lock.slock);
1884	if (sk->sk_lock.owned)
1885		__lock_sock(sk);
1886	sk->sk_lock.owned = 1;
1887	spin_unlock(&sk->sk_lock.slock);
1888	/*
1889	 * The sk_lock has mutex_lock() semantics here:
1890	 */
1891	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1892	local_bh_enable();
1893}
1894EXPORT_SYMBOL(lock_sock_nested);
1895
1896void release_sock(struct sock *sk)
1897{
1898	/*
1899	 * The sk_lock has mutex_unlock() semantics:
1900	 */
1901	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1902
1903	spin_lock_bh(&sk->sk_lock.slock);
1904	if (sk->sk_backlog.tail)
1905		__release_sock(sk);
1906	sk->sk_lock.owned = 0;
1907	if (waitqueue_active(&sk->sk_lock.wq))
1908		wake_up(&sk->sk_lock.wq);
1909	spin_unlock_bh(&sk->sk_lock.slock);
1910}
1911EXPORT_SYMBOL(release_sock);
1912
1913int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1914{
1915	struct timeval tv;
1916	if (!sock_flag(sk, SOCK_TIMESTAMP))
1917		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1918	tv = ktime_to_timeval(sk->sk_stamp);
1919	if (tv.tv_sec == -1)
1920		return -ENOENT;
1921	if (tv.tv_sec == 0) {
1922		sk->sk_stamp = ktime_get_real();
1923		tv = ktime_to_timeval(sk->sk_stamp);
1924	}
1925	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1926}
1927EXPORT_SYMBOL(sock_get_timestamp);
1928
1929int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1930{
1931	struct timespec ts;
1932	if (!sock_flag(sk, SOCK_TIMESTAMP))
1933		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1934	ts = ktime_to_timespec(sk->sk_stamp);
1935	if (ts.tv_sec == -1)
1936		return -ENOENT;
1937	if (ts.tv_sec == 0) {
1938		sk->sk_stamp = ktime_get_real();
1939		ts = ktime_to_timespec(sk->sk_stamp);
1940	}
1941	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1942}
1943EXPORT_SYMBOL(sock_get_timestampns);
1944
1945void sock_enable_timestamp(struct sock *sk, int flag)
1946{
1947	if (!sock_flag(sk, flag)) {
1948		sock_set_flag(sk, flag);
1949		/*
1950		 * we just set one of the two flags which require net
1951		 * time stamping, but time stamping might have been on
1952		 * already because of the other one
1953		 */
1954		if (!sock_flag(sk,
1955				flag == SOCK_TIMESTAMP ?
1956				SOCK_TIMESTAMPING_RX_SOFTWARE :
1957				SOCK_TIMESTAMP))
1958			net_enable_timestamp();
1959	}
1960}
1961
1962/*
1963 *	Get a socket option on a socket.
1964 *
1965 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1966 *	asynchronous errors should be reported by getsockopt. We assume
1967 *	this means if you specify SO_ERROR (otherwise whats the point of it).
1968 */
1969int sock_common_getsockopt(struct socket *sock, int level, int optname,
1970			   char __user *optval, int __user *optlen)
1971{
1972	struct sock *sk = sock->sk;
1973
1974	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1975}
1976EXPORT_SYMBOL(sock_common_getsockopt);
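
/*
 * Illustrative userspace sketch of the SO_ERROR convention described above
 * (e.g. collecting the asynchronous result of a non-blocking connect()):
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
 *		fprintf(stderr, "deferred socket error: %s\n", strerror(err));
 */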
1977
1978#ifdef CONFIG_COMPAT
1979int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1980				  char __user *optval, int __user *optlen)
1981{
1982	struct sock *sk = sock->sk;
1983
1984	if (sk->sk_prot->compat_getsockopt != NULL)
1985		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1986						      optval, optlen);
1987	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1988}
1989EXPORT_SYMBOL(compat_sock_common_getsockopt);
1990#endif
1991
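/*
 * Generic recvmsg helper: pass the call straight to the protocol's
 * ->recvmsg() and, on success, report how much of msg_name it filled in.
 */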
1992int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1993			struct msghdr *msg, size_t size, int flags)
1994{
1995	struct sock *sk = sock->sk;
1996	int addr_len = 0;
1997	int err;
1998
1999	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2000				   flags & ~MSG_DONTWAIT, &addr_len);
2001	if (err >= 0)
2002		msg->msg_namelen = addr_len;
2003	return err;
2004}
2005EXPORT_SYMBOL(sock_common_recvmsg);
2006
2007/*
2008 *	Set socket options on an inet socket.
2009 */
2010int sock_common_setsockopt(struct socket *sock, int level, int optname,
2011			   char __user *optval, int optlen)
2012{
2013	struct sock *sk = sock->sk;
2014
2015	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2016}
2017EXPORT_SYMBOL(sock_common_setsockopt);
2018
2019#ifdef CONFIG_COMPAT
2020int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2021				  char __user *optval, int optlen)
2022{
2023	struct sock *sk = sock->sk;
2024
2025	if (sk->sk_prot->compat_setsockopt != NULL)
2026		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2027						      optval, optlen);
2028	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2029}
2030EXPORT_SYMBOL(compat_sock_common_setsockopt);
2031#endif
2032
2033void sk_common_release(struct sock *sk)
2034{
2035	if (sk->sk_prot->destroy)
2036		sk->sk_prot->destroy(sk);
2037
2038	/*
2039	 * Observation: when sk_common_release() is called, processes no longer
2040	 * have access to the socket, but the network stack still does.
2041	 * Step one, detach it from networking:
2042	 *
2043	 * A. Remove it from the hash tables.
2044	 */
2045
2046	sk->sk_prot->unhash(sk);
2047
2048	/*
2049	 * At this point the socket can no longer receive new packets, but some
2050	 * may still be in flight: a CPU running the receive path may have done
2051	 * its hash table lookup before we unhashed the socket. Those packets
2052	 * will reach the receive queue and be purged by the socket destructor.
2053	 *
2054	 * We may also still have packets pending on the receive queue and,
2055	 * probably, our own packets waiting in device queues. sock_destroy will
2056	 * drain the receive queue, but transmitted packets will delay socket
2057	 * destruction until the last reference is released.
2058	 */
2059
2060	sock_orphan(sk);
2061
2062	xfrm_sk_free_policy(sk);
2063
2064	sk_refcnt_debug_release(sk);
2065	sock_put(sk);
2066}
2067EXPORT_SYMBOL(sk_common_release);
2068
2069static DEFINE_RWLOCK(proto_list_lock);
2070static LIST_HEAD(proto_list);
2071
2072#ifdef CONFIG_PROC_FS
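/*
 * Per-protocol "sockets in use" accounting for /proc/net/protocols.  Each
 * registered proto gets a slot (inuse_idx) in a small per-cpu array; the
 * counters are updated on the local CPU without locking and summed over all
 * possible CPUs when read, so the raw sum can be transiently negative and is
 * clamped to zero by sock_prot_inuse_get().
 */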
2073#define PROTO_INUSE_NR	64	/* should be enough for the time being */
2074struct prot_inuse {
2075	int val[PROTO_INUSE_NR];
2076};
2077
2078static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2079
2080#ifdef CONFIG_NET_NS
2081void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2082{
2083	int cpu = smp_processor_id();
2084	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2085}
2086EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2087
2088int sock_prot_inuse_get(struct net *net, struct proto *prot)
2089{
2090	int cpu, idx = prot->inuse_idx;
2091	int res = 0;
2092
2093	for_each_possible_cpu(cpu)
2094		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2095
2096	return res >= 0 ? res : 0;
2097}
2098EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2099
2100static int sock_inuse_init_net(struct net *net)
2101{
2102	net->core.inuse = alloc_percpu(struct prot_inuse);
2103	return net->core.inuse ? 0 : -ENOMEM;
2104}
2105
2106static void sock_inuse_exit_net(struct net *net)
2107{
2108	free_percpu(net->core.inuse);
2109}
2110
2111static struct pernet_operations net_inuse_ops = {
2112	.init = sock_inuse_init_net,
2113	.exit = sock_inuse_exit_net,
2114};
2115
2116static __init int net_inuse_init(void)
2117{
2118	if (register_pernet_subsys(&net_inuse_ops))
2119		panic("Cannot initialize net inuse counters");
2120
2121	return 0;
2122}
2123
2124core_initcall(net_inuse_init);
2125#else
2126static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2127
2128void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2129{
2130	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2131}
2132EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2133
2134int sock_prot_inuse_get(struct net *net, struct proto *prot)
2135{
2136	int cpu, idx = prot->inuse_idx;
2137	int res = 0;
2138
2139	for_each_possible_cpu(cpu)
2140		res += per_cpu(prot_inuse, cpu).val[idx];
2141
2142	return res >= 0 ? res : 0;
2143}
2144EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2145#endif
2146
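/*
 * The last bitmap slot (PROTO_INUSE_NR - 1) is kept as a shared overflow
 * index: once the bitmap is exhausted, find_first_zero_bit() returns it for
 * every further protocol, it is never marked as used, and release_proto_idx()
 * correspondingly never clears it.
 */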
2147static void assign_proto_idx(struct proto *prot)
2148{
2149	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2150
2151	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2152		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2153		return;
2154	}
2155
2156	set_bit(prot->inuse_idx, proto_inuse_idx);
2157}
2158
2159static void release_proto_idx(struct proto *prot)
2160{
2161	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2162		clear_bit(prot->inuse_idx, proto_inuse_idx);
2163}
2164#else
2165static inline void assign_proto_idx(struct proto *prot)
2166{
2167}
2168
2169static inline void release_proto_idx(struct proto *prot)
2170{
2171}
2172#endif
2173
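/*
 * proto_register - make a transport protocol known to the socket layer.
 * When @alloc_slab is set, dedicated kmem caches are created for the socket
 * structure and, where the protocol provides them, for its request_sock and
 * timewait_sock variants; any failure unwinds the caches already created and
 * returns -ENOBUFS.  On success the proto is added to proto_list and assigned
 * an inuse_idx slot.
 */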
2174int proto_register(struct proto *prot, int alloc_slab)
2175{
2176	if (alloc_slab) {
2177		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2178					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2179					NULL);
2180
2181		if (prot->slab == NULL) {
2182			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2183			       prot->name);
2184			goto out;
2185		}
2186
2187		if (prot->rsk_prot != NULL) {
2188			static const char mask[] = "request_sock_%s";
2189
2190			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2191			if (prot->rsk_prot->slab_name == NULL)
2192				goto out_free_sock_slab;
2193
2194			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2195			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2196								 prot->rsk_prot->obj_size, 0,
2197								 SLAB_HWCACHE_ALIGN, NULL);
2198
2199			if (prot->rsk_prot->slab == NULL) {
2200				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2201				       prot->name);
2202				goto out_free_request_sock_slab_name;
2203			}
2204		}
2205
2206		if (prot->twsk_prot != NULL) {
2207			static const char mask[] = "tw_sock_%s";
2208
2209			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2210
2211			if (prot->twsk_prot->twsk_slab_name == NULL)
2212				goto out_free_request_sock_slab;
2213
2214			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2215			prot->twsk_prot->twsk_slab =
2216				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2217						  prot->twsk_prot->twsk_obj_size,
2218						  0,
2219						  SLAB_HWCACHE_ALIGN |
2220							prot->slab_flags,
2221						  NULL);
2222			if (prot->twsk_prot->twsk_slab == NULL)
2223				goto out_free_timewait_sock_slab_name;
2224		}
2225	}
2226
2227	write_lock(&proto_list_lock);
2228	list_add(&prot->node, &proto_list);
2229	assign_proto_idx(prot);
2230	write_unlock(&proto_list_lock);
2231	return 0;
2232
2233out_free_timewait_sock_slab_name:
2234	kfree(prot->twsk_prot->twsk_slab_name);
2235out_free_request_sock_slab:
2236	if (prot->rsk_prot && prot->rsk_prot->slab) {
2237		kmem_cache_destroy(prot->rsk_prot->slab);
2238		prot->rsk_prot->slab = NULL;
2239	}
2240out_free_request_sock_slab_name:
2241	kfree(prot->rsk_prot->slab_name);
2242out_free_sock_slab:
2243	kmem_cache_destroy(prot->slab);
2244	prot->slab = NULL;
2245out:
2246	return -ENOBUFS;
2247}
2248EXPORT_SYMBOL(proto_register);
2249
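/*
 * proto_unregister - undo proto_register().  The caller must ensure that no
 * sockets of this protocol remain, since the slab caches created at
 * registration time are destroyed here.
 */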
2250void proto_unregister(struct proto *prot)
2251{
2252	write_lock(&proto_list_lock);
2253	release_proto_idx(prot);
2254	list_del(&prot->node);
2255	write_unlock(&proto_list_lock);
2256
2257	if (prot->slab != NULL) {
2258		kmem_cache_destroy(prot->slab);
2259		prot->slab = NULL;
2260	}
2261
2262	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2263		kmem_cache_destroy(prot->rsk_prot->slab);
2264		kfree(prot->rsk_prot->slab_name);
2265		prot->rsk_prot->slab = NULL;
2266	}
2267
2268	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2269		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2270		kfree(prot->twsk_prot->twsk_slab_name);
2271		prot->twsk_prot->twsk_slab = NULL;
2272	}
2273}
2274EXPORT_SYMBOL(proto_unregister);
2275
2276#ifdef CONFIG_PROC_FS
2277static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2278	__acquires(proto_list_lock)
2279{
2280	read_lock(&proto_list_lock);
2281	return seq_list_start_head(&proto_list, *pos);
2282}
2283
2284static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2285{
2286	return seq_list_next(v, &proto_list, pos);
2287}
2288
2289static void proto_seq_stop(struct seq_file *seq, void *v)
2290	__releases(proto_list_lock)
2291{
2292	read_unlock(&proto_list_lock);
2293}
2294
2295static char proto_method_implemented(const void *method)
2296{
2297	return method == NULL ? 'n' : 'y';
2298}
2299
2300static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2301{
2302	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2303			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2304		   proto->name,
2305		   proto->obj_size,
2306		   sock_prot_inuse_get(seq_file_net(seq), proto),
2307		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2308		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2309		   proto->max_header,
2310		   proto->slab == NULL ? "no" : "yes",
2311		   module_name(proto->owner),
2312		   proto_method_implemented(proto->close),
2313		   proto_method_implemented(proto->connect),
2314		   proto_method_implemented(proto->disconnect),
2315		   proto_method_implemented(proto->accept),
2316		   proto_method_implemented(proto->ioctl),
2317		   proto_method_implemented(proto->init),
2318		   proto_method_implemented(proto->destroy),
2319		   proto_method_implemented(proto->shutdown),
2320		   proto_method_implemented(proto->setsockopt),
2321		   proto_method_implemented(proto->getsockopt),
2322		   proto_method_implemented(proto->sendmsg),
2323		   proto_method_implemented(proto->recvmsg),
2324		   proto_method_implemented(proto->sendpage),
2325		   proto_method_implemented(proto->bind),
2326		   proto_method_implemented(proto->backlog_rcv),
2327		   proto_method_implemented(proto->hash),
2328		   proto_method_implemented(proto->unhash),
2329		   proto_method_implemented(proto->get_port),
2330		   proto_method_implemented(proto->enter_memory_pressure));
2331}
2332
2333static int proto_seq_show(struct seq_file *seq, void *v)
2334{
2335	if (v == &proto_list)
2336		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2337			   "protocol",
2338			   "size",
2339			   "sockets",
2340			   "memory",
2341			   "press",
2342			   "maxhdr",
2343			   "slab",
2344			   "module",
2345			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2346	else
2347		proto_seq_printf(seq, list_entry(v, struct proto, node));
2348	return 0;
2349}
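
/*
 * Illustrative shape of /proc/net/protocols output (values vary by system;
 * the trailing y/n columns mirror proto_method_implemented() in the order of
 * the header printed by proto_seq_show() above):
 *
 * protocol  size sockets  memory press maxhdr  slab module     cl co di ...
 * TCP        ...     ...     ...   no     ...   yes  kernel      y  y  y ...
 */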
2350
2351static const struct seq_operations proto_seq_ops = {
2352	.start  = proto_seq_start,
2353	.next   = proto_seq_next,
2354	.stop   = proto_seq_stop,
2355	.show   = proto_seq_show,
2356};
2357
2358static int proto_seq_open(struct inode *inode, struct file *file)
2359{
2360	return seq_open_net(inode, file, &proto_seq_ops,
2361			    sizeof(struct seq_net_private));
2362}
2363
2364static const struct file_operations proto_seq_fops = {
2365	.owner		= THIS_MODULE,
2366	.open		= proto_seq_open,
2367	.read		= seq_read,
2368	.llseek		= seq_lseek,
2369	.release	= seq_release_net,
2370};
2371
2372static __net_init int proto_init_net(struct net *net)
2373{
2374	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2375		return -ENOMEM;
2376
2377	return 0;
2378}
2379
2380static __net_exit void proto_exit_net(struct net *net)
2381{
2382	proc_net_remove(net, "protocols");
2383}
2384
2385
2386static __net_initdata struct pernet_operations proto_net_ops = {
2387	.init = proto_init_net,
2388	.exit = proto_exit_net,
2389};
2390
2391static int __init proto_init(void)
2392{
2393	return register_pernet_subsys(&proto_net_ops);
2394}
2395
2396subsys_initcall(proto_init);
2397
2398#endif /* PROC_FS */
2399