sock.c revision 49c794e94649020248e37b78db16cd25bad38b4f
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
120#include <net/net_namespace.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <linux/net_tstamp.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140/*
141 * Make lock validator output more readable. (We pre-construct these
142 * strings at build time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *const af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158  "sk_lock-AF_IEEE802154",
159  "sk_lock-AF_MAX"
160};
161static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174  "slock-AF_IEEE802154",
175  "slock-AF_MAX"
176};
177static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190  "clock-AF_IEEE802154",
191  "clock-AF_MAX"
192};
193
194/*
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
197 */
198static struct lock_class_key af_callback_keys[AF_MAX];
199
200/* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms.  This makes socket queueing behavior and performance
203 * not depend upon such differences.
204 */
205#define _SK_MEM_PACKETS		256
206#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
207#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
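/*
 * A worked example of the defaults above, assuming sizeof(struct sk_buff)
 * is roughly 240 bytes on a 64-bit build (the exact size varies by
 * configuration): _SK_MEM_OVERHEAD is about 240 + 256 = 496 bytes, so
 * SK_WMEM_MAX and SK_RMEM_MAX come out to roughly 496 * 256 = 126976
 * bytes (~124 KiB) of default send/receive buffer space per socket.
 */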
209
210/* Run time adjustable parameters. */
211__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216/* Maximum space eaten by an iovec or ancillary data, plus some space */
217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218EXPORT_SYMBOL(sysctl_optmem_max);
219
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{
222	struct timeval tv;
223
224	if (optlen < sizeof(tv))
225		return -EINVAL;
226	if (copy_from_user(&tv, optval, sizeof(tv)))
227		return -EFAULT;
228	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229		return -EDOM;
230
231	if (tv.tv_sec < 0) {
232		static int warned __read_mostly;
233
234		*timeo_p = 0;
235		if (warned < 10 && net_ratelimit()) {
236			warned++;
237			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238			       "tries to set negative timeout\n",
239				current->comm, task_pid_nr(current));
240		}
241		return 0;
242	}
243	*timeo_p = MAX_SCHEDULE_TIMEOUT;
244	if (tv.tv_sec == 0 && tv.tv_usec == 0)
245		return 0;
246	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248	return 0;
249}
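/*
 * A worked example of the conversion above, assuming HZ == 100: a timeval
 * of { .tv_sec = 2, .tv_usec = 500000 } yields
 * 2 * 100 + (500000 + 9999) / 10000 = 250 jiffies, i.e. fractional ticks
 * are rounded up, and a zero timeval selects MAX_SCHEDULE_TIMEOUT
 * (block forever).
 */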
250
251static void sock_warn_obsolete_bsdism(const char *name)
252{
253	static int warned;
254	static char warncomm[TASK_COMM_LEN];
255	if (strcmp(warncomm, current->comm) && warned < 5) {
256		strcpy(warncomm,  current->comm);
257		printk(KERN_WARNING "process `%s' is using obsolete "
258		       "%s SO_BSDCOMPAT\n", warncomm, name);
259		warned++;
260	}
261}
262
263static void sock_disable_timestamp(struct sock *sk, int flag)
264{
265	if (sock_flag(sk, flag)) {
266		sock_reset_flag(sk, flag);
267		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269			net_disable_timestamp();
270		}
271	}
272}
273
274
275int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276{
277	int err = 0;
278	int skb_len;
279
280	/* Cast sk->rcvbuf to unsigned... It's pointless, but it reduces the
281	   number of warnings when compiling with -W --ANK
282	 */
283	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
284	    (unsigned)sk->sk_rcvbuf) {
285		err = -ENOMEM;
286		goto out;
287	}
288
289	err = sk_filter(sk, skb);
290	if (err)
291		goto out;
292
293	if (!sk_rmem_schedule(sk, skb->truesize)) {
294		err = -ENOBUFS;
295		goto out;
296	}
297
298	skb->dev = NULL;
299	skb_set_owner_r(skb, sk);
300
301	/* Cache the SKB length before we tack it onto the receive
302	 * queue.  Once it is added it no longer belongs to us and
303	 * may be freed by other threads of control pulling packets
304	 * from the queue.
305	 */
306	skb_len = skb->len;
307
308	skb_queue_tail(&sk->sk_receive_queue, skb);
309
310	if (!sock_flag(sk, SOCK_DEAD))
311		sk->sk_data_ready(sk, skb_len);
312out:
313	return err;
314}
315EXPORT_SYMBOL(sock_queue_rcv_skb);
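/*
 * A minimal sketch of how a datagram protocol's receive path might use
 * sock_queue_rcv_skb(); example_deliver is a hypothetical helper, and a
 * real protocol (UDP, for instance) adds its own statistics and error
 * accounting.  Note that on failure the caller still owns the skb and
 * must free it.
 */
static int example_deliver(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0) {
		/* rcvbuf full, filter drop or memory pressure; skb is still ours */
		kfree_skb(skb);
	}
	return err;
}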
316
317int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
318{
319	int rc = NET_RX_SUCCESS;
320
321	if (sk_filter(sk, skb))
322		goto discard_and_relse;
323
324	skb->dev = NULL;
325
326	if (nested)
327		bh_lock_sock_nested(sk);
328	else
329		bh_lock_sock(sk);
330	if (!sock_owned_by_user(sk)) {
331		/*
332		 * trylock + unlock semantics:
333		 */
334		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
335
336		rc = sk_backlog_rcv(sk, skb);
337
338		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
339	} else
340		sk_add_backlog(sk, skb);
341	bh_unlock_sock(sk);
342out:
343	sock_put(sk);
344	return rc;
345discard_and_relse:
346	kfree_skb(skb);
347	goto out;
348}
349EXPORT_SYMBOL(sk_receive_skb);
350
351struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
352{
353	struct dst_entry *dst = sk->sk_dst_cache;
354
355	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356		sk->sk_dst_cache = NULL;
357		dst_release(dst);
358		return NULL;
359	}
360
361	return dst;
362}
363EXPORT_SYMBOL(__sk_dst_check);
364
365struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
366{
367	struct dst_entry *dst = sk_dst_get(sk);
368
369	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370		sk_dst_reset(sk);
371		dst_release(dst);
372		return NULL;
373	}
374
375	return dst;
376}
377EXPORT_SYMBOL(sk_dst_check);
378
379static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
380{
381	int ret = -ENOPROTOOPT;
382#ifdef CONFIG_NETDEVICES
383	struct net *net = sock_net(sk);
384	char devname[IFNAMSIZ];
385	int index;
386
387	/* Sorry... */
388	ret = -EPERM;
389	if (!capable(CAP_NET_RAW))
390		goto out;
391
392	ret = -EINVAL;
393	if (optlen < 0)
394		goto out;
395
396	/* Bind this socket to a particular device like "eth0",
397	 * as specified in the passed interface name. If the
398	 * name is "" or the option length is zero the socket
399	 * is not bound.
400	 */
401	if (optlen > IFNAMSIZ - 1)
402		optlen = IFNAMSIZ - 1;
403	memset(devname, 0, sizeof(devname));
404
405	ret = -EFAULT;
406	if (copy_from_user(devname, optval, optlen))
407		goto out;
408
409	if (devname[0] == '\0') {
410		index = 0;
411	} else {
412		struct net_device *dev = dev_get_by_name(net, devname);
413
414		ret = -ENODEV;
415		if (!dev)
416			goto out;
417
418		index = dev->ifindex;
419		dev_put(dev);
420	}
421
422	lock_sock(sk);
423	sk->sk_bound_dev_if = index;
424	sk_dst_reset(sk);
425	release_sock(sk);
426
427	ret = 0;
428
429out:
430#endif
431
432	return ret;
433}
434
435static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
436{
437	if (valbool)
438		sock_set_flag(sk, bit);
439	else
440		sock_reset_flag(sk, bit);
441}
442
443/*
444 *	This is meant for all protocols to use and covers goings on
445 *	at the socket level. Everything here is generic.
446 */
447
448int sock_setsockopt(struct socket *sock, int level, int optname,
449		    char __user *optval, int optlen)
450{
451	struct sock *sk = sock->sk;
452	int val;
453	int valbool;
454	struct linger ling;
455	int ret = 0;
456
457	/*
458	 *	Options without arguments
459	 */
460
461	if (optname == SO_BINDTODEVICE)
462		return sock_bindtodevice(sk, optval, optlen);
463
464	if (optlen < sizeof(int))
465		return -EINVAL;
466
467	if (get_user(val, (int __user *)optval))
468		return -EFAULT;
469
470	valbool = val ? 1 : 0;
471
472	lock_sock(sk);
473
474	switch (optname) {
475	case SO_DEBUG:
476		if (val && !capable(CAP_NET_ADMIN))
477			ret = -EACCES;
478		else
479			sock_valbool_flag(sk, SOCK_DBG, valbool);
480		break;
481	case SO_REUSEADDR:
482		sk->sk_reuse = valbool;
483		break;
484	case SO_TYPE:
485	case SO_PROTOCOL:
486	case SO_ERROR:
487		ret = -ENOPROTOOPT;
488		break;
489	case SO_DONTROUTE:
490		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
491		break;
492	case SO_BROADCAST:
493		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
494		break;
495	case SO_SNDBUF:
496		/* Don't error on this; BSD doesn't, and if you think
497		   about it this is right. Otherwise apps have to
498		   play 'guess the biggest size' games. RCVBUF/SNDBUF
499		   are treated in BSD as hints */
500
501		if (val > sysctl_wmem_max)
502			val = sysctl_wmem_max;
503set_sndbuf:
504		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
505		if ((val * 2) < SOCK_MIN_SNDBUF)
506			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
507		else
508			sk->sk_sndbuf = val * 2;
509
510		/*
511		 *	Wake up sending tasks if we
512		 *	upped the value.
513		 */
514		sk->sk_write_space(sk);
515		break;
516
517	case SO_SNDBUFFORCE:
518		if (!capable(CAP_NET_ADMIN)) {
519			ret = -EPERM;
520			break;
521		}
522		goto set_sndbuf;
523
524	case SO_RCVBUF:
525		/* Don't error on this; BSD doesn't, and if you think
526		   about it this is right. Otherwise apps have to
527		   play 'guess the biggest size' games. RCVBUF/SNDBUF
528		   are treated in BSD as hints */
529
530		if (val > sysctl_rmem_max)
531			val = sysctl_rmem_max;
532set_rcvbuf:
533		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
534		/*
535		 * We double it on the way in to account for
536		 * "struct sk_buff" etc. overhead.   Applications
537		 * assume that the SO_RCVBUF setting they make will
538		 * allow that much actual data to be received on that
539		 * socket.
540		 *
541		 * Applications are unaware that "struct sk_buff" and
542		 * other overheads allocate from the receive buffer
543		 * during socket buffer allocation.
544		 *
545		 * And after considering the possible alternatives,
546		 * returning the value we actually used in getsockopt
547		 * is the most desirable behavior.
548		 */
549		if ((val * 2) < SOCK_MIN_RCVBUF)
550			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
551		else
552			sk->sk_rcvbuf = val * 2;
553		break;
554
555	case SO_RCVBUFFORCE:
556		if (!capable(CAP_NET_ADMIN)) {
557			ret = -EPERM;
558			break;
559		}
560		goto set_rcvbuf;
561
562	case SO_KEEPALIVE:
563#ifdef CONFIG_INET
564		if (sk->sk_protocol == IPPROTO_TCP)
565			tcp_set_keepalive(sk, valbool);
566#endif
567		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
568		break;
569
570	case SO_OOBINLINE:
571		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
572		break;
573
574	case SO_NO_CHECK:
575		sk->sk_no_check = valbool;
576		break;
577
578	case SO_PRIORITY:
579		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
580			sk->sk_priority = val;
581		else
582			ret = -EPERM;
583		break;
584
585	case SO_LINGER:
586		if (optlen < sizeof(ling)) {
587			ret = -EINVAL;	/* 1003.1g */
588			break;
589		}
590		if (copy_from_user(&ling, optval, sizeof(ling))) {
591			ret = -EFAULT;
592			break;
593		}
594		if (!ling.l_onoff)
595			sock_reset_flag(sk, SOCK_LINGER);
596		else {
597#if (BITS_PER_LONG == 32)
598			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
599				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
600			else
601#endif
602				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
603			sock_set_flag(sk, SOCK_LINGER);
604		}
605		break;
606
607	case SO_BSDCOMPAT:
608		sock_warn_obsolete_bsdism("setsockopt");
609		break;
610
611	case SO_PASSCRED:
612		if (valbool)
613			set_bit(SOCK_PASSCRED, &sock->flags);
614		else
615			clear_bit(SOCK_PASSCRED, &sock->flags);
616		break;
617
618	case SO_TIMESTAMP:
619	case SO_TIMESTAMPNS:
620		if (valbool)  {
621			if (optname == SO_TIMESTAMP)
622				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
623			else
624				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
625			sock_set_flag(sk, SOCK_RCVTSTAMP);
626			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
627		} else {
628			sock_reset_flag(sk, SOCK_RCVTSTAMP);
629			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
630		}
631		break;
632
633	case SO_TIMESTAMPING:
634		if (val & ~SOF_TIMESTAMPING_MASK) {
635			ret = -EINVAL;
636			break;
637		}
638		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
639				  val & SOF_TIMESTAMPING_TX_HARDWARE);
640		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
641				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
642		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
643				  val & SOF_TIMESTAMPING_RX_HARDWARE);
644		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
645			sock_enable_timestamp(sk,
646					      SOCK_TIMESTAMPING_RX_SOFTWARE);
647		else
648			sock_disable_timestamp(sk,
649					       SOCK_TIMESTAMPING_RX_SOFTWARE);
650		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
651				  val & SOF_TIMESTAMPING_SOFTWARE);
652		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
653				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
654		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
655				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
656		break;
657
658	case SO_RCVLOWAT:
659		if (val < 0)
660			val = INT_MAX;
661		sk->sk_rcvlowat = val ? : 1;
662		break;
663
664	case SO_RCVTIMEO:
665		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
666		break;
667
668	case SO_SNDTIMEO:
669		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
670		break;
671
672	case SO_ATTACH_FILTER:
673		ret = -EINVAL;
674		if (optlen == sizeof(struct sock_fprog)) {
675			struct sock_fprog fprog;
676
677			ret = -EFAULT;
678			if (copy_from_user(&fprog, optval, sizeof(fprog)))
679				break;
680
681			ret = sk_attach_filter(&fprog, sk);
682		}
683		break;
684
685	case SO_DETACH_FILTER:
686		ret = sk_detach_filter(sk);
687		break;
688
689	case SO_PASSSEC:
690		if (valbool)
691			set_bit(SOCK_PASSSEC, &sock->flags);
692		else
693			clear_bit(SOCK_PASSSEC, &sock->flags);
694		break;
695	case SO_MARK:
696		if (!capable(CAP_NET_ADMIN))
697			ret = -EPERM;
698		else
699			sk->sk_mark = val;
700		break;
701
702		/* We implement SO_SNDLOWAT etc. as not
703		   settable (1003.1g 5.3) */
704	default:
705		ret = -ENOPROTOOPT;
706		break;
707	}
708	release_sock(sk);
709	return ret;
710}
711EXPORT_SYMBOL(sock_setsockopt);
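/*
 * A minimal user-space sketch (not kernel code) of the SO_RCVBUF
 * behaviour described above: the kernel stores roughly double the
 * requested value to cover struct sk_buff overhead, and getsockopt()
 * reports the value actually in use.  The printed number is what a
 * typical system shows when the request is below rmem_max.
 */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int req = 65536, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	printf("requested %d, kernel uses %d\n", req, got); /* typically 131072 */
	return 0;
}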
712
713
714int sock_getsockopt(struct socket *sock, int level, int optname,
715		    char __user *optval, int __user *optlen)
716{
717	struct sock *sk = sock->sk;
718
719	union {
720		int val;
721		struct linger ling;
722		struct timeval tm;
723	} v;
724
725	unsigned int lv = sizeof(int);
726	int len;
727
728	if (get_user(len, optlen))
729		return -EFAULT;
730	if (len < 0)
731		return -EINVAL;
732
733	memset(&v, 0, sizeof(v));
734
735	switch (optname) {
736	case SO_DEBUG:
737		v.val = sock_flag(sk, SOCK_DBG);
738		break;
739
740	case SO_DONTROUTE:
741		v.val = sock_flag(sk, SOCK_LOCALROUTE);
742		break;
743
744	case SO_BROADCAST:
745		v.val = !!sock_flag(sk, SOCK_BROADCAST);
746		break;
747
748	case SO_SNDBUF:
749		v.val = sk->sk_sndbuf;
750		break;
751
752	case SO_RCVBUF:
753		v.val = sk->sk_rcvbuf;
754		break;
755
756	case SO_REUSEADDR:
757		v.val = sk->sk_reuse;
758		break;
759
760	case SO_KEEPALIVE:
761		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
762		break;
763
764	case SO_TYPE:
765		v.val = sk->sk_type;
766		break;
767
768	case SO_PROTOCOL:
769		v.val = sk->sk_protocol;
770		break;
771
772	case SO_ERROR:
773		v.val = -sock_error(sk);
774		if (v.val == 0)
775			v.val = xchg(&sk->sk_err_soft, 0);
776		break;
777
778	case SO_OOBINLINE:
779		v.val = !!sock_flag(sk, SOCK_URGINLINE);
780		break;
781
782	case SO_NO_CHECK:
783		v.val = sk->sk_no_check;
784		break;
785
786	case SO_PRIORITY:
787		v.val = sk->sk_priority;
788		break;
789
790	case SO_LINGER:
791		lv		= sizeof(v.ling);
792		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
793		v.ling.l_linger	= sk->sk_lingertime / HZ;
794		break;
795
796	case SO_BSDCOMPAT:
797		sock_warn_obsolete_bsdism("getsockopt");
798		break;
799
800	case SO_TIMESTAMP:
801		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
802				!sock_flag(sk, SOCK_RCVTSTAMPNS);
803		break;
804
805	case SO_TIMESTAMPNS:
806		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
807		break;
808
809	case SO_TIMESTAMPING:
810		v.val = 0;
811		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
812			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
813		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
814			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
815		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
816			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
817		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
818			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
819		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
820			v.val |= SOF_TIMESTAMPING_SOFTWARE;
821		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
822			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
823		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
824			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
825		break;
826
827	case SO_RCVTIMEO:
828		lv = sizeof(struct timeval);
829		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
830			v.tm.tv_sec = 0;
831			v.tm.tv_usec = 0;
832		} else {
833			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
834			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
835		}
836		break;
837
838	case SO_SNDTIMEO:
839		lv = sizeof(struct timeval);
840		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
841			v.tm.tv_sec = 0;
842			v.tm.tv_usec = 0;
843		} else {
844			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
845			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
846		}
847		break;
848
849	case SO_RCVLOWAT:
850		v.val = sk->sk_rcvlowat;
851		break;
852
853	case SO_SNDLOWAT:
854		v.val = 1;
855		break;
856
857	case SO_PASSCRED:
858		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
859		break;
860
861	case SO_PEERCRED:
862		if (len > sizeof(sk->sk_peercred))
863			len = sizeof(sk->sk_peercred);
864		if (copy_to_user(optval, &sk->sk_peercred, len))
865			return -EFAULT;
866		goto lenout;
867
868	case SO_PEERNAME:
869	{
870		char address[128];
871
872		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
873			return -ENOTCONN;
874		if (lv < len)
875			return -EINVAL;
876		if (copy_to_user(optval, address, len))
877			return -EFAULT;
878		goto lenout;
879	}
880
881	/* Dubious BSD thing... Probably nobody even uses it, but
882	 * the UNIX standard wants it for whatever reason... -DaveM
883	 */
884	case SO_ACCEPTCONN:
885		v.val = sk->sk_state == TCP_LISTEN;
886		break;
887
888	case SO_PASSSEC:
889		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
890		break;
891
892	case SO_PEERSEC:
893		return security_socket_getpeersec_stream(sock, optval, optlen, len);
894
895	case SO_MARK:
896		v.val = sk->sk_mark;
897		break;
898
899	default:
900		return -ENOPROTOOPT;
901	}
902
903	if (len > lv)
904		len = lv;
905	if (copy_to_user(optval, &v, len))
906		return -EFAULT;
907lenout:
908	if (put_user(len, optlen))
909		return -EFAULT;
910	return 0;
911}
912
913/*
914 * Initialize an sk_lock.
915 *
916 * (We also register the sk_lock with the lock validator.)
917 */
918static inline void sock_lock_init(struct sock *sk)
919{
920	sock_lock_init_class_and_name(sk,
921			af_family_slock_key_strings[sk->sk_family],
922			af_family_slock_keys + sk->sk_family,
923			af_family_key_strings[sk->sk_family],
924			af_family_keys + sk->sk_family);
925}
926
927/*
928 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
929 * even temporarily, because of RCU lookups. sk_node should also be left as is.
930 */
931static void sock_copy(struct sock *nsk, const struct sock *osk)
932{
933#ifdef CONFIG_SECURITY_NETWORK
934	void *sptr = nsk->sk_security;
935#endif
936	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
937		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
938	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
939	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
940#ifdef CONFIG_SECURITY_NETWORK
941	nsk->sk_security = sptr;
942	security_sk_clone(osk, nsk);
943#endif
944}
945
946static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
947		int family)
948{
949	struct sock *sk;
950	struct kmem_cache *slab;
951
952	slab = prot->slab;
953	if (slab != NULL) {
954		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
955		if (!sk)
956			return sk;
957		if (priority & __GFP_ZERO) {
958			/*
959			 * caches using SLAB_DESTROY_BY_RCU should leave
960			 * sk_node.next unmodified. Special care is taken
961			 * when initializing the object to zero.
962			 */
963			if (offsetof(struct sock, sk_node.next) != 0)
964				memset(sk, 0, offsetof(struct sock, sk_node.next));
965			memset(&sk->sk_node.pprev, 0,
966			       prot->obj_size - offsetof(struct sock,
967							 sk_node.pprev));
968		}
969	}
970	else
971		sk = kmalloc(prot->obj_size, priority);
972
973	if (sk != NULL) {
974		kmemcheck_annotate_bitfield(sk, flags);
975
976		if (security_sk_alloc(sk, family, priority))
977			goto out_free;
978
979		if (!try_module_get(prot->owner))
980			goto out_free_sec;
981	}
982
983	return sk;
984
985out_free_sec:
986	security_sk_free(sk);
987out_free:
988	if (slab != NULL)
989		kmem_cache_free(slab, sk);
990	else
991		kfree(sk);
992	return NULL;
993}
994
995static void sk_prot_free(struct proto *prot, struct sock *sk)
996{
997	struct kmem_cache *slab;
998	struct module *owner;
999
1000	owner = prot->owner;
1001	slab = prot->slab;
1002
1003	security_sk_free(sk);
1004	if (slab != NULL)
1005		kmem_cache_free(slab, sk);
1006	else
1007		kfree(sk);
1008	module_put(owner);
1009}
1010
1011/**
1012 *	sk_alloc - All socket objects are allocated here
1013 *	@net: the applicable net namespace
1014 *	@family: protocol family
1015 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1016 *	@prot: struct proto associated with this new sock instance
1017 */
1018struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1019		      struct proto *prot)
1020{
1021	struct sock *sk;
1022
1023	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1024	if (sk) {
1025		sk->sk_family = family;
1026		/*
1027		 * See comment in struct sock definition to understand
1028		 * why we need sk_prot_creator -acme
1029		 */
1030		sk->sk_prot = sk->sk_prot_creator = prot;
1031		sock_lock_init(sk);
1032		sock_net_set(sk, get_net(net));
1033	}
1034
1035	return sk;
1036}
1037EXPORT_SYMBOL(sk_alloc);
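/*
 * A minimal sketch of how a protocol family might combine sk_alloc() with
 * sock_init_data() (defined later in this file) when creating a socket.
 * example_proto and example_create are hypothetical; a real family (see
 * inet_create()) registers its proto with proto_register() and does
 * considerably more setup.
 */
static struct proto example_proto;	/* hypothetical */

static int example_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto);

	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* queues, callbacks, default buffers */
	sk->sk_protocol = protocol;
	return 0;
}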
1038
1039static void __sk_free(struct sock *sk)
1040{
1041	struct sk_filter *filter;
1042
1043	if (sk->sk_destruct)
1044		sk->sk_destruct(sk);
1045
1046	filter = rcu_dereference(sk->sk_filter);
1047	if (filter) {
1048		sk_filter_uncharge(sk, filter);
1049		rcu_assign_pointer(sk->sk_filter, NULL);
1050	}
1051
1052	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1053	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1054
1055	if (atomic_read(&sk->sk_omem_alloc))
1056		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1057		       __func__, atomic_read(&sk->sk_omem_alloc));
1058
1059	put_net(sock_net(sk));
1060	sk_prot_free(sk->sk_prot_creator, sk);
1061}
1062
1063void sk_free(struct sock *sk)
1064{
1065	/*
1066	 * We subtract one from sk_wmem_alloc and can tell whether
1067	 * some packets are still in some tx queue.
1068	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1069	 */
1070	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1071		__sk_free(sk);
1072}
1073EXPORT_SYMBOL(sk_free);
1074
1075/*
1076 * The last sock_put should drop the reference to sk->sk_net. It has already
1077 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1078 * is not an option.
1079 * Take a reference to the socket to remove it from the hash while it is still
1080 * _alive_, and after that destroy it in the context of init_net.
1081 */
1082void sk_release_kernel(struct sock *sk)
1083{
1084	if (sk == NULL || sk->sk_socket == NULL)
1085		return;
1086
1087	sock_hold(sk);
1088	sock_release(sk->sk_socket);
1089	release_net(sock_net(sk));
1090	sock_net_set(sk, get_net(&init_net));
1091	sock_put(sk);
1092}
1093EXPORT_SYMBOL(sk_release_kernel);
1094
1095struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1096{
1097	struct sock *newsk;
1098
1099	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1100	if (newsk != NULL) {
1101		struct sk_filter *filter;
1102
1103		sock_copy(newsk, sk);
1104
1105		/* SANITY */
1106		get_net(sock_net(newsk));
1107		sk_node_init(&newsk->sk_node);
1108		sock_lock_init(newsk);
1109		bh_lock_sock(newsk);
1110		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1111
1112		atomic_set(&newsk->sk_rmem_alloc, 0);
1113		/*
1114		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1115		 */
1116		atomic_set(&newsk->sk_wmem_alloc, 1);
1117		atomic_set(&newsk->sk_omem_alloc, 0);
1118		skb_queue_head_init(&newsk->sk_receive_queue);
1119		skb_queue_head_init(&newsk->sk_write_queue);
1120#ifdef CONFIG_NET_DMA
1121		skb_queue_head_init(&newsk->sk_async_wait_queue);
1122#endif
1123
1124		rwlock_init(&newsk->sk_dst_lock);
1125		rwlock_init(&newsk->sk_callback_lock);
1126		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1127				af_callback_keys + newsk->sk_family,
1128				af_family_clock_key_strings[newsk->sk_family]);
1129
1130		newsk->sk_dst_cache	= NULL;
1131		newsk->sk_wmem_queued	= 0;
1132		newsk->sk_forward_alloc = 0;
1133		newsk->sk_send_head	= NULL;
1134		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1135
1136		sock_reset_flag(newsk, SOCK_DONE);
1137		skb_queue_head_init(&newsk->sk_error_queue);
1138
1139		filter = newsk->sk_filter;
1140		if (filter != NULL)
1141			sk_filter_charge(newsk, filter);
1142
1143		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1144			/* It is still a raw copy of the parent, so invalidate
1145			 * the destructor and do a plain sk_free() */
1146			newsk->sk_destruct = NULL;
1147			sk_free(newsk);
1148			newsk = NULL;
1149			goto out;
1150		}
1151
1152		newsk->sk_err	   = 0;
1153		newsk->sk_priority = 0;
1154		/*
1155		 * Before updating sk_refcnt, we must commit prior changes to memory
1156		 * (Documentation/RCU/rculist_nulls.txt for details)
1157		 */
1158		smp_wmb();
1159		atomic_set(&newsk->sk_refcnt, 2);
1160
1161		/*
1162		 * Increment the counter in the same struct proto as the master
1163		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1164		 * is the same as sk->sk_prot->socks, as this field was copied
1165		 * with memcpy).
1166		 *
1167		 * This _changes_ the previous behaviour, where
1168		 * tcp_create_openreq_child was always incrementing the
1169		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1170		 * to be taken into account in all callers. -acme
1171		 */
1172		sk_refcnt_debug_inc(newsk);
1173		sk_set_socket(newsk, NULL);
1174		newsk->sk_sleep	 = NULL;
1175
1176		if (newsk->sk_prot->sockets_allocated)
1177			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1178	}
1179out:
1180	return newsk;
1181}
1182EXPORT_SYMBOL_GPL(sk_clone);
1183
1184void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1185{
1186	__sk_dst_set(sk, dst);
1187	sk->sk_route_caps = dst->dev->features;
1188	if (sk->sk_route_caps & NETIF_F_GSO)
1189		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1190	if (sk_can_gso(sk)) {
1191		if (dst->header_len) {
1192			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1193		} else {
1194			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1195			sk->sk_gso_max_size = dst->dev->gso_max_size;
1196		}
1197	}
1198}
1199EXPORT_SYMBOL_GPL(sk_setup_caps);
1200
1201void __init sk_init(void)
1202{
1203	if (num_physpages <= 4096) {
1204		sysctl_wmem_max = 32767;
1205		sysctl_rmem_max = 32767;
1206		sysctl_wmem_default = 32767;
1207		sysctl_rmem_default = 32767;
1208	} else if (num_physpages >= 131072) {
1209		sysctl_wmem_max = 131071;
1210		sysctl_rmem_max = 131071;
1211	}
1212}
1213
1214/*
1215 *	Simple resource managers for sockets.
1216 */
1217
1218
1219/*
1220 * Write buffer destructor automatically called from kfree_skb.
1221 */
1222void sock_wfree(struct sk_buff *skb)
1223{
1224	struct sock *sk = skb->sk;
1225	int res;
1226
1227	/* In case it might be waiting for more memory. */
1228	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1229	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1230		sk->sk_write_space(sk);
1231	/*
1232	 * If sk_wmem_alloc reached 0, we are the last user and should
1233	 * free this sock, as the sk_free() call could not do it.
1234	 */
1235	if (res == 0)
1236		__sk_free(sk);
1237}
1238EXPORT_SYMBOL(sock_wfree);
1239
1240/*
1241 * Read buffer destructor automatically called from kfree_skb.
1242 */
1243void sock_rfree(struct sk_buff *skb)
1244{
1245	struct sock *sk = skb->sk;
1246
1247	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1248	sk_mem_uncharge(skb->sk, skb->truesize);
1249}
1250EXPORT_SYMBOL(sock_rfree);
1251
1252
1253int sock_i_uid(struct sock *sk)
1254{
1255	int uid;
1256
1257	read_lock(&sk->sk_callback_lock);
1258	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1259	read_unlock(&sk->sk_callback_lock);
1260	return uid;
1261}
1262EXPORT_SYMBOL(sock_i_uid);
1263
1264unsigned long sock_i_ino(struct sock *sk)
1265{
1266	unsigned long ino;
1267
1268	read_lock(&sk->sk_callback_lock);
1269	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1270	read_unlock(&sk->sk_callback_lock);
1271	return ino;
1272}
1273EXPORT_SYMBOL(sock_i_ino);
1274
1275/*
1276 * Allocate a skb from the socket's send buffer.
1277 */
1278struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1279			     gfp_t priority)
1280{
1281	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282		struct sk_buff *skb = alloc_skb(size, priority);
1283		if (skb) {
1284			skb_set_owner_w(skb, sk);
1285			return skb;
1286		}
1287	}
1288	return NULL;
1289}
1290EXPORT_SYMBOL(sock_wmalloc);
1291
1292/*
1293 * Allocate a skb from the socket's receive buffer.
1294 */
1295struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1296			     gfp_t priority)
1297{
1298	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1299		struct sk_buff *skb = alloc_skb(size, priority);
1300		if (skb) {
1301			skb_set_owner_r(skb, sk);
1302			return skb;
1303		}
1304	}
1305	return NULL;
1306}
1307
1308/*
1309 * Allocate a memory block from the socket's option memory buffer.
1310 */
1311void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1312{
1313	if ((unsigned)size <= sysctl_optmem_max &&
1314	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1315		void *mem;
1316		/* First do the add, to avoid the race if kmalloc
1317		 * might sleep.
1318		 */
1319		atomic_add(size, &sk->sk_omem_alloc);
1320		mem = kmalloc(size, priority);
1321		if (mem)
1322			return mem;
1323		atomic_sub(size, &sk->sk_omem_alloc);
1324	}
1325	return NULL;
1326}
1327EXPORT_SYMBOL(sock_kmalloc);
1328
1329/*
1330 * Free an option memory block.
1331 */
1332void sock_kfree_s(struct sock *sk, void *mem, int size)
1333{
1334	kfree(mem);
1335	atomic_sub(size, &sk->sk_omem_alloc);
1336}
1337EXPORT_SYMBOL(sock_kfree_s);
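/*
 * A minimal sketch of pairing sock_kmalloc() with sock_kfree_s(): the size
 * passed to sock_kfree_s() must match the original allocation so that
 * sk_omem_alloc stays balanced.  example_copy_option is a hypothetical
 * helper.
 */
static void *example_copy_option(struct sock *sk, char __user *src, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return NULL;
	if (copy_from_user(buf, src, len)) {
		sock_kfree_s(sk, buf, len);	/* same size as the allocation */
		return NULL;
	}
	return buf;	/* freed later with sock_kfree_s(sk, buf, len) */
}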
1338
1339/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1340   I think these locks should be removed for datagram sockets.
1341 */
1342static long sock_wait_for_wmem(struct sock *sk, long timeo)
1343{
1344	DEFINE_WAIT(wait);
1345
1346	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1347	for (;;) {
1348		if (!timeo)
1349			break;
1350		if (signal_pending(current))
1351			break;
1352		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1353		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1354		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1355			break;
1356		if (sk->sk_shutdown & SEND_SHUTDOWN)
1357			break;
1358		if (sk->sk_err)
1359			break;
1360		timeo = schedule_timeout(timeo);
1361	}
1362	finish_wait(sk->sk_sleep, &wait);
1363	return timeo;
1364}
1365
1366
1367/*
1368 *	Generic send/receive buffer handlers
1369 */
1370
1371struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1372				     unsigned long data_len, int noblock,
1373				     int *errcode)
1374{
1375	struct sk_buff *skb;
1376	gfp_t gfp_mask;
1377	long timeo;
1378	int err;
1379
1380	gfp_mask = sk->sk_allocation;
1381	if (gfp_mask & __GFP_WAIT)
1382		gfp_mask |= __GFP_REPEAT;
1383
1384	timeo = sock_sndtimeo(sk, noblock);
1385	while (1) {
1386		err = sock_error(sk);
1387		if (err != 0)
1388			goto failure;
1389
1390		err = -EPIPE;
1391		if (sk->sk_shutdown & SEND_SHUTDOWN)
1392			goto failure;
1393
1394		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1395			skb = alloc_skb(header_len, gfp_mask);
1396			if (skb) {
1397				int npages;
1398				int i;
1399
1400				/* No pages, we're done... */
1401				if (!data_len)
1402					break;
1403
1404				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1405				skb->truesize += data_len;
1406				skb_shinfo(skb)->nr_frags = npages;
1407				for (i = 0; i < npages; i++) {
1408					struct page *page;
1409					skb_frag_t *frag;
1410
1411					page = alloc_pages(sk->sk_allocation, 0);
1412					if (!page) {
1413						err = -ENOBUFS;
1414						skb_shinfo(skb)->nr_frags = i;
1415						kfree_skb(skb);
1416						goto failure;
1417					}
1418
1419					frag = &skb_shinfo(skb)->frags[i];
1420					frag->page = page;
1421					frag->page_offset = 0;
1422					frag->size = (data_len >= PAGE_SIZE ?
1423						      PAGE_SIZE :
1424						      data_len);
1425					data_len -= PAGE_SIZE;
1426				}
1427
1428				/* Full success... */
1429				break;
1430			}
1431			err = -ENOBUFS;
1432			goto failure;
1433		}
1434		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1435		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1436		err = -EAGAIN;
1437		if (!timeo)
1438			goto failure;
1439		if (signal_pending(current))
1440			goto interrupted;
1441		timeo = sock_wait_for_wmem(sk, timeo);
1442	}
1443
1444	skb_set_owner_w(skb, sk);
1445	return skb;
1446
1447interrupted:
1448	err = sock_intr_errno(timeo);
1449failure:
1450	*errcode = err;
1451	return NULL;
1452}
1453EXPORT_SYMBOL(sock_alloc_send_pskb);
1454
1455struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1456				    int noblock, int *errcode)
1457{
1458	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1459}
1460EXPORT_SYMBOL(sock_alloc_send_skb);
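/*
 * A minimal sketch of a datagram sendmsg path built on
 * sock_alloc_send_skb(): block (subject to the socket's send timeout)
 * until the write buffer has room, copy the payload in, then hand the
 * skb to the protocol's transmit routine.  example_sendmsg and
 * example_xmit are hypothetical.
 */
static int example_xmit(struct sk_buff *skb)
{
	/* hypothetical: a real protocol would build headers and transmit */
	kfree_skb(skb);
	return 0;
}

static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int noblock)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len, noblock, &err);

	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, -ERESTARTSYS, ... */

	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
		kfree_skb(skb);
		return -EFAULT;
	}
	return example_xmit(skb);
}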
1461
1462static void __lock_sock(struct sock *sk)
1463{
1464	DEFINE_WAIT(wait);
1465
1466	for (;;) {
1467		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1468					TASK_UNINTERRUPTIBLE);
1469		spin_unlock_bh(&sk->sk_lock.slock);
1470		schedule();
1471		spin_lock_bh(&sk->sk_lock.slock);
1472		if (!sock_owned_by_user(sk))
1473			break;
1474	}
1475	finish_wait(&sk->sk_lock.wq, &wait);
1476}
1477
1478static void __release_sock(struct sock *sk)
1479{
1480	struct sk_buff *skb = sk->sk_backlog.head;
1481
1482	do {
1483		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1484		bh_unlock_sock(sk);
1485
1486		do {
1487			struct sk_buff *next = skb->next;
1488
1489			skb->next = NULL;
1490			sk_backlog_rcv(sk, skb);
1491
1492			/*
1493			 * We are in process context here with softirqs
1494			 * disabled, use cond_resched_softirq() to preempt.
1495			 * This is safe to do because we've taken the backlog
1496			 * queue private:
1497			 */
1498			cond_resched_softirq();
1499
1500			skb = next;
1501		} while (skb != NULL);
1502
1503		bh_lock_sock(sk);
1504	} while ((skb = sk->sk_backlog.head) != NULL);
1505}
1506
1507/**
1508 * sk_wait_data - wait for data to arrive at sk_receive_queue
1509 * @sk:    sock to wait on
1510 * @timeo: for how long
1511 *
1512 * Now the socket state, including sk->sk_err, is changed only under the lock,
1513 * hence we may omit checks after joining the wait queue.
1514 * We check the receive queue before schedule() only as an optimization;
1515 * it is very likely that release_sock() added new data.
1516 */
1517int sk_wait_data(struct sock *sk, long *timeo)
1518{
1519	int rc;
1520	DEFINE_WAIT(wait);
1521
1522	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1523	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1524	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1525	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1526	finish_wait(sk->sk_sleep, &wait);
1527	return rc;
1528}
1529EXPORT_SYMBOL(sk_wait_data);
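/*
 * A minimal sketch of a blocking receive built on sk_wait_data(): under
 * the socket lock, wait until the receive queue is non-empty, the timeout
 * expires or a signal arrives, then dequeue.  example_recv_wait is a
 * hypothetical helper; real receive paths also handle peeking, the error
 * queue, etc.
 */
static struct sk_buff *example_recv_wait(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	lock_sock(sk);
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!timeo || signal_pending(current))
			break;
		sk_wait_data(sk, &timeo);
	}
	skb = skb_dequeue(&sk->sk_receive_queue);
	release_sock(sk);
	return skb;
}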
1530
1531/**
1532 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1533 *	@sk: socket
1534 *	@size: memory size to allocate
1535 *	@kind: allocation type
1536 *
1537 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1538 *	rmem allocation. This function assumes that protocols which have
1539 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1540 */
1541int __sk_mem_schedule(struct sock *sk, int size, int kind)
1542{
1543	struct proto *prot = sk->sk_prot;
1544	int amt = sk_mem_pages(size);
1545	int allocated;
1546
1547	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1548	allocated = atomic_add_return(amt, prot->memory_allocated);
1549
1550	/* Under limit. */
1551	if (allocated <= prot->sysctl_mem[0]) {
1552		if (prot->memory_pressure && *prot->memory_pressure)
1553			*prot->memory_pressure = 0;
1554		return 1;
1555	}
1556
1557	/* Under pressure. */
1558	if (allocated > prot->sysctl_mem[1])
1559		if (prot->enter_memory_pressure)
1560			prot->enter_memory_pressure(sk);
1561
1562	/* Over hard limit. */
1563	if (allocated > prot->sysctl_mem[2])
1564		goto suppress_allocation;
1565
1566	/* guarantee minimum buffer size under pressure */
1567	if (kind == SK_MEM_RECV) {
1568		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1569			return 1;
1570	} else { /* SK_MEM_SEND */
1571		if (sk->sk_type == SOCK_STREAM) {
1572			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1573				return 1;
1574		} else if (atomic_read(&sk->sk_wmem_alloc) <
1575			   prot->sysctl_wmem[0])
1576				return 1;
1577	}
1578
1579	if (prot->memory_pressure) {
1580		int alloc;
1581
1582		if (!*prot->memory_pressure)
1583			return 1;
1584		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1585		if (prot->sysctl_mem[2] > alloc *
1586		    sk_mem_pages(sk->sk_wmem_queued +
1587				 atomic_read(&sk->sk_rmem_alloc) +
1588				 sk->sk_forward_alloc))
1589			return 1;
1590	}
1591
1592suppress_allocation:
1593
1594	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1595		sk_stream_moderate_sndbuf(sk);
1596
1597		/* Fail only if socket is _under_ its sndbuf.
1598		 * In this case we cannot block, so that we have to fail.
1599		 */
1600		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1601			return 1;
1602	}
1603
1604	/* Alas. Undo changes. */
1605	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1606	atomic_sub(amt, prot->memory_allocated);
1607	return 0;
1608}
1609EXPORT_SYMBOL(__sk_mem_schedule);
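/*
 * A worked example of the accounting above, assuming SK_MEM_QUANTUM is
 * 4096 bytes (one page on most platforms): charging a 3000-byte skb gives
 * amt = sk_mem_pages(3000) = 1, so sk_forward_alloc grows by 4096 and the
 * protocol's memory_allocated counter by one quantum; subsequent small
 * charges are then served from the 1096 bytes left in sk_forward_alloc
 * without touching the global counter again.
 */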
1610
1611/**
1612 *	__sk_mem_reclaim - reclaim memory_allocated
1613 *	@sk: socket
1614 */
1615void __sk_mem_reclaim(struct sock *sk)
1616{
1617	struct proto *prot = sk->sk_prot;
1618
1619	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1620		   prot->memory_allocated);
1621	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1622
1623	if (prot->memory_pressure && *prot->memory_pressure &&
1624	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1625		*prot->memory_pressure = 0;
1626}
1627EXPORT_SYMBOL(__sk_mem_reclaim);
1628
1629
1630/*
1631 * Set of default routines for initialising struct proto_ops when
1632 * the protocol does not support a particular function. In certain
1633 * cases where it makes no sense for a protocol to have a "do nothing"
1634 * function, some default processing is provided.
1635 */
1636
1637int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1638{
1639	return -EOPNOTSUPP;
1640}
1641EXPORT_SYMBOL(sock_no_bind);
1642
1643int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1644		    int len, int flags)
1645{
1646	return -EOPNOTSUPP;
1647}
1648EXPORT_SYMBOL(sock_no_connect);
1649
1650int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1651{
1652	return -EOPNOTSUPP;
1653}
1654EXPORT_SYMBOL(sock_no_socketpair);
1655
1656int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1657{
1658	return -EOPNOTSUPP;
1659}
1660EXPORT_SYMBOL(sock_no_accept);
1661
1662int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1663		    int *len, int peer)
1664{
1665	return -EOPNOTSUPP;
1666}
1667EXPORT_SYMBOL(sock_no_getname);
1668
1669unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1670{
1671	return 0;
1672}
1673EXPORT_SYMBOL(sock_no_poll);
1674
1675int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1676{
1677	return -EOPNOTSUPP;
1678}
1679EXPORT_SYMBOL(sock_no_ioctl);
1680
1681int sock_no_listen(struct socket *sock, int backlog)
1682{
1683	return -EOPNOTSUPP;
1684}
1685EXPORT_SYMBOL(sock_no_listen);
1686
1687int sock_no_shutdown(struct socket *sock, int how)
1688{
1689	return -EOPNOTSUPP;
1690}
1691EXPORT_SYMBOL(sock_no_shutdown);
1692
1693int sock_no_setsockopt(struct socket *sock, int level, int optname,
1694		    char __user *optval, int optlen)
1695{
1696	return -EOPNOTSUPP;
1697}
1698EXPORT_SYMBOL(sock_no_setsockopt);
1699
1700int sock_no_getsockopt(struct socket *sock, int level, int optname,
1701		    char __user *optval, int __user *optlen)
1702{
1703	return -EOPNOTSUPP;
1704}
1705EXPORT_SYMBOL(sock_no_getsockopt);
1706
1707int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1708		    size_t len)
1709{
1710	return -EOPNOTSUPP;
1711}
1712EXPORT_SYMBOL(sock_no_sendmsg);
1713
1714int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1715		    size_t len, int flags)
1716{
1717	return -EOPNOTSUPP;
1718}
1719EXPORT_SYMBOL(sock_no_recvmsg);
1720
1721int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1722{
1723	/* Mirror missing mmap method error code */
1724	return -ENODEV;
1725}
1726EXPORT_SYMBOL(sock_no_mmap);
1727
1728ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1729{
1730	ssize_t res;
1731	struct msghdr msg = {.msg_flags = flags};
1732	struct kvec iov;
1733	char *kaddr = kmap(page);
1734	iov.iov_base = kaddr + offset;
1735	iov.iov_len = size;
1736	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1737	kunmap(page);
1738	return res;
1739}
1740EXPORT_SYMBOL(sock_no_sendpage);
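/*
 * A minimal sketch of how a protocol family might wire the sock_no_*
 * stubs above into its struct proto_ops for the operations it does not
 * support; example_ops and example_release are hypothetical, and a real
 * family supplies working sendmsg/recvmsg/etc. handlers where needed.
 */
static int example_release(struct socket *sock)
{
	return 0;	/* hypothetical: a real release tears down sock->sk */
}

static const struct proto_ops example_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};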
1741
1742/*
1743 *	Default Socket Callbacks
1744 */
1745
1746static void sock_def_wakeup(struct sock *sk)
1747{
1748	read_lock(&sk->sk_callback_lock);
1749	if (sk_has_sleeper(sk))
1750		wake_up_interruptible_all(sk->sk_sleep);
1751	read_unlock(&sk->sk_callback_lock);
1752}
1753
1754static void sock_def_error_report(struct sock *sk)
1755{
1756	read_lock(&sk->sk_callback_lock);
1757	if (sk_has_sleeper(sk))
1758		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1759	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1760	read_unlock(&sk->sk_callback_lock);
1761}
1762
1763static void sock_def_readable(struct sock *sk, int len)
1764{
1765	read_lock(&sk->sk_callback_lock);
1766	if (sk_has_sleeper(sk))
1767		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1768						POLLRDNORM | POLLRDBAND);
1769	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1770	read_unlock(&sk->sk_callback_lock);
1771}
1772
1773static void sock_def_write_space(struct sock *sk)
1774{
1775	read_lock(&sk->sk_callback_lock);
1776
1777	/* Do not wake up a writer until he can make "significant"
1778	 * progress.  --DaveM
1779	 */
1780	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1781		if (sk_has_sleeper(sk))
1782			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1783						POLLWRNORM | POLLWRBAND);
1784
1785		/* Should agree with poll, otherwise some programs break */
1786		if (sock_writeable(sk))
1787			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1788	}
1789
1790	read_unlock(&sk->sk_callback_lock);
1791}
1792
1793static void sock_def_destruct(struct sock *sk)
1794{
1795	kfree(sk->sk_protinfo);
1796}
1797
1798void sk_send_sigurg(struct sock *sk)
1799{
1800	if (sk->sk_socket && sk->sk_socket->file)
1801		if (send_sigurg(&sk->sk_socket->file->f_owner))
1802			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1803}
1804EXPORT_SYMBOL(sk_send_sigurg);
1805
1806void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1807		    unsigned long expires)
1808{
1809	if (!mod_timer(timer, expires))
1810		sock_hold(sk);
1811}
1812EXPORT_SYMBOL(sk_reset_timer);
1813
1814void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1815{
1816	if (timer_pending(timer) && del_timer(timer))
1817		__sock_put(sk);
1818}
1819EXPORT_SYMBOL(sk_stop_timer);
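/*
 * A minimal sketch of the reference-count convention behind
 * sk_reset_timer()/sk_stop_timer(): arming a previously idle timer takes
 * a socket reference, the handler drops it with sock_put() when it fires,
 * and sk_stop_timer() drops it if the timer is cancelled first.
 * example_timer_fire is hypothetical; compare the TCP timers for a real
 * user.
 */
static void example_timer_fire(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... protocol timeout work goes here ... */

	sock_put(sk);	/* release the reference taken by sk_reset_timer() */
}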
1820
1821void sock_init_data(struct socket *sock, struct sock *sk)
1822{
1823	skb_queue_head_init(&sk->sk_receive_queue);
1824	skb_queue_head_init(&sk->sk_write_queue);
1825	skb_queue_head_init(&sk->sk_error_queue);
1826#ifdef CONFIG_NET_DMA
1827	skb_queue_head_init(&sk->sk_async_wait_queue);
1828#endif
1829
1830	sk->sk_send_head	=	NULL;
1831
1832	init_timer(&sk->sk_timer);
1833
1834	sk->sk_allocation	=	GFP_KERNEL;
1835	sk->sk_rcvbuf		=	sysctl_rmem_default;
1836	sk->sk_sndbuf		=	sysctl_wmem_default;
1837	sk->sk_state		=	TCP_CLOSE;
1838	sk_set_socket(sk, sock);
1839
1840	sock_set_flag(sk, SOCK_ZAPPED);
1841
1842	if (sock) {
1843		sk->sk_type	=	sock->type;
1844		sk->sk_sleep	=	&sock->wait;
1845		sock->sk	=	sk;
1846	} else
1847		sk->sk_sleep	=	NULL;
1848
1849	rwlock_init(&sk->sk_dst_lock);
1850	rwlock_init(&sk->sk_callback_lock);
1851	lockdep_set_class_and_name(&sk->sk_callback_lock,
1852			af_callback_keys + sk->sk_family,
1853			af_family_clock_key_strings[sk->sk_family]);
1854
1855	sk->sk_state_change	=	sock_def_wakeup;
1856	sk->sk_data_ready	=	sock_def_readable;
1857	sk->sk_write_space	=	sock_def_write_space;
1858	sk->sk_error_report	=	sock_def_error_report;
1859	sk->sk_destruct		=	sock_def_destruct;
1860
1861	sk->sk_sndmsg_page	=	NULL;
1862	sk->sk_sndmsg_off	=	0;
1863
1864	sk->sk_peercred.pid 	=	0;
1865	sk->sk_peercred.uid	=	-1;
1866	sk->sk_peercred.gid	=	-1;
1867	sk->sk_write_pending	=	0;
1868	sk->sk_rcvlowat		=	1;
1869	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1870	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1871
1872	sk->sk_stamp = ktime_set(-1L, 0);
1873
1874	/*
1875	 * Before updating sk_refcnt, we must commit prior changes to memory
1876	 * (Documentation/RCU/rculist_nulls.txt for details)
1877	 */
1878	smp_wmb();
1879	atomic_set(&sk->sk_refcnt, 1);
1880	atomic_set(&sk->sk_wmem_alloc, 1);
1881	atomic_set(&sk->sk_drops, 0);
1882}
1883EXPORT_SYMBOL(sock_init_data);
1884
1885void lock_sock_nested(struct sock *sk, int subclass)
1886{
1887	might_sleep();
1888	spin_lock_bh(&sk->sk_lock.slock);
1889	if (sk->sk_lock.owned)
1890		__lock_sock(sk);
1891	sk->sk_lock.owned = 1;
1892	spin_unlock(&sk->sk_lock.slock);
1893	/*
1894	 * The sk_lock has mutex_lock() semantics here:
1895	 */
1896	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1897	local_bh_enable();
1898}
1899EXPORT_SYMBOL(lock_sock_nested);
1900
1901void release_sock(struct sock *sk)
1902{
1903	/*
1904	 * The sk_lock has mutex_unlock() semantics:
1905	 */
1906	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1907
1908	spin_lock_bh(&sk->sk_lock.slock);
1909	if (sk->sk_backlog.tail)
1910		__release_sock(sk);
1911	sk->sk_lock.owned = 0;
1912	if (waitqueue_active(&sk->sk_lock.wq))
1913		wake_up(&sk->sk_lock.wq);
1914	spin_unlock_bh(&sk->sk_lock.slock);
1915}
1916EXPORT_SYMBOL(release_sock);
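/*
 * A minimal sketch of the lock_sock()/release_sock() pattern used by
 * protocol code in process context: while the lock is owned, softirq
 * receive processing parks packets on sk->sk_backlog, and release_sock()
 * runs them through sk_backlog_rcv() before waking other lockers.
 * example_update_sock is a hypothetical helper.
 */
static void example_update_sock(struct sock *sk, int lowat)
{
	lock_sock(sk);
	/* socket state may be modified safely here */
	sk->sk_rcvlowat = lowat ? : 1;
	release_sock(sk);	/* processes any backlogged packets */
}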
1917
1918int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1919{
1920	struct timeval tv;
1921	if (!sock_flag(sk, SOCK_TIMESTAMP))
1922		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1923	tv = ktime_to_timeval(sk->sk_stamp);
1924	if (tv.tv_sec == -1)
1925		return -ENOENT;
1926	if (tv.tv_sec == 0) {
1927		sk->sk_stamp = ktime_get_real();
1928		tv = ktime_to_timeval(sk->sk_stamp);
1929	}
1930	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1931}
1932EXPORT_SYMBOL(sock_get_timestamp);
1933
1934int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1935{
1936	struct timespec ts;
1937	if (!sock_flag(sk, SOCK_TIMESTAMP))
1938		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1939	ts = ktime_to_timespec(sk->sk_stamp);
1940	if (ts.tv_sec == -1)
1941		return -ENOENT;
1942	if (ts.tv_sec == 0) {
1943		sk->sk_stamp = ktime_get_real();
1944		ts = ktime_to_timespec(sk->sk_stamp);
1945	}
1946	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1947}
1948EXPORT_SYMBOL(sock_get_timestampns);
1949
1950void sock_enable_timestamp(struct sock *sk, int flag)
1951{
1952	if (!sock_flag(sk, flag)) {
1953		sock_set_flag(sk, flag);
1954		/*
1955		 * we just set one of the two flags which require net
1956		 * time stamping, but time stamping might have been on
1957		 * already because of the other one
1958		 */
1959		if (!sock_flag(sk,
1960				flag == SOCK_TIMESTAMP ?
1961				SOCK_TIMESTAMPING_RX_SOFTWARE :
1962				SOCK_TIMESTAMP))
1963			net_enable_timestamp();
1964	}
1965}
1966
1967/*
1968 *	Get a socket option on a socket.
1969 *
1970 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1971 *	asynchronous errors should be reported by getsockopt. We assume
1972 *	this means if you specify SO_ERROR (otherwise, what's the point of it?).
1973 */
1974int sock_common_getsockopt(struct socket *sock, int level, int optname,
1975			   char __user *optval, int __user *optlen)
1976{
1977	struct sock *sk = sock->sk;
1978
1979	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1980}
1981EXPORT_SYMBOL(sock_common_getsockopt);
1982
1983#ifdef CONFIG_COMPAT
1984int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1985				  char __user *optval, int __user *optlen)
1986{
1987	struct sock *sk = sock->sk;
1988
1989	if (sk->sk_prot->compat_getsockopt != NULL)
1990		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1991						      optval, optlen);
1992	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1993}
1994EXPORT_SYMBOL(compat_sock_common_getsockopt);
1995#endif
1996
1997int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1998			struct msghdr *msg, size_t size, int flags)
1999{
2000	struct sock *sk = sock->sk;
2001	int addr_len = 0;
2002	int err;
2003
2004	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2005				   flags & ~MSG_DONTWAIT, &addr_len);
2006	if (err >= 0)
2007		msg->msg_namelen = addr_len;
2008	return err;
2009}
2010EXPORT_SYMBOL(sock_common_recvmsg);
2011
2012/*
2013 *	Set socket options on an inet socket.
2014 */
2015int sock_common_setsockopt(struct socket *sock, int level, int optname,
2016			   char __user *optval, int optlen)
2017{
2018	struct sock *sk = sock->sk;
2019
2020	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2021}
2022EXPORT_SYMBOL(sock_common_setsockopt);
2023
2024#ifdef CONFIG_COMPAT
2025int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2026				  char __user *optval, int optlen)
2027{
2028	struct sock *sk = sock->sk;
2029
2030	if (sk->sk_prot->compat_setsockopt != NULL)
2031		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2032						      optval, optlen);
2033	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2034}
2035EXPORT_SYMBOL(compat_sock_common_setsockopt);
2036#endif
2037
2038void sk_common_release(struct sock *sk)
2039{
2040	if (sk->sk_prot->destroy)
2041		sk->sk_prot->destroy(sk);
2042
2043	/*
2044	 * Observation: when sk_common_release() is called, processes no longer
2045	 * have access to the socket, but the network stack still does.
2046	 * Step one, detach it from networking:
2047	 *
2048	 * A. Remove from hash tables.
2049	 */
2050
2051	sk->sk_prot->unhash(sk);
2052
2053	/*
2054	 * At this point the socket cannot receive new packets, but some may
2055	 * still be in flight: a CPU running the receive path may have done its
2056	 * hash table lookup before we unhashed the socket. Those packets will
2057	 * reach the receive queue and be purged by the socket destructor.
2058	 *
2059	 * We also still have packets pending on the receive queue and, probably,
2060	 * our own packets waiting in device queues. sock_destroy will drain the
2061	 * receive queue, but transmitted packets will delay socket destruction
2062	 * until the last reference is released.
2063	 */
2064
2065	sock_orphan(sk);
2066
2067	xfrm_sk_free_policy(sk);
2068
2069	sk_refcnt_debug_release(sk);
2070	sock_put(sk);
2071}
2072EXPORT_SYMBOL(sk_common_release);
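/*
 * Usage illustration (hypothetical protocol, hedged sketch): datagram style
 * protocols typically end their ->close() handler in sk_common_release() and
 * let the generic teardown above (destroy, unhash, orphan, final sock_put)
 * do the rest:
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 *
 *	static struct proto example_prot = {
 *		.name	= "EXAMPLE",
 *		.close	= example_close,
 *		... remaining fields elided ...
 *	};
 */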
2073
2074static DEFINE_RWLOCK(proto_list_lock);
2075static LIST_HEAD(proto_list);
2076
2077#ifdef CONFIG_PROC_FS
2078#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2079struct prot_inuse {
2080	int val[PROTO_INUSE_NR];
2081};
2082
2083static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2084
2085#ifdef CONFIG_NET_NS
2086void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2087{
2088	int cpu = smp_processor_id();
2089	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2090}
2091EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2092
2093int sock_prot_inuse_get(struct net *net, struct proto *prot)
2094{
2095	int cpu, idx = prot->inuse_idx;
2096	int res = 0;
2097
2098	for_each_possible_cpu(cpu)
2099		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2100
2101	return res >= 0 ? res : 0;
2102}
2103EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2104
2105static int sock_inuse_init_net(struct net *net)
2106{
2107	net->core.inuse = alloc_percpu(struct prot_inuse);
2108	return net->core.inuse ? 0 : -ENOMEM;
2109}
2110
2111static void sock_inuse_exit_net(struct net *net)
2112{
2113	free_percpu(net->core.inuse);
2114}
2115
2116static struct pernet_operations net_inuse_ops = {
2117	.init = sock_inuse_init_net,
2118	.exit = sock_inuse_exit_net,
2119};
2120
2121static __init int net_inuse_init(void)
2122{
2123	if (register_pernet_subsys(&net_inuse_ops))
2124		panic("Cannot initialize net inuse counters");
2125
2126	return 0;
2127}
2128
2129core_initcall(net_inuse_init);
2130#else
2131static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2132
2133void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2134{
2135	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2136}
2137EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2138
2139int sock_prot_inuse_get(struct net *net, struct proto *prot)
2140{
2141	int cpu, idx = prot->inuse_idx;
2142	int res = 0;
2143
2144	for_each_possible_cpu(cpu)
2145		res += per_cpu(prot_inuse, cpu).val[idx];
2146
2147	return res >= 0 ? res : 0;
2148}
2149EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2150#endif
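/*
 * Usage illustration (a hedged, hypothetical sketch): protocols account
 * their sockets from ->hash()/->unhash(), normally with BHs disabled, so
 * the plain per-cpu increment above needs no atomics.  A socket may be
 * added on one CPU and removed on another, which is why sock_prot_inuse_get()
 * sums all per-cpu values and clamps the result at zero:
 *
 *	static void example_hash(struct sock *sk)
 *	{
 *		... link sk into the protocol's hash table ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	}
 *
 *	static void example_unhash(struct sock *sk)
 *	{
 *		... unlink sk ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *	}
 */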
2151
2152static void assign_proto_idx(struct proto *prot)
2153{
2154	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2155
2156	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2157		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2158		return;
2159	}
2160
2161	set_bit(prot->inuse_idx, proto_inuse_idx);
2162}
2163
2164static void release_proto_idx(struct proto *prot)
2165{
2166	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2167		clear_bit(prot->inuse_idx, proto_inuse_idx);
2168}
2169#else
2170static inline void assign_proto_idx(struct proto *prot)
2171{
2172}
2173
2174static inline void release_proto_idx(struct proto *prot)
2175{
2176}
2177#endif
2178
2179int proto_register(struct proto *prot, int alloc_slab)
2180{
2181	if (alloc_slab) {
2182		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2183					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2184					NULL);
2185
2186		if (prot->slab == NULL) {
2187			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2188			       prot->name);
2189			goto out;
2190		}
2191
2192		if (prot->rsk_prot != NULL) {
2193			static const char mask[] = "request_sock_%s";
2194
2195			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2196			if (prot->rsk_prot->slab_name == NULL)
2197				goto out_free_sock_slab;
2198
2199			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2200			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2201								 prot->rsk_prot->obj_size, 0,
2202								 SLAB_HWCACHE_ALIGN, NULL);
2203
2204			if (prot->rsk_prot->slab == NULL) {
2205				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2206				       prot->name);
2207				goto out_free_request_sock_slab_name;
2208			}
2209		}
2210
2211		if (prot->twsk_prot != NULL) {
2212			static const char mask[] = "tw_sock_%s";
2213
2214			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2215
2216			if (prot->twsk_prot->twsk_slab_name == NULL)
2217				goto out_free_request_sock_slab;
2218
2219			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2220			prot->twsk_prot->twsk_slab =
2221				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2222						  prot->twsk_prot->twsk_obj_size,
2223						  0,
2224						  SLAB_HWCACHE_ALIGN |
2225							prot->slab_flags,
2226						  NULL);
2227			if (prot->twsk_prot->twsk_slab == NULL)
2228				goto out_free_timewait_sock_slab_name;
2229		}
2230	}
2231
2232	write_lock(&proto_list_lock);
2233	list_add(&prot->node, &proto_list);
2234	assign_proto_idx(prot);
2235	write_unlock(&proto_list_lock);
2236	return 0;
2237
2238out_free_timewait_sock_slab_name:
2239	kfree(prot->twsk_prot->twsk_slab_name);
2240out_free_request_sock_slab:
2241	if (prot->rsk_prot && prot->rsk_prot->slab) {
2242		kmem_cache_destroy(prot->rsk_prot->slab);
2243		prot->rsk_prot->slab = NULL;
2244	}
2245out_free_request_sock_slab_name:
2246	kfree(prot->rsk_prot->slab_name);
2247out_free_sock_slab:
2248	kmem_cache_destroy(prot->slab);
2249	prot->slab = NULL;
2250out:
2251	return -ENOBUFS;
2252}
2253EXPORT_SYMBOL(proto_register);
2254
2255void proto_unregister(struct proto *prot)
2256{
2257	write_lock(&proto_list_lock);
2258	release_proto_idx(prot);
2259	list_del(&prot->node);
2260	write_unlock(&proto_list_lock);
2261
2262	if (prot->slab != NULL) {
2263		kmem_cache_destroy(prot->slab);
2264		prot->slab = NULL;
2265	}
2266
2267	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2268		kmem_cache_destroy(prot->rsk_prot->slab);
2269		kfree(prot->rsk_prot->slab_name);
2270		prot->rsk_prot->slab = NULL;
2271	}
2272
2273	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2274		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2275		kfree(prot->twsk_prot->twsk_slab_name);
2276		prot->twsk_prot->twsk_slab = NULL;
2277	}
2278}
2279EXPORT_SYMBOL(proto_unregister);
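/*
 * Usage illustration (hypothetical module, hedged sketch): a protocol fills
 * in a struct proto and registers it once; alloc_slab = 1 asks
 * proto_register() to create the per-protocol sock cache (plus the
 * request_sock_* and tw_sock_* caches when rsk_prot/twsk_prot are set):
 *
 *	static struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_prot, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_prot);
 *	}
 */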
2280
2281#ifdef CONFIG_PROC_FS
2282static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2283	__acquires(proto_list_lock)
2284{
2285	read_lock(&proto_list_lock);
2286	return seq_list_start_head(&proto_list, *pos);
2287}
2288
2289static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2290{
2291	return seq_list_next(v, &proto_list, pos);
2292}
2293
2294static void proto_seq_stop(struct seq_file *seq, void *v)
2295	__releases(proto_list_lock)
2296{
2297	read_unlock(&proto_list_lock);
2298}
2299
2300static char proto_method_implemented(const void *method)
2301{
2302	return method == NULL ? 'n' : 'y';
2303}
2304
2305static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2306{
2307	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2308			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2309		   proto->name,
2310		   proto->obj_size,
2311		   sock_prot_inuse_get(seq_file_net(seq), proto),
2312		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2313		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2314		   proto->max_header,
2315		   proto->slab == NULL ? "no" : "yes",
2316		   module_name(proto->owner),
2317		   proto_method_implemented(proto->close),
2318		   proto_method_implemented(proto->connect),
2319		   proto_method_implemented(proto->disconnect),
2320		   proto_method_implemented(proto->accept),
2321		   proto_method_implemented(proto->ioctl),
2322		   proto_method_implemented(proto->init),
2323		   proto_method_implemented(proto->destroy),
2324		   proto_method_implemented(proto->shutdown),
2325		   proto_method_implemented(proto->setsockopt),
2326		   proto_method_implemented(proto->getsockopt),
2327		   proto_method_implemented(proto->sendmsg),
2328		   proto_method_implemented(proto->recvmsg),
2329		   proto_method_implemented(proto->sendpage),
2330		   proto_method_implemented(proto->bind),
2331		   proto_method_implemented(proto->backlog_rcv),
2332		   proto_method_implemented(proto->hash),
2333		   proto_method_implemented(proto->unhash),
2334		   proto_method_implemented(proto->get_port),
2335		   proto_method_implemented(proto->enter_memory_pressure));
2336}
2337
2338static int proto_seq_show(struct seq_file *seq, void *v)
2339{
2340	if (v == &proto_list)
2341		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2342			   "protocol",
2343			   "size",
2344			   "sockets",
2345			   "memory",
2346			   "press",
2347			   "maxhdr",
2348			   "slab",
2349			   "module",
2350			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2351	else
2352		proto_seq_printf(seq, list_entry(v, struct proto, node));
2353	return 0;
2354}
2355
2356static const struct seq_operations proto_seq_ops = {
2357	.start  = proto_seq_start,
2358	.next   = proto_seq_next,
2359	.stop   = proto_seq_stop,
2360	.show   = proto_seq_show,
2361};
2362
2363static int proto_seq_open(struct inode *inode, struct file *file)
2364{
2365	return seq_open_net(inode, file, &proto_seq_ops,
2366			    sizeof(struct seq_net_private));
2367}
2368
2369static const struct file_operations proto_seq_fops = {
2370	.owner		= THIS_MODULE,
2371	.open		= proto_seq_open,
2372	.read		= seq_read,
2373	.llseek		= seq_lseek,
2374	.release	= seq_release_net,
2375};
2376
2377static __net_init int proto_init_net(struct net *net)
2378{
2379	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2380		return -ENOMEM;
2381
2382	return 0;
2383}
2384
2385static __net_exit void proto_exit_net(struct net *net)
2386{
2387	proc_net_remove(net, "protocols");
2388}
2389
2390
2391static __net_initdata struct pernet_operations proto_net_ops = {
2392	.init = proto_init_net,
2393	.exit = proto_exit_net,
2394};
2395
2396static int __init proto_init(void)
2397{
2398	return register_pernet_subsys(&proto_net_ops);
2399}
2400
2401subsys_initcall(proto_init);
2402
2403#endif /* PROC_FS */
2404