sock.c revision 7fee226ad2397b635e2fd565a59ca3ae08a164cd
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *		Steve Whitehouse:	Added default destructor to free
73 *					protocol private data.
74 *		Steve Whitehouse:	Added various other default routines
75 *					common to several socket families.
76 *		Chris Evans	:	Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
120#include <net/net_namespace.h>
121#include <net/request_sock.h>
122#include <net/sock.h>
123#include <linux/net_tstamp.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140/*
141 * Make lock validator output more readable. (we pre-construct these
142 * strings at build time, so that runtime initialization of socket
143 * locks is fast):
144 */
145static const char *const af_family_key_strings[AF_MAX+1] = {
146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158  "sk_lock-AF_IEEE802154",
159  "sk_lock-AF_MAX"
160};
161static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174  "slock-AF_IEEE802154",
175  "slock-AF_MAX"
176};
177static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190  "clock-AF_IEEE802154",
191  "clock-AF_MAX"
192};
193
194/*
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
197 */
198static struct lock_class_key af_callback_keys[AF_MAX];
199
200/* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms.  This makes socket queueing behavior and performance
203 * not depend upon such differences.
204 */
205#define _SK_MEM_PACKETS		256
206#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
207#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
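
/*
 * Worked example (illustrative only; sizeof(struct sk_buff) varies by
 * architecture and config): with a 256-byte sk_buff, _SK_MEM_OVERHEAD is
 * 256 + 256 = 512 bytes, so SK_WMEM_MAX = SK_RMEM_MAX = 512 * 256 = 131072
 * bytes (128 KiB).
 */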
209
210/* Run time adjustable parameters. */
211__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216/* Maximal space eaten by iovec or ancillary data plus some space */
217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218EXPORT_SYMBOL(sysctl_optmem_max);
219
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{
222	struct timeval tv;
223
224	if (optlen < sizeof(tv))
225		return -EINVAL;
226	if (copy_from_user(&tv, optval, sizeof(tv)))
227		return -EFAULT;
228	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229		return -EDOM;
230
231	if (tv.tv_sec < 0) {
232		static int warned __read_mostly;
233
234		*timeo_p = 0;
235		if (warned < 10 && net_ratelimit()) {
236			warned++;
237			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238			       "tries to set negative timeout\n",
239				current->comm, task_pid_nr(current));
240		}
241		return 0;
242	}
243	*timeo_p = MAX_SCHEDULE_TIMEOUT;
244	if (tv.tv_sec == 0 && tv.tv_usec == 0)
245		return 0;
246	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248	return 0;
249}
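
/*
 * Conversion sketch for the formula above (assumes HZ == 1000; illustrative
 * only): a timeout of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies, and sub-tick values such
 * as tv_usec = 1 round up to a full jiffy.
 */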
250
251static void sock_warn_obsolete_bsdism(const char *name)
252{
253	static int warned;
254	static char warncomm[TASK_COMM_LEN];
255	if (strcmp(warncomm, current->comm) && warned < 5) {
256		strcpy(warncomm,  current->comm);
257		printk(KERN_WARNING "process `%s' is using obsolete "
258		       "%s SO_BSDCOMPAT\n", warncomm, name);
259		warned++;
260	}
261}
262
263static void sock_disable_timestamp(struct sock *sk, int flag)
264{
265	if (sock_flag(sk, flag)) {
266		sock_reset_flag(sk, flag);
267		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269			net_disable_timestamp();
270		}
271	}
272}
273
274
275int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276{
277	int err;
278	int skb_len;
279	unsigned long flags;
280	struct sk_buff_head *list = &sk->sk_receive_queue;
281
282	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
283	   number of warnings when compiling with -W --ANK
284	 */
285	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
286	    (unsigned)sk->sk_rcvbuf) {
287		atomic_inc(&sk->sk_drops);
288		return -ENOMEM;
289	}
290
291	err = sk_filter(sk, skb);
292	if (err)
293		return err;
294
295	if (!sk_rmem_schedule(sk, skb->truesize)) {
296		atomic_inc(&sk->sk_drops);
297		return -ENOBUFS;
298	}
299
300	skb->dev = NULL;
301	skb_set_owner_r(skb, sk);
302
303	/* Cache the SKB length before we tack it onto the receive
304	 * queue.  Once it is added it no longer belongs to us and
305	 * may be freed by other threads of control pulling packets
306	 * from the queue.
307	 */
308	skb_len = skb->len;
309
310	/* We escape from the RCU-protected region, so make sure we don't leak
311	 * a non-refcounted dst
312	 */
313	skb_dst_force(skb);
314
315	spin_lock_irqsave(&list->lock, flags);
316	skb->dropcount = atomic_read(&sk->sk_drops);
317	__skb_queue_tail(list, skb);
318	spin_unlock_irqrestore(&list->lock, flags);
319
320	if (!sock_flag(sk, SOCK_DEAD))
321		sk->sk_data_ready(sk, skb_len);
322	return 0;
323}
324EXPORT_SYMBOL(sock_queue_rcv_skb);
325
326int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
327{
328	int rc = NET_RX_SUCCESS;
329
330	if (sk_filter(sk, skb))
331		goto discard_and_relse;
332
333	skb->dev = NULL;
334
335	if (sk_rcvqueues_full(sk, skb)) {
336		atomic_inc(&sk->sk_drops);
337		goto discard_and_relse;
338	}
339	if (nested)
340		bh_lock_sock_nested(sk);
341	else
342		bh_lock_sock(sk);
343	if (!sock_owned_by_user(sk)) {
344		/*
345		 * trylock + unlock semantics:
346		 */
347		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
348
349		rc = sk_backlog_rcv(sk, skb);
350
351		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
352	} else if (sk_add_backlog(sk, skb)) {
353		bh_unlock_sock(sk);
354		atomic_inc(&sk->sk_drops);
355		goto discard_and_relse;
356	}
357
358	bh_unlock_sock(sk);
359out:
360	sock_put(sk);
361	return rc;
362discard_and_relse:
363	kfree_skb(skb);
364	goto out;
365}
366EXPORT_SYMBOL(sk_receive_skb);
367
368void sk_reset_txq(struct sock *sk)
369{
370	sk_tx_queue_clear(sk);
371}
372EXPORT_SYMBOL(sk_reset_txq);
373
374struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
375{
376	struct dst_entry *dst = __sk_dst_get(sk);
377
378	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
379		sk_tx_queue_clear(sk);
380		rcu_assign_pointer(sk->sk_dst_cache, NULL);
381		dst_release(dst);
382		return NULL;
383	}
384
385	return dst;
386}
387EXPORT_SYMBOL(__sk_dst_check);
388
389struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
390{
391	struct dst_entry *dst = sk_dst_get(sk);
392
393	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
394		sk_dst_reset(sk);
395		dst_release(dst);
396		return NULL;
397	}
398
399	return dst;
400}
401EXPORT_SYMBOL(sk_dst_check);
402
403static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
404{
405	int ret = -ENOPROTOOPT;
406#ifdef CONFIG_NETDEVICES
407	struct net *net = sock_net(sk);
408	char devname[IFNAMSIZ];
409	int index;
410
411	/* Sorry... */
412	ret = -EPERM;
413	if (!capable(CAP_NET_RAW))
414		goto out;
415
416	ret = -EINVAL;
417	if (optlen < 0)
418		goto out;
419
420	/* Bind this socket to a particular device like "eth0",
421	 * as specified in the passed interface name. If the
422	 * name is "" or the option length is zero the socket
423	 * is not bound.
424	 */
425	if (optlen > IFNAMSIZ - 1)
426		optlen = IFNAMSIZ - 1;
427	memset(devname, 0, sizeof(devname));
428
429	ret = -EFAULT;
430	if (copy_from_user(devname, optval, optlen))
431		goto out;
432
433	index = 0;
434	if (devname[0] != '\0') {
435		struct net_device *dev;
436
437		rcu_read_lock();
438		dev = dev_get_by_name_rcu(net, devname);
439		if (dev)
440			index = dev->ifindex;
441		rcu_read_unlock();
442		ret = -ENODEV;
443		if (!dev)
444			goto out;
445	}
446
447	lock_sock(sk);
448	sk->sk_bound_dev_if = index;
449	sk_dst_reset(sk);
450	release_sock(sk);
451
452	ret = 0;
453
454out:
455#endif
456
457	return ret;
458}
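
/*
 * Illustrative user-space sketch (not part of this file): exercising the
 * option handled above.  The interface name "eth0" is only an example, and
 * the caller needs CAP_NET_RAW.
 */
#if 0	/* example only, never compiled */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	const char ifname[IFNAMSIZ] = "eth0";

	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname)))
		perror("SO_BINDTODEVICE");
	/* Passing "" (or an optlen of zero) unbinds the socket again. */
#endif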
459
460static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
461{
462	if (valbool)
463		sock_set_flag(sk, bit);
464	else
465		sock_reset_flag(sk, bit);
466}
467
468/*
469 *	This is meant for all protocols to use and covers goings on
470 *	at the socket level. Everything here is generic.
471 */
472
473int sock_setsockopt(struct socket *sock, int level, int optname,
474		    char __user *optval, unsigned int optlen)
475{
476	struct sock *sk = sock->sk;
477	int val;
478	int valbool;
479	struct linger ling;
480	int ret = 0;
481
482	/*
483	 *	Options without arguments
484	 */
485
486	if (optname == SO_BINDTODEVICE)
487		return sock_bindtodevice(sk, optval, optlen);
488
489	if (optlen < sizeof(int))
490		return -EINVAL;
491
492	if (get_user(val, (int __user *)optval))
493		return -EFAULT;
494
495	valbool = val ? 1 : 0;
496
497	lock_sock(sk);
498
499	switch (optname) {
500	case SO_DEBUG:
501		if (val && !capable(CAP_NET_ADMIN))
502			ret = -EACCES;
503		else
504			sock_valbool_flag(sk, SOCK_DBG, valbool);
505		break;
506	case SO_REUSEADDR:
507		sk->sk_reuse = valbool;
508		break;
509	case SO_TYPE:
510	case SO_PROTOCOL:
511	case SO_DOMAIN:
512	case SO_ERROR:
513		ret = -ENOPROTOOPT;
514		break;
515	case SO_DONTROUTE:
516		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
517		break;
518	case SO_BROADCAST:
519		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
520		break;
521	case SO_SNDBUF:
522		/* Don't return an error on this: BSD doesn't, and if you
523		   think about it, this is right. Otherwise apps have to
524		   play 'guess the biggest size' games. RCVBUF/SNDBUF
525		   are treated in BSD as hints */
526
527		if (val > sysctl_wmem_max)
528			val = sysctl_wmem_max;
529set_sndbuf:
530		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
531		if ((val * 2) < SOCK_MIN_SNDBUF)
532			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
533		else
534			sk->sk_sndbuf = val * 2;
535
536		/*
537		 *	Wake up sending tasks if we
538		 *	upped the value.
539		 */
540		sk->sk_write_space(sk);
541		break;
542
543	case SO_SNDBUFFORCE:
544		if (!capable(CAP_NET_ADMIN)) {
545			ret = -EPERM;
546			break;
547		}
548		goto set_sndbuf;
549
550	case SO_RCVBUF:
551		/* Don't return an error on this: BSD doesn't, and if you
552		   think about it, this is right. Otherwise apps have to
553		   play 'guess the biggest size' games. RCVBUF/SNDBUF
554		   are treated in BSD as hints */
555
556		if (val > sysctl_rmem_max)
557			val = sysctl_rmem_max;
558set_rcvbuf:
559		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
560		/*
561		 * We double it on the way in to account for
562		 * "struct sk_buff" etc. overhead.   Applications
563		 * assume that the SO_RCVBUF setting they make will
564		 * allow that much actual data to be received on that
565		 * socket.
566		 *
567		 * Applications are unaware that "struct sk_buff" and
568		 * other overheads allocate from the receive buffer
569		 * during socket buffer allocation.
570		 *
571		 * And after considering the possible alternatives,
572		 * returning the value we actually used in getsockopt
573		 * is the most desirable behavior.
574		 */
575		if ((val * 2) < SOCK_MIN_RCVBUF)
576			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
577		else
578			sk->sk_rcvbuf = val * 2;
579		break;
580
581	case SO_RCVBUFFORCE:
582		if (!capable(CAP_NET_ADMIN)) {
583			ret = -EPERM;
584			break;
585		}
586		goto set_rcvbuf;
587
588	case SO_KEEPALIVE:
589#ifdef CONFIG_INET
590		if (sk->sk_protocol == IPPROTO_TCP)
591			tcp_set_keepalive(sk, valbool);
592#endif
593		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
594		break;
595
596	case SO_OOBINLINE:
597		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
598		break;
599
600	case SO_NO_CHECK:
601		sk->sk_no_check = valbool;
602		break;
603
604	case SO_PRIORITY:
605		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
606			sk->sk_priority = val;
607		else
608			ret = -EPERM;
609		break;
610
611	case SO_LINGER:
612		if (optlen < sizeof(ling)) {
613			ret = -EINVAL;	/* 1003.1g */
614			break;
615		}
616		if (copy_from_user(&ling, optval, sizeof(ling))) {
617			ret = -EFAULT;
618			break;
619		}
620		if (!ling.l_onoff)
621			sock_reset_flag(sk, SOCK_LINGER);
622		else {
623#if (BITS_PER_LONG == 32)
624			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
625				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
626			else
627#endif
628				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
629			sock_set_flag(sk, SOCK_LINGER);
630		}
631		break;
632
633	case SO_BSDCOMPAT:
634		sock_warn_obsolete_bsdism("setsockopt");
635		break;
636
637	case SO_PASSCRED:
638		if (valbool)
639			set_bit(SOCK_PASSCRED, &sock->flags);
640		else
641			clear_bit(SOCK_PASSCRED, &sock->flags);
642		break;
643
644	case SO_TIMESTAMP:
645	case SO_TIMESTAMPNS:
646		if (valbool)  {
647			if (optname == SO_TIMESTAMP)
648				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
649			else
650				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
651			sock_set_flag(sk, SOCK_RCVTSTAMP);
652			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
653		} else {
654			sock_reset_flag(sk, SOCK_RCVTSTAMP);
655			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
656		}
657		break;
658
659	case SO_TIMESTAMPING:
660		if (val & ~SOF_TIMESTAMPING_MASK) {
661			ret = -EINVAL;
662			break;
663		}
664		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
665				  val & SOF_TIMESTAMPING_TX_HARDWARE);
666		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
667				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
668		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
669				  val & SOF_TIMESTAMPING_RX_HARDWARE);
670		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
671			sock_enable_timestamp(sk,
672					      SOCK_TIMESTAMPING_RX_SOFTWARE);
673		else
674			sock_disable_timestamp(sk,
675					       SOCK_TIMESTAMPING_RX_SOFTWARE);
676		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
677				  val & SOF_TIMESTAMPING_SOFTWARE);
678		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
679				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
680		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
681				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
682		break;
683
684	case SO_RCVLOWAT:
685		if (val < 0)
686			val = INT_MAX;
687		sk->sk_rcvlowat = val ? : 1;
688		break;
689
690	case SO_RCVTIMEO:
691		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
692		break;
693
694	case SO_SNDTIMEO:
695		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
696		break;
697
698	case SO_ATTACH_FILTER:
699		ret = -EINVAL;
700		if (optlen == sizeof(struct sock_fprog)) {
701			struct sock_fprog fprog;
702
703			ret = -EFAULT;
704			if (copy_from_user(&fprog, optval, sizeof(fprog)))
705				break;
706
707			ret = sk_attach_filter(&fprog, sk);
708		}
709		break;
710
711	case SO_DETACH_FILTER:
712		ret = sk_detach_filter(sk);
713		break;
714
715	case SO_PASSSEC:
716		if (valbool)
717			set_bit(SOCK_PASSSEC, &sock->flags);
718		else
719			clear_bit(SOCK_PASSSEC, &sock->flags);
720		break;
721	case SO_MARK:
722		if (!capable(CAP_NET_ADMIN))
723			ret = -EPERM;
724		else
725			sk->sk_mark = val;
726		break;
727
728		/* We implement SO_SNDLOWAT etc. as not
729		   settable (1003.1g 5.3) */
730	case SO_RXQ_OVFL:
731		if (valbool)
732			sock_set_flag(sk, SOCK_RXQ_OVFL);
733		else
734			sock_reset_flag(sk, SOCK_RXQ_OVFL);
735		break;
736	default:
737		ret = -ENOPROTOOPT;
738		break;
739	}
740	release_sock(sk);
741	return ret;
742}
743EXPORT_SYMBOL(sock_setsockopt);
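
/*
 * Illustrative user-space sketch (not part of this file): the doubling done
 * for SO_RCVBUF in sock_setsockopt() is visible to applications.  Assuming
 * sysctl_rmem_max allows it and "fd" is an open socket:
 */
#if 0	/* example only, never compiled */
	int req = 65536, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	/* got is now 131072 (2 * req), the extra covering sk_buff overhead. */
#endif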
744
745
746int sock_getsockopt(struct socket *sock, int level, int optname,
747		    char __user *optval, int __user *optlen)
748{
749	struct sock *sk = sock->sk;
750
751	union {
752		int val;
753		struct linger ling;
754		struct timeval tm;
755	} v;
756
757	int lv = sizeof(int);
758	int len;
759
760	if (get_user(len, optlen))
761		return -EFAULT;
762	if (len < 0)
763		return -EINVAL;
764
765	memset(&v, 0, sizeof(v));
766
767	switch (optname) {
768	case SO_DEBUG:
769		v.val = sock_flag(sk, SOCK_DBG);
770		break;
771
772	case SO_DONTROUTE:
773		v.val = sock_flag(sk, SOCK_LOCALROUTE);
774		break;
775
776	case SO_BROADCAST:
777		v.val = !!sock_flag(sk, SOCK_BROADCAST);
778		break;
779
780	case SO_SNDBUF:
781		v.val = sk->sk_sndbuf;
782		break;
783
784	case SO_RCVBUF:
785		v.val = sk->sk_rcvbuf;
786		break;
787
788	case SO_REUSEADDR:
789		v.val = sk->sk_reuse;
790		break;
791
792	case SO_KEEPALIVE:
793		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
794		break;
795
796	case SO_TYPE:
797		v.val = sk->sk_type;
798		break;
799
800	case SO_PROTOCOL:
801		v.val = sk->sk_protocol;
802		break;
803
804	case SO_DOMAIN:
805		v.val = sk->sk_family;
806		break;
807
808	case SO_ERROR:
809		v.val = -sock_error(sk);
810		if (v.val == 0)
811			v.val = xchg(&sk->sk_err_soft, 0);
812		break;
813
814	case SO_OOBINLINE:
815		v.val = !!sock_flag(sk, SOCK_URGINLINE);
816		break;
817
818	case SO_NO_CHECK:
819		v.val = sk->sk_no_check;
820		break;
821
822	case SO_PRIORITY:
823		v.val = sk->sk_priority;
824		break;
825
826	case SO_LINGER:
827		lv		= sizeof(v.ling);
828		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
829		v.ling.l_linger	= sk->sk_lingertime / HZ;
830		break;
831
832	case SO_BSDCOMPAT:
833		sock_warn_obsolete_bsdism("getsockopt");
834		break;
835
836	case SO_TIMESTAMP:
837		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
838				!sock_flag(sk, SOCK_RCVTSTAMPNS);
839		break;
840
841	case SO_TIMESTAMPNS:
842		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
843		break;
844
845	case SO_TIMESTAMPING:
846		v.val = 0;
847		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
848			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
849		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
850			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
851		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
852			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
853		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
854			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
855		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
856			v.val |= SOF_TIMESTAMPING_SOFTWARE;
857		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
858			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
859		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
860			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
861		break;
862
863	case SO_RCVTIMEO:
864		lv = sizeof(struct timeval);
865		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
866			v.tm.tv_sec = 0;
867			v.tm.tv_usec = 0;
868		} else {
869			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
870			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
871		}
872		break;
873
874	case SO_SNDTIMEO:
875		lv = sizeof(struct timeval);
876		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
877			v.tm.tv_sec = 0;
878			v.tm.tv_usec = 0;
879		} else {
880			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
881			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
882		}
883		break;
884
885	case SO_RCVLOWAT:
886		v.val = sk->sk_rcvlowat;
887		break;
888
889	case SO_SNDLOWAT:
890		v.val = 1;
891		break;
892
893	case SO_PASSCRED:
894		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
895		break;
896
897	case SO_PEERCRED:
898		if (len > sizeof(sk->sk_peercred))
899			len = sizeof(sk->sk_peercred);
900		if (copy_to_user(optval, &sk->sk_peercred, len))
901			return -EFAULT;
902		goto lenout;
903
904	case SO_PEERNAME:
905	{
906		char address[128];
907
908		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
909			return -ENOTCONN;
910		if (lv < len)
911			return -EINVAL;
912		if (copy_to_user(optval, address, len))
913			return -EFAULT;
914		goto lenout;
915	}
916
917	/* Dubious BSD thing... Probably nobody even uses it, but
918	 * the UNIX standard wants it for whatever reason... -DaveM
919	 */
920	case SO_ACCEPTCONN:
921		v.val = sk->sk_state == TCP_LISTEN;
922		break;
923
924	case SO_PASSSEC:
925		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
926		break;
927
928	case SO_PEERSEC:
929		return security_socket_getpeersec_stream(sock, optval, optlen, len);
930
931	case SO_MARK:
932		v.val = sk->sk_mark;
933		break;
934
935	case SO_RXQ_OVFL:
936		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
937		break;
938
939	default:
940		return -ENOPROTOOPT;
941	}
942
943	if (len > lv)
944		len = lv;
945	if (copy_to_user(optval, &v, len))
946		return -EFAULT;
947lenout:
948	if (put_user(len, optlen))
949		return -EFAULT;
950	return 0;
951}
952
953/*
954 * Initialize an sk_lock.
955 *
956 * (We also register the sk_lock with the lock validator.)
957 */
958static inline void sock_lock_init(struct sock *sk)
959{
960	sock_lock_init_class_and_name(sk,
961			af_family_slock_key_strings[sk->sk_family],
962			af_family_slock_keys + sk->sk_family,
963			af_family_key_strings[sk->sk_family],
964			af_family_keys + sk->sk_family);
965}
966
967/*
968 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
969 * even temporarily, because of RCU lookups. sk_node should also be left as is.
970 */
971static void sock_copy(struct sock *nsk, const struct sock *osk)
972{
973#ifdef CONFIG_SECURITY_NETWORK
974	void *sptr = nsk->sk_security;
975#endif
976	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
977		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
978		     sizeof(osk->sk_tx_queue_mapping));
979	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
980	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
981#ifdef CONFIG_SECURITY_NETWORK
982	nsk->sk_security = sptr;
983	security_sk_clone(osk, nsk);
984#endif
985}
986
987static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
988		int family)
989{
990	struct sock *sk;
991	struct kmem_cache *slab;
992
993	slab = prot->slab;
994	if (slab != NULL) {
995		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
996		if (!sk)
997			return sk;
998		if (priority & __GFP_ZERO) {
999			/*
1000			 * caches using SLAB_DESTROY_BY_RCU should leave
1001			 * sk_node.next unmodified. Special care is taken
1002			 * when initializing the object to zero.
1003			 */
1004			if (offsetof(struct sock, sk_node.next) != 0)
1005				memset(sk, 0, offsetof(struct sock, sk_node.next));
1006			memset(&sk->sk_node.pprev, 0,
1007			       prot->obj_size - offsetof(struct sock,
1008							 sk_node.pprev));
1009		}
1010	}
1011	else
1012		sk = kmalloc(prot->obj_size, priority);
1013
1014	if (sk != NULL) {
1015		kmemcheck_annotate_bitfield(sk, flags);
1016
1017		if (security_sk_alloc(sk, family, priority))
1018			goto out_free;
1019
1020		if (!try_module_get(prot->owner))
1021			goto out_free_sec;
1022		sk_tx_queue_clear(sk);
1023	}
1024
1025	return sk;
1026
1027out_free_sec:
1028	security_sk_free(sk);
1029out_free:
1030	if (slab != NULL)
1031		kmem_cache_free(slab, sk);
1032	else
1033		kfree(sk);
1034	return NULL;
1035}
1036
1037static void sk_prot_free(struct proto *prot, struct sock *sk)
1038{
1039	struct kmem_cache *slab;
1040	struct module *owner;
1041
1042	owner = prot->owner;
1043	slab = prot->slab;
1044
1045	security_sk_free(sk);
1046	if (slab != NULL)
1047		kmem_cache_free(slab, sk);
1048	else
1049		kfree(sk);
1050	module_put(owner);
1051}
1052
1053/**
1054 *	sk_alloc - All socket objects are allocated here
1055 *	@net: the applicable net namespace
1056 *	@family: protocol family
1057 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1058 *	@prot: struct proto associated with this new sock instance
1059 */
1060struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1061		      struct proto *prot)
1062{
1063	struct sock *sk;
1064
1065	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1066	if (sk) {
1067		sk->sk_family = family;
1068		/*
1069		 * See comment in struct sock definition to understand
1070		 * why we need sk_prot_creator -acme
1071		 */
1072		sk->sk_prot = sk->sk_prot_creator = prot;
1073		sock_lock_init(sk);
1074		sock_net_set(sk, get_net(net));
1075		atomic_set(&sk->sk_wmem_alloc, 1);
1076	}
1077
1078	return sk;
1079}
1080EXPORT_SYMBOL(sk_alloc);
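
/*
 * Usage sketch (illustrative): a protocol's ->create() handler typically
 * pairs sk_alloc() with sock_init_data(), roughly as below.  "my_proto" is
 * a placeholder for the protocol's struct proto.
 */
#if 0	/* example only, never compiled */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
	if (!sk)
		return -ENOBUFS;
	sock_init_data(sock, sk);
	/* On failure later in setup, release the socket with sk_free(sk). */
#endif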
1081
1082static void __sk_free(struct sock *sk)
1083{
1084	struct sk_filter *filter;
1085
1086	if (sk->sk_destruct)
1087		sk->sk_destruct(sk);
1088
1089	filter = rcu_dereference_check(sk->sk_filter,
1090				       atomic_read(&sk->sk_wmem_alloc) == 0);
1091	if (filter) {
1092		sk_filter_uncharge(sk, filter);
1093		rcu_assign_pointer(sk->sk_filter, NULL);
1094	}
1095
1096	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1097	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1098
1099	if (atomic_read(&sk->sk_omem_alloc))
1100		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1101		       __func__, atomic_read(&sk->sk_omem_alloc));
1102
1103	put_net(sock_net(sk));
1104	sk_prot_free(sk->sk_prot_creator, sk);
1105}
1106
1107void sk_free(struct sock *sk)
1108{
1109	/*
1110	 * We subtract one from sk_wmem_alloc so we can tell whether
1111	 * some packets are still in some tx queue.
1112	 * If not zero, sock_wfree() will call __sk_free(sk) later
1113	 */
1114	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1115		__sk_free(sk);
1116}
1117EXPORT_SYMBOL(sk_free);
1118
1119/*
1120 * The last sock_put should drop the reference to sk->sk_net. It has already
1121 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1122 * is not an option.
1123 * Take a reference to the socket to remove it from the hash _alive_, and after
1124 * that destroy it in the context of init_net.
1125 */
1126void sk_release_kernel(struct sock *sk)
1127{
1128	if (sk == NULL || sk->sk_socket == NULL)
1129		return;
1130
1131	sock_hold(sk);
1132	sock_release(sk->sk_socket);
1133	release_net(sock_net(sk));
1134	sock_net_set(sk, get_net(&init_net));
1135	sock_put(sk);
1136}
1137EXPORT_SYMBOL(sk_release_kernel);
1138
1139struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1140{
1141	struct sock *newsk;
1142
1143	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1144	if (newsk != NULL) {
1145		struct sk_filter *filter;
1146
1147		sock_copy(newsk, sk);
1148
1149		/* SANITY */
1150		get_net(sock_net(newsk));
1151		sk_node_init(&newsk->sk_node);
1152		sock_lock_init(newsk);
1153		bh_lock_sock(newsk);
1154		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1155		newsk->sk_backlog.len = 0;
1156
1157		atomic_set(&newsk->sk_rmem_alloc, 0);
1158		/*
1159		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1160		 */
1161		atomic_set(&newsk->sk_wmem_alloc, 1);
1162		atomic_set(&newsk->sk_omem_alloc, 0);
1163		skb_queue_head_init(&newsk->sk_receive_queue);
1164		skb_queue_head_init(&newsk->sk_write_queue);
1165#ifdef CONFIG_NET_DMA
1166		skb_queue_head_init(&newsk->sk_async_wait_queue);
1167#endif
1168
1169		spin_lock_init(&newsk->sk_dst_lock);
1170		rwlock_init(&newsk->sk_callback_lock);
1171		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1172				af_callback_keys + newsk->sk_family,
1173				af_family_clock_key_strings[newsk->sk_family]);
1174
1175		newsk->sk_dst_cache	= NULL;
1176		newsk->sk_wmem_queued	= 0;
1177		newsk->sk_forward_alloc = 0;
1178		newsk->sk_send_head	= NULL;
1179		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1180
1181		sock_reset_flag(newsk, SOCK_DONE);
1182		skb_queue_head_init(&newsk->sk_error_queue);
1183
1184		filter = newsk->sk_filter;
1185		if (filter != NULL)
1186			sk_filter_charge(newsk, filter);
1187
1188		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1189			/* It is still a raw copy of the parent, so invalidate
1190			 * the destructor and do a plain sk_free() */
1191			newsk->sk_destruct = NULL;
1192			sk_free(newsk);
1193			newsk = NULL;
1194			goto out;
1195		}
1196
1197		newsk->sk_err	   = 0;
1198		newsk->sk_priority = 0;
1199		/*
1200		 * Before updating sk_refcnt, we must commit prior changes to memory
1201		 * (Documentation/RCU/rculist_nulls.txt for details)
1202		 */
1203		smp_wmb();
1204		atomic_set(&newsk->sk_refcnt, 2);
1205
1206		/*
1207		 * Increment the counter in the same struct proto as the master
1208		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1209		 * is the same as sk->sk_prot->socks, as this field was copied
1210		 * with memcpy).
1211		 *
1212		 * This _changes_ the previous behaviour, where
1213		 * tcp_create_openreq_child always incremented the
1214		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1215		 * to be taken into account in all callers. -acme
1216		 */
1217		sk_refcnt_debug_inc(newsk);
1218		sk_set_socket(newsk, NULL);
1219		newsk->sk_wq = NULL;
1220
1221		if (newsk->sk_prot->sockets_allocated)
1222			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1223
1224		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1225		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1226			net_enable_timestamp();
1227	}
1228out:
1229	return newsk;
1230}
1231EXPORT_SYMBOL_GPL(sk_clone);
1232
1233void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1234{
1235	__sk_dst_set(sk, dst);
1236	sk->sk_route_caps = dst->dev->features;
1237	if (sk->sk_route_caps & NETIF_F_GSO)
1238		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1239	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1240	if (sk_can_gso(sk)) {
1241		if (dst->header_len) {
1242			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1243		} else {
1244			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1245			sk->sk_gso_max_size = dst->dev->gso_max_size;
1246		}
1247	}
1248}
1249EXPORT_SYMBOL_GPL(sk_setup_caps);
1250
1251void __init sk_init(void)
1252{
1253	if (totalram_pages <= 4096) {
1254		sysctl_wmem_max = 32767;
1255		sysctl_rmem_max = 32767;
1256		sysctl_wmem_default = 32767;
1257		sysctl_rmem_default = 32767;
1258	} else if (totalram_pages >= 131072) {
1259		sysctl_wmem_max = 131071;
1260		sysctl_rmem_max = 131071;
1261	}
1262}
1263
1264/*
1265 *	Simple resource managers for sockets.
1266 */
1267
1268
1269/*
1270 * Write buffer destructor automatically called from kfree_skb.
1271 */
1272void sock_wfree(struct sk_buff *skb)
1273{
1274	struct sock *sk = skb->sk;
1275	unsigned int len = skb->truesize;
1276
1277	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1278		/*
1279		 * Keep a reference on sk_wmem_alloc; it will be released
1280		 * after the sk_write_space() call
1281		 */
1282		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1283		sk->sk_write_space(sk);
1284		len = 1;
1285	}
1286	/*
1287	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1288	 * could not do because of in-flight packets
1289	 */
1290	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1291		__sk_free(sk);
1292}
1293EXPORT_SYMBOL(sock_wfree);
1294
1295/*
1296 * Read buffer destructor automatically called from kfree_skb.
1297 */
1298void sock_rfree(struct sk_buff *skb)
1299{
1300	struct sock *sk = skb->sk;
1301
1302	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1303	sk_mem_uncharge(skb->sk, skb->truesize);
1304}
1305EXPORT_SYMBOL(sock_rfree);
1306
1307
1308int sock_i_uid(struct sock *sk)
1309{
1310	int uid;
1311
1312	read_lock(&sk->sk_callback_lock);
1313	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1314	read_unlock(&sk->sk_callback_lock);
1315	return uid;
1316}
1317EXPORT_SYMBOL(sock_i_uid);
1318
1319unsigned long sock_i_ino(struct sock *sk)
1320{
1321	unsigned long ino;
1322
1323	read_lock(&sk->sk_callback_lock);
1324	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1325	read_unlock(&sk->sk_callback_lock);
1326	return ino;
1327}
1328EXPORT_SYMBOL(sock_i_ino);
1329
1330/*
1331 * Allocate a skb from the socket's send buffer.
1332 */
1333struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1334			     gfp_t priority)
1335{
1336	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1337		struct sk_buff *skb = alloc_skb(size, priority);
1338		if (skb) {
1339			skb_set_owner_w(skb, sk);
1340			return skb;
1341		}
1342	}
1343	return NULL;
1344}
1345EXPORT_SYMBOL(sock_wmalloc);
1346
1347/*
1348 * Allocate a skb from the socket's receive buffer.
1349 */
1350struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1351			     gfp_t priority)
1352{
1353	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1354		struct sk_buff *skb = alloc_skb(size, priority);
1355		if (skb) {
1356			skb_set_owner_r(skb, sk);
1357			return skb;
1358		}
1359	}
1360	return NULL;
1361}
1362
1363/*
1364 * Allocate a memory block from the socket's option memory buffer.
1365 */
1366void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1367{
1368	if ((unsigned)size <= sysctl_optmem_max &&
1369	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1370		void *mem;
1371		/* First do the add, to avoid the race if kmalloc
1372		 * might sleep.
1373		 */
1374		atomic_add(size, &sk->sk_omem_alloc);
1375		mem = kmalloc(size, priority);
1376		if (mem)
1377			return mem;
1378		atomic_sub(size, &sk->sk_omem_alloc);
1379	}
1380	return NULL;
1381}
1382EXPORT_SYMBOL(sock_kmalloc);
1383
1384/*
1385 * Free an option memory block.
1386 */
1387void sock_kfree_s(struct sock *sk, void *mem, int size)
1388{
1389	kfree(mem);
1390	atomic_sub(size, &sk->sk_omem_alloc);
1391}
1392EXPORT_SYMBOL(sock_kfree_s);
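
/*
 * Usage sketch (illustrative): option memory must be released with the same
 * size it was charged with.  "struct my_opt" is a hypothetical per-socket
 * blob, not a real type.
 */
#if 0	/* example only, never compiled */
	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	/* ... use opt ... */
	sock_kfree_s(sk, opt, sizeof(*opt));
#endif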
1393
1394/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1395   I think these locks should be removed for datagram sockets.
1396 */
1397static long sock_wait_for_wmem(struct sock *sk, long timeo)
1398{
1399	DEFINE_WAIT(wait);
1400
1401	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1402	for (;;) {
1403		if (!timeo)
1404			break;
1405		if (signal_pending(current))
1406			break;
1407		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1408		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1409		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1410			break;
1411		if (sk->sk_shutdown & SEND_SHUTDOWN)
1412			break;
1413		if (sk->sk_err)
1414			break;
1415		timeo = schedule_timeout(timeo);
1416	}
1417	finish_wait(sk_sleep(sk), &wait);
1418	return timeo;
1419}
1420
1421
1422/*
1423 *	Generic send/receive buffer handlers
1424 */
1425
1426struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1427				     unsigned long data_len, int noblock,
1428				     int *errcode)
1429{
1430	struct sk_buff *skb;
1431	gfp_t gfp_mask;
1432	long timeo;
1433	int err;
1434
1435	gfp_mask = sk->sk_allocation;
1436	if (gfp_mask & __GFP_WAIT)
1437		gfp_mask |= __GFP_REPEAT;
1438
1439	timeo = sock_sndtimeo(sk, noblock);
1440	while (1) {
1441		err = sock_error(sk);
1442		if (err != 0)
1443			goto failure;
1444
1445		err = -EPIPE;
1446		if (sk->sk_shutdown & SEND_SHUTDOWN)
1447			goto failure;
1448
1449		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1450			skb = alloc_skb(header_len, gfp_mask);
1451			if (skb) {
1452				int npages;
1453				int i;
1454
1455				/* No pages, we're done... */
1456				if (!data_len)
1457					break;
1458
1459				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1460				skb->truesize += data_len;
1461				skb_shinfo(skb)->nr_frags = npages;
1462				for (i = 0; i < npages; i++) {
1463					struct page *page;
1464					skb_frag_t *frag;
1465
1466					page = alloc_pages(sk->sk_allocation, 0);
1467					if (!page) {
1468						err = -ENOBUFS;
1469						skb_shinfo(skb)->nr_frags = i;
1470						kfree_skb(skb);
1471						goto failure;
1472					}
1473
1474					frag = &skb_shinfo(skb)->frags[i];
1475					frag->page = page;
1476					frag->page_offset = 0;
1477					frag->size = (data_len >= PAGE_SIZE ?
1478						      PAGE_SIZE :
1479						      data_len);
1480					data_len -= PAGE_SIZE;
1481				}
1482
1483				/* Full success... */
1484				break;
1485			}
1486			err = -ENOBUFS;
1487			goto failure;
1488		}
1489		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1490		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1491		err = -EAGAIN;
1492		if (!timeo)
1493			goto failure;
1494		if (signal_pending(current))
1495			goto interrupted;
1496		timeo = sock_wait_for_wmem(sk, timeo);
1497	}
1498
1499	skb_set_owner_w(skb, sk);
1500	return skb;
1501
1502interrupted:
1503	err = sock_intr_errno(timeo);
1504failure:
1505	*errcode = err;
1506	return NULL;
1507}
1508EXPORT_SYMBOL(sock_alloc_send_pskb);
1509
1510struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1511				    int noblock, int *errcode)
1512{
1513	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1514}
1515EXPORT_SYMBOL(sock_alloc_send_skb);
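
/*
 * Usage sketch (illustrative): datagram sendmsg() implementations commonly
 * wrap sock_alloc_send_skb() like this; "reserve" stands for whatever
 * headroom the protocol needs.
 */
#if 0	/* example only, never compiled */
	skb = sock_alloc_send_skb(sk, len + reserve,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		goto out;	/* err already holds -EAGAIN, -EPIPE, ... */
#endif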
1516
1517static void __lock_sock(struct sock *sk)
1518{
1519	DEFINE_WAIT(wait);
1520
1521	for (;;) {
1522		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1523					TASK_UNINTERRUPTIBLE);
1524		spin_unlock_bh(&sk->sk_lock.slock);
1525		schedule();
1526		spin_lock_bh(&sk->sk_lock.slock);
1527		if (!sock_owned_by_user(sk))
1528			break;
1529	}
1530	finish_wait(&sk->sk_lock.wq, &wait);
1531}
1532
1533static void __release_sock(struct sock *sk)
1534{
1535	struct sk_buff *skb = sk->sk_backlog.head;
1536
1537	do {
1538		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1539		bh_unlock_sock(sk);
1540
1541		do {
1542			struct sk_buff *next = skb->next;
1543
1544			WARN_ON_ONCE(skb_dst_is_noref(skb));
1545			skb->next = NULL;
1546			sk_backlog_rcv(sk, skb);
1547
1548			/*
1549			 * We are in process context here with softirqs
1550			 * disabled; use cond_resched_softirq() to preempt.
1551			 * This is safe to do because we've taken the backlog
1552			 * queue private:
1553			 */
1554			cond_resched_softirq();
1555
1556			skb = next;
1557		} while (skb != NULL);
1558
1559		bh_lock_sock(sk);
1560	} while ((skb = sk->sk_backlog.head) != NULL);
1561
1562	/*
1563	 * Doing the zeroing here guarantees we cannot loop forever
1564	 * while a wild producer attempts to flood us.
1565	 */
1566	sk->sk_backlog.len = 0;
1567}
1568
1569/**
1570 * sk_wait_data - wait for data to arrive at sk_receive_queue
1571 * @sk:    sock to wait on
1572 * @timeo: for how long
1573 *
1574 * Now socket state including sk->sk_err is changed only under lock,
1575 * hence we may omit checks after joining wait queue.
1576 * We check the receive queue before schedule() only as an optimization;
1577 * it is very likely that release_sock() added new data.
1578 */
1579int sk_wait_data(struct sock *sk, long *timeo)
1580{
1581	int rc;
1582	DEFINE_WAIT(wait);
1583
1584	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1585	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1586	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1587	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1588	finish_wait(sk_sleep(sk), &wait);
1589	return rc;
1590}
1591EXPORT_SYMBOL(sk_wait_data);
1592
1593/**
1594 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1595 *	@sk: socket
1596 *	@size: memory size to allocate
1597 *	@kind: allocation type
1598 *
1599 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1600 *	rmem allocation. This function assumes that protocols which have
1601 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1602 */
1603int __sk_mem_schedule(struct sock *sk, int size, int kind)
1604{
1605	struct proto *prot = sk->sk_prot;
1606	int amt = sk_mem_pages(size);
1607	int allocated;
1608
1609	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1610	allocated = atomic_add_return(amt, prot->memory_allocated);
1611
1612	/* Under limit. */
1613	if (allocated <= prot->sysctl_mem[0]) {
1614		if (prot->memory_pressure && *prot->memory_pressure)
1615			*prot->memory_pressure = 0;
1616		return 1;
1617	}
1618
1619	/* Under pressure. */
1620	if (allocated > prot->sysctl_mem[1])
1621		if (prot->enter_memory_pressure)
1622			prot->enter_memory_pressure(sk);
1623
1624	/* Over hard limit. */
1625	if (allocated > prot->sysctl_mem[2])
1626		goto suppress_allocation;
1627
1628	/* guarantee minimum buffer size under pressure */
1629	if (kind == SK_MEM_RECV) {
1630		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1631			return 1;
1632	} else { /* SK_MEM_SEND */
1633		if (sk->sk_type == SOCK_STREAM) {
1634			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1635				return 1;
1636		} else if (atomic_read(&sk->sk_wmem_alloc) <
1637			   prot->sysctl_wmem[0])
1638				return 1;
1639	}
1640
1641	if (prot->memory_pressure) {
1642		int alloc;
1643
1644		if (!*prot->memory_pressure)
1645			return 1;
1646		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1647		if (prot->sysctl_mem[2] > alloc *
1648		    sk_mem_pages(sk->sk_wmem_queued +
1649				 atomic_read(&sk->sk_rmem_alloc) +
1650				 sk->sk_forward_alloc))
1651			return 1;
1652	}
1653
1654suppress_allocation:
1655
1656	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1657		sk_stream_moderate_sndbuf(sk);
1658
1659		/* Fail only if socket is _under_ its sndbuf.
1660		 * In this case we cannot block, so that we have to fail.
1661		 * In this case we cannot block, so we have to fail.
1662		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1663			return 1;
1664	}
1665
1666	/* Alas. Undo changes. */
1667	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1668	atomic_sub(amt, prot->memory_allocated);
1669	return 0;
1670}
1671EXPORT_SYMBOL(__sk_mem_schedule);
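
/*
 * Accounting sketch (assumes SK_MEM_QUANTUM == PAGE_SIZE == 4096;
 * illustrative only): charging size = 3000 gives amt = sk_mem_pages(3000)
 * = 1, so sk_forward_alloc grows by 4096 and prot->memory_allocated by one
 * quantum.  Once the caller charges the 3000 bytes, the remaining 1096
 * bytes of forward allocation can absorb later charges without another
 * call here.
 */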
1672
1673/**
1674 *	__sk_mem_reclaim - reclaim memory_allocated
1675 *	@sk: socket
1676 */
1677void __sk_mem_reclaim(struct sock *sk)
1678{
1679	struct proto *prot = sk->sk_prot;
1680
1681	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1682		   prot->memory_allocated);
1683	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1684
1685	if (prot->memory_pressure && *prot->memory_pressure &&
1686	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1687		*prot->memory_pressure = 0;
1688}
1689EXPORT_SYMBOL(__sk_mem_reclaim);
1690
1691
1692/*
1693 * Set of default routines for initialising struct proto_ops when
1694 * the protocol does not support a particular function. In certain
1695 * cases where it makes no sense for a protocol to have a "do nothing"
1696 * function, some default processing is provided.
1697 */
1698
1699int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1700{
1701	return -EOPNOTSUPP;
1702}
1703EXPORT_SYMBOL(sock_no_bind);
1704
1705int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1706		    int len, int flags)
1707{
1708	return -EOPNOTSUPP;
1709}
1710EXPORT_SYMBOL(sock_no_connect);
1711
1712int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1713{
1714	return -EOPNOTSUPP;
1715}
1716EXPORT_SYMBOL(sock_no_socketpair);
1717
1718int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1719{
1720	return -EOPNOTSUPP;
1721}
1722EXPORT_SYMBOL(sock_no_accept);
1723
1724int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1725		    int *len, int peer)
1726{
1727	return -EOPNOTSUPP;
1728}
1729EXPORT_SYMBOL(sock_no_getname);
1730
1731unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1732{
1733	return 0;
1734}
1735EXPORT_SYMBOL(sock_no_poll);
1736
1737int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1738{
1739	return -EOPNOTSUPP;
1740}
1741EXPORT_SYMBOL(sock_no_ioctl);
1742
1743int sock_no_listen(struct socket *sock, int backlog)
1744{
1745	return -EOPNOTSUPP;
1746}
1747EXPORT_SYMBOL(sock_no_listen);
1748
1749int sock_no_shutdown(struct socket *sock, int how)
1750{
1751	return -EOPNOTSUPP;
1752}
1753EXPORT_SYMBOL(sock_no_shutdown);
1754
1755int sock_no_setsockopt(struct socket *sock, int level, int optname,
1756		    char __user *optval, unsigned int optlen)
1757{
1758	return -EOPNOTSUPP;
1759}
1760EXPORT_SYMBOL(sock_no_setsockopt);
1761
1762int sock_no_getsockopt(struct socket *sock, int level, int optname,
1763		    char __user *optval, int __user *optlen)
1764{
1765	return -EOPNOTSUPP;
1766}
1767EXPORT_SYMBOL(sock_no_getsockopt);
1768
1769int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1770		    size_t len)
1771{
1772	return -EOPNOTSUPP;
1773}
1774EXPORT_SYMBOL(sock_no_sendmsg);
1775
1776int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1777		    size_t len, int flags)
1778{
1779	return -EOPNOTSUPP;
1780}
1781EXPORT_SYMBOL(sock_no_recvmsg);
1782
1783int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1784{
1785	/* Mirror missing mmap method error code */
1786	return -ENODEV;
1787}
1788EXPORT_SYMBOL(sock_no_mmap);
1789
1790ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1791{
1792	ssize_t res;
1793	struct msghdr msg = {.msg_flags = flags};
1794	struct kvec iov;
1795	char *kaddr = kmap(page);
1796	iov.iov_base = kaddr + offset;
1797	iov.iov_len = size;
1798	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1799	kunmap(page);
1800	return res;
1801}
1802EXPORT_SYMBOL(sock_no_sendpage);
1803
1804/*
1805 *	Default Socket Callbacks
1806 */
1807
1808static void sock_def_wakeup(struct sock *sk)
1809{
1810	struct socket_wq *wq;
1811
1812	rcu_read_lock();
1813	wq = rcu_dereference(sk->sk_wq);
1814	if (wq_has_sleeper(wq))
1815		wake_up_interruptible_all(&wq->wait);
1816	rcu_read_unlock();
1817}
1818
1819static void sock_def_error_report(struct sock *sk)
1820{
1821	struct socket_wq *wq;
1822
1823	rcu_read_lock();
1824	wq = rcu_dereference(sk->sk_wq);
1825	if (wq_has_sleeper(wq))
1826		wake_up_interruptible_poll(&wq->wait, POLLERR);
1827	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1828	rcu_read_unlock();
1829}
1830
1831static void sock_def_readable(struct sock *sk, int len)
1832{
1833	struct socket_wq *wq;
1834
1835	rcu_read_lock();
1836	wq = rcu_dereference(sk->sk_wq);
1837	if (wq_has_sleeper(wq))
1838		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
1839						POLLRDNORM | POLLRDBAND);
1840	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1841	rcu_read_unlock();
1842}
1843
1844static void sock_def_write_space(struct sock *sk)
1845{
1846	struct socket_wq *wq;
1847
1848	rcu_read_lock();
1849
1850	/* Do not wake up a writer until he can make "significant"
1851	 * progress.  --DaveM
1852	 */
1853	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1854		wq = rcu_dereference(sk->sk_wq);
1855		if (wq_has_sleeper(wq))
1856			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1857						POLLWRNORM | POLLWRBAND);
1858
1859		/* Should agree with poll, otherwise some programs break */
1860		if (sock_writeable(sk))
1861			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1862	}
1863
1864	rcu_read_unlock();
1865}
1866
1867static void sock_def_destruct(struct sock *sk)
1868{
1869	kfree(sk->sk_protinfo);
1870}
1871
1872void sk_send_sigurg(struct sock *sk)
1873{
1874	if (sk->sk_socket && sk->sk_socket->file)
1875		if (send_sigurg(&sk->sk_socket->file->f_owner))
1876			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1877}
1878EXPORT_SYMBOL(sk_send_sigurg);
1879
1880void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1881		    unsigned long expires)
1882{
1883	if (!mod_timer(timer, expires))
1884		sock_hold(sk);
1885}
1886EXPORT_SYMBOL(sk_reset_timer);
1887
1888void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1889{
1890	if (timer_pending(timer) && del_timer(timer))
1891		__sock_put(sk);
1892}
1893EXPORT_SYMBOL(sk_stop_timer);
1894
1895void sock_init_data(struct socket *sock, struct sock *sk)
1896{
1897	skb_queue_head_init(&sk->sk_receive_queue);
1898	skb_queue_head_init(&sk->sk_write_queue);
1899	skb_queue_head_init(&sk->sk_error_queue);
1900#ifdef CONFIG_NET_DMA
1901	skb_queue_head_init(&sk->sk_async_wait_queue);
1902#endif
1903
1904	sk->sk_send_head	=	NULL;
1905
1906	init_timer(&sk->sk_timer);
1907
1908	sk->sk_allocation	=	GFP_KERNEL;
1909	sk->sk_rcvbuf		=	sysctl_rmem_default;
1910	sk->sk_sndbuf		=	sysctl_wmem_default;
1911	sk->sk_state		=	TCP_CLOSE;
1912	sk_set_socket(sk, sock);
1913
1914	sock_set_flag(sk, SOCK_ZAPPED);
1915
1916	if (sock) {
1917		sk->sk_type	=	sock->type;
1918		sk->sk_wq	=	sock->wq;
1919		sock->sk	=	sk;
1920	} else
1921		sk->sk_wq	=	NULL;
1922
1923	spin_lock_init(&sk->sk_dst_lock);
1924	rwlock_init(&sk->sk_callback_lock);
1925	lockdep_set_class_and_name(&sk->sk_callback_lock,
1926			af_callback_keys + sk->sk_family,
1927			af_family_clock_key_strings[sk->sk_family]);
1928
1929	sk->sk_state_change	=	sock_def_wakeup;
1930	sk->sk_data_ready	=	sock_def_readable;
1931	sk->sk_write_space	=	sock_def_write_space;
1932	sk->sk_error_report	=	sock_def_error_report;
1933	sk->sk_destruct		=	sock_def_destruct;
1934
1935	sk->sk_sndmsg_page	=	NULL;
1936	sk->sk_sndmsg_off	=	0;
1937
1938	sk->sk_peercred.pid 	=	0;
1939	sk->sk_peercred.uid	=	-1;
1940	sk->sk_peercred.gid	=	-1;
1941	sk->sk_write_pending	=	0;
1942	sk->sk_rcvlowat		=	1;
1943	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1944	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1945
1946	sk->sk_stamp = ktime_set(-1L, 0);
1947
1948	/*
1949	 * Before updating sk_refcnt, we must commit prior changes to memory
1950	 * (Documentation/RCU/rculist_nulls.txt for details)
1951	 */
1952	smp_wmb();
1953	atomic_set(&sk->sk_refcnt, 1);
1954	atomic_set(&sk->sk_drops, 0);
1955}
1956EXPORT_SYMBOL(sock_init_data);
1957
1958void lock_sock_nested(struct sock *sk, int subclass)
1959{
1960	might_sleep();
1961	spin_lock_bh(&sk->sk_lock.slock);
1962	if (sk->sk_lock.owned)
1963		__lock_sock(sk);
1964	sk->sk_lock.owned = 1;
1965	spin_unlock(&sk->sk_lock.slock);
1966	/*
1967	 * The sk_lock has mutex_lock() semantics here:
1968	 */
1969	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1970	local_bh_enable();
1971}
1972EXPORT_SYMBOL(lock_sock_nested);
1973
1974void release_sock(struct sock *sk)
1975{
1976	/*
1977	 * The sk_lock has mutex_unlock() semantics:
1978	 */
1979	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1980
1981	spin_lock_bh(&sk->sk_lock.slock);
1982	if (sk->sk_backlog.tail)
1983		__release_sock(sk);
1984	sk->sk_lock.owned = 0;
1985	if (waitqueue_active(&sk->sk_lock.wq))
1986		wake_up(&sk->sk_lock.wq);
1987	spin_unlock_bh(&sk->sk_lock.slock);
1988}
1989EXPORT_SYMBOL(release_sock);
1990
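/*
 * Report the receive timestamp of the most recently timestamped packet to
 * user space; this is what the SIOCGSTAMP / SIOCGSTAMPNS ioctls end up
 * calling.  If timestamping was not enabled yet it is turned on now; a stamp
 * of -1 means nothing has been timestamped (-ENOENT), and a stamp of 0 is
 * replaced by the current wall-clock time.
 */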
1991int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1992{
1993	struct timeval tv;
1994	if (!sock_flag(sk, SOCK_TIMESTAMP))
1995		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1996	tv = ktime_to_timeval(sk->sk_stamp);
1997	if (tv.tv_sec == -1)
1998		return -ENOENT;
1999	if (tv.tv_sec == 0) {
2000		sk->sk_stamp = ktime_get_real();
2001		tv = ktime_to_timeval(sk->sk_stamp);
2002	}
2003	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2004}
2005EXPORT_SYMBOL(sock_get_timestamp);
2006
2007int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2008{
2009	struct timespec ts;
2010	if (!sock_flag(sk, SOCK_TIMESTAMP))
2011		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2012	ts = ktime_to_timespec(sk->sk_stamp);
2013	if (ts.tv_sec == -1)
2014		return -ENOENT;
2015	if (ts.tv_sec == 0) {
2016		sk->sk_stamp = ktime_get_real();
2017		ts = ktime_to_timespec(sk->sk_stamp);
2018	}
2019	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2020}
2021EXPORT_SYMBOL(sock_get_timestampns);
2022
2023void sock_enable_timestamp(struct sock *sk, int flag)
2024{
2025	if (!sock_flag(sk, flag)) {
2026		sock_set_flag(sk, flag);
2027		/*
2028		 * We just set one of the two flags that require net time
2029		 * stamping; time stamping might already have been enabled
2030		 * because of the other one.
2031		 */
2032		if (!sock_flag(sk,
2033				flag == SOCK_TIMESTAMP ?
2034				SOCK_TIMESTAMPING_RX_SOFTWARE :
2035				SOCK_TIMESTAMP))
2036			net_enable_timestamp();
2037	}
2038}
2039
2040/*
2041 *	Get a socket option on a socket.
2042 *
2043 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2044 *	asynchronous errors should be reported by getsockopt. We assume
2045 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2046 */
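/*
 * For example, user space commonly fetches such a pending asynchronous error
 * after a poll()/select() wakeup on a non-blocking connect().  Illustrative
 * user-space sketch (not kernel code):
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
 *		handle the errno-style failure stored in "err";
 */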
2047int sock_common_getsockopt(struct socket *sock, int level, int optname,
2048			   char __user *optval, int __user *optlen)
2049{
2050	struct sock *sk = sock->sk;
2051
2052	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2053}
2054EXPORT_SYMBOL(sock_common_getsockopt);
2055
2056#ifdef CONFIG_COMPAT
2057int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2058				  char __user *optval, int __user *optlen)
2059{
2060	struct sock *sk = sock->sk;
2061
2062	if (sk->sk_prot->compat_getsockopt != NULL)
2063		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2064						      optval, optlen);
2065	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2066}
2067EXPORT_SYMBOL(compat_sock_common_getsockopt);
2068#endif
2069
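/*
 * Generic recvmsg glue shared by several protocols: call the protocol's
 * ->recvmsg() and, on success, copy the length of the source address it
 * filled in back into msg->msg_namelen.
 */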
2070int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2071			struct msghdr *msg, size_t size, int flags)
2072{
2073	struct sock *sk = sock->sk;
2074	int addr_len = 0;
2075	int err;
2076
2077	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2078				   flags & ~MSG_DONTWAIT, &addr_len);
2079	if (err >= 0)
2080		msg->msg_namelen = addr_len;
2081	return err;
2082}
2083EXPORT_SYMBOL(sock_common_recvmsg);
2084
2085/*
2086 *	Set socket options on a socket.
2087 */
2088int sock_common_setsockopt(struct socket *sock, int level, int optname,
2089			   char __user *optval, unsigned int optlen)
2090{
2091	struct sock *sk = sock->sk;
2092
2093	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2094}
2095EXPORT_SYMBOL(sock_common_setsockopt);
2096
2097#ifdef CONFIG_COMPAT
2098int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2099				  char __user *optval, unsigned int optlen)
2100{
2101	struct sock *sk = sock->sk;
2102
2103	if (sk->sk_prot->compat_setsockopt != NULL)
2104		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2105						      optval, optlen);
2106	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2107}
2108EXPORT_SYMBOL(compat_sock_common_setsockopt);
2109#endif
2110
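/*
 * Generic release path, typically called from a protocol's ->close() or
 * ->destroy() handler: run the protocol destructor, unhash the socket,
 * orphan it from its struct socket, free any attached xfrm policy and drop
 * a reference; the sock itself is freed once the last reference is gone.
 */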
2111void sk_common_release(struct sock *sk)
2112{
2113	if (sk->sk_prot->destroy)
2114		sk->sk_prot->destroy(sk);
2115
2116	/*
2117	 * Observation: when sk_common_release() is called, processes have
2118	 * no access to the socket, but the network stack still does.
2119	 * Step one, detach it from networking:
2120	 *
2121	 * A. Remove from hash tables.
2122	 */
2123
2124	sk->sk_prot->unhash(sk);
2125
2126	/*
2127	 * At this point the socket cannot receive new packets, but some may still
2128	 * be in flight because another CPU ran the receive path and did its hash
2129	 * table lookup before we unhashed the socket. Those packets will reach
2130	 * the receive queue and be purged by the socket destructor.
2131	 *
2132	 * We also still have packets pending on the receive queue and, probably,
2133	 * our own packets waiting in device queues. The destructor drains the
2134	 * receive queue, but transmitted packets delay socket destruction until
2135	 * the last reference is released.
2136	 */
2137
2138	sock_orphan(sk);
2139
2140	xfrm_sk_free_policy(sk);
2141
2142	sk_refcnt_debug_release(sk);
2143	sock_put(sk);
2144}
2145EXPORT_SYMBOL(sk_common_release);
2146
2147static DEFINE_RWLOCK(proto_list_lock);
2148static LIST_HEAD(proto_list);
2149
2150#ifdef CONFIG_PROC_FS
2151	#define PROTO_INUSE_NR	64	/* should be enough for now */
2152struct prot_inuse {
2153	int val[PROTO_INUSE_NR];
2154};
2155
2156static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2157
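/*
 * Per-protocol "sockets in use" accounting, reported in /proc/net/protocols.
 * Each registered proto is assigned an index into a per-cpu counter array;
 * updates only touch the local CPU's counter and take no locks, while readers
 * sum over all possible CPUs.  Because the per-cpu deltas are unsynchronised
 * the sum can transiently be negative, so it is clamped to 0.
 */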
2158#ifdef CONFIG_NET_NS
2159void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2160{
2161	int cpu = smp_processor_id();
2162	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2163}
2164EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2165
2166int sock_prot_inuse_get(struct net *net, struct proto *prot)
2167{
2168	int cpu, idx = prot->inuse_idx;
2169	int res = 0;
2170
2171	for_each_possible_cpu(cpu)
2172		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2173
2174	return res >= 0 ? res : 0;
2175}
2176EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2177
2178static int __net_init sock_inuse_init_net(struct net *net)
2179{
2180	net->core.inuse = alloc_percpu(struct prot_inuse);
2181	return net->core.inuse ? 0 : -ENOMEM;
2182}
2183
2184static void __net_exit sock_inuse_exit_net(struct net *net)
2185{
2186	free_percpu(net->core.inuse);
2187}
2188
2189static struct pernet_operations net_inuse_ops = {
2190	.init = sock_inuse_init_net,
2191	.exit = sock_inuse_exit_net,
2192};
2193
2194static __init int net_inuse_init(void)
2195{
2196	if (register_pernet_subsys(&net_inuse_ops))
2197		panic("Cannot initialize net inuse counters");
2198
2199	return 0;
2200}
2201
2202core_initcall(net_inuse_init);
2203#else
2204static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2205
2206void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2207{
2208	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2209}
2210EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2211
2212int sock_prot_inuse_get(struct net *net, struct proto *prot)
2213{
2214	int cpu, idx = prot->inuse_idx;
2215	int res = 0;
2216
2217	for_each_possible_cpu(cpu)
2218		res += per_cpu(prot_inuse, cpu).val[idx];
2219
2220	return res >= 0 ? res : 0;
2221}
2222EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2223#endif
2224
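/*
 * Allocate / release a slot in proto_inuse_idx for the per-cpu counters
 * above.  The last index effectively acts as an overflow slot: when no other
 * slot is free the bit is never set, an error is logged, and the per-protocol
 * count for such a proto is no longer meaningful on its own.
 */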
2225static void assign_proto_idx(struct proto *prot)
2226{
2227	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2228
2229	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2230		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2231		return;
2232	}
2233
2234	set_bit(prot->inuse_idx, proto_inuse_idx);
2235}
2236
2237static void release_proto_idx(struct proto *prot)
2238{
2239	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2240		clear_bit(prot->inuse_idx, proto_inuse_idx);
2241}
2242#else
2243static inline void assign_proto_idx(struct proto *prot)
2244{
2245}
2246
2247static inline void release_proto_idx(struct proto *prot)
2248{
2249}
2250#endif
2251
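/*
 * proto_register - make a protocol known to the socket core.  With
 * @alloc_slab non-zero, dedicated kmem caches are created for the protocol's
 * socks and, if declared, for its request-sock and timewait-sock variants;
 * the proto is then added to proto_list (shown in /proc/net/protocols) and
 * assigned an in-use counter slot.  Returns 0 on success, -ENOBUFS if a
 * cache could not be created.  Typical pairing in a protocol module
 * (illustrative sketch; "my_proto" and "struct my_sock" are hypothetical):
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	module init:	err = proto_register(&my_proto, 1);
 *	module exit:	proto_unregister(&my_proto);
 */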
2252int proto_register(struct proto *prot, int alloc_slab)
2253{
2254	if (alloc_slab) {
2255		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2256					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2257					NULL);
2258
2259		if (prot->slab == NULL) {
2260			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2261			       prot->name);
2262			goto out;
2263		}
2264
2265		if (prot->rsk_prot != NULL) {
2266			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2267			if (prot->rsk_prot->slab_name == NULL)
2268				goto out_free_sock_slab;
2269
2270			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2271								 prot->rsk_prot->obj_size, 0,
2272								 SLAB_HWCACHE_ALIGN, NULL);
2273
2274			if (prot->rsk_prot->slab == NULL) {
2275				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2276				       prot->name);
2277				goto out_free_request_sock_slab_name;
2278			}
2279		}
2280
2281		if (prot->twsk_prot != NULL) {
2282			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2283
2284			if (prot->twsk_prot->twsk_slab_name == NULL)
2285				goto out_free_request_sock_slab;
2286
2287			prot->twsk_prot->twsk_slab =
2288				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2289						  prot->twsk_prot->twsk_obj_size,
2290						  0,
2291						  SLAB_HWCACHE_ALIGN |
2292							prot->slab_flags,
2293						  NULL);
2294			if (prot->twsk_prot->twsk_slab == NULL)
2295				goto out_free_timewait_sock_slab_name;
2296		}
2297	}
2298
2299	write_lock(&proto_list_lock);
2300	list_add(&prot->node, &proto_list);
2301	assign_proto_idx(prot);
2302	write_unlock(&proto_list_lock);
2303	return 0;
2304
2305out_free_timewait_sock_slab_name:
2306	kfree(prot->twsk_prot->twsk_slab_name);
2307out_free_request_sock_slab:
2308	if (prot->rsk_prot && prot->rsk_prot->slab) {
2309		kmem_cache_destroy(prot->rsk_prot->slab);
2310		prot->rsk_prot->slab = NULL;
2311	}
2312out_free_request_sock_slab_name:
2313	if (prot->rsk_prot)
2314		kfree(prot->rsk_prot->slab_name);
2315out_free_sock_slab:
2316	kmem_cache_destroy(prot->slab);
2317	prot->slab = NULL;
2318out:
2319	return -ENOBUFS;
2320}
2321EXPORT_SYMBOL(proto_register);
2322
2323void proto_unregister(struct proto *prot)
2324{
2325	write_lock(&proto_list_lock);
2326	release_proto_idx(prot);
2327	list_del(&prot->node);
2328	write_unlock(&proto_list_lock);
2329
2330	if (prot->slab != NULL) {
2331		kmem_cache_destroy(prot->slab);
2332		prot->slab = NULL;
2333	}
2334
2335	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2336		kmem_cache_destroy(prot->rsk_prot->slab);
2337		kfree(prot->rsk_prot->slab_name);
2338		prot->rsk_prot->slab = NULL;
2339	}
2340
2341	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2342		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2343		kfree(prot->twsk_prot->twsk_slab_name);
2344		prot->twsk_prot->twsk_slab = NULL;
2345	}
2346}
2347EXPORT_SYMBOL(proto_unregister);
2348
2349#ifdef CONFIG_PROC_FS
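/*
 * /proc/net/protocols: one line per registered protocol showing its object
 * size, current socket count, memory accounting and pressure state, maximum
 * header size, slab usage, owning module and a y/n flag for each optional
 * method the protocol implements.
 */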
2350static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2351	__acquires(proto_list_lock)
2352{
2353	read_lock(&proto_list_lock);
2354	return seq_list_start_head(&proto_list, *pos);
2355}
2356
2357static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2358{
2359	return seq_list_next(v, &proto_list, pos);
2360}
2361
2362static void proto_seq_stop(struct seq_file *seq, void *v)
2363	__releases(proto_list_lock)
2364{
2365	read_unlock(&proto_list_lock);
2366}
2367
2368static char proto_method_implemented(const void *method)
2369{
2370	return method == NULL ? 'n' : 'y';
2371}
2372
2373static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2374{
2375	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2376			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2377		   proto->name,
2378		   proto->obj_size,
2379		   sock_prot_inuse_get(seq_file_net(seq), proto),
2380		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2381		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2382		   proto->max_header,
2383		   proto->slab == NULL ? "no" : "yes",
2384		   module_name(proto->owner),
2385		   proto_method_implemented(proto->close),
2386		   proto_method_implemented(proto->connect),
2387		   proto_method_implemented(proto->disconnect),
2388		   proto_method_implemented(proto->accept),
2389		   proto_method_implemented(proto->ioctl),
2390		   proto_method_implemented(proto->init),
2391		   proto_method_implemented(proto->destroy),
2392		   proto_method_implemented(proto->shutdown),
2393		   proto_method_implemented(proto->setsockopt),
2394		   proto_method_implemented(proto->getsockopt),
2395		   proto_method_implemented(proto->sendmsg),
2396		   proto_method_implemented(proto->recvmsg),
2397		   proto_method_implemented(proto->sendpage),
2398		   proto_method_implemented(proto->bind),
2399		   proto_method_implemented(proto->backlog_rcv),
2400		   proto_method_implemented(proto->hash),
2401		   proto_method_implemented(proto->unhash),
2402		   proto_method_implemented(proto->get_port),
2403		   proto_method_implemented(proto->enter_memory_pressure));
2404}
2405
2406static int proto_seq_show(struct seq_file *seq, void *v)
2407{
2408	if (v == &proto_list)
2409		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2410			   "protocol",
2411			   "size",
2412			   "sockets",
2413			   "memory",
2414			   "press",
2415			   "maxhdr",
2416			   "slab",
2417			   "module",
2418			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2419	else
2420		proto_seq_printf(seq, list_entry(v, struct proto, node));
2421	return 0;
2422}
2423
2424static const struct seq_operations proto_seq_ops = {
2425	.start  = proto_seq_start,
2426	.next   = proto_seq_next,
2427	.stop   = proto_seq_stop,
2428	.show   = proto_seq_show,
2429};
2430
2431static int proto_seq_open(struct inode *inode, struct file *file)
2432{
2433	return seq_open_net(inode, file, &proto_seq_ops,
2434			    sizeof(struct seq_net_private));
2435}
2436
2437static const struct file_operations proto_seq_fops = {
2438	.owner		= THIS_MODULE,
2439	.open		= proto_seq_open,
2440	.read		= seq_read,
2441	.llseek		= seq_lseek,
2442	.release	= seq_release_net,
2443};
2444
2445static __net_init int proto_init_net(struct net *net)
2446{
2447	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2448		return -ENOMEM;
2449
2450	return 0;
2451}
2452
2453static __net_exit void proto_exit_net(struct net *net)
2454{
2455	proc_net_remove(net, "protocols");
2456}
2457
2458
2459static __net_initdata struct pernet_operations proto_net_ops = {
2460	.init = proto_init_net,
2461	.exit = proto_exit_net,
2462};
2463
2464static int __init proto_init(void)
2465{
2466	return register_pernet_subsys(&proto_net_ops);
2467}
2468
2469subsys_initcall(proto_init);
2470
2471	#endif /* CONFIG_PROC_FS */
2472