sock.c revision e56c57d0d3fdbbdf583d3af96bfb803b8dfa713e
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
121#include <net/net_namespace.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <linux/net_tstamp.h>
125#include <net/xfrm.h>
126#include <linux/ipsec.h>
127#include <net/cls_cgroup.h>
128
129#include <linux/filter.h>
130
131#include <trace/events/sock.h>
132
133#ifdef CONFIG_INET
134#include <net/tcp.h>
135#endif
136
137/*
138 * Each address family might have different locking rules, so we have
139 * one slock key per address family:
140 */
141static struct lock_class_key af_family_keys[AF_MAX];
142static struct lock_class_key af_family_slock_keys[AF_MAX];
143
144/*
145 * Make lock validator output more readable. (we pre-construct these
146 * strings build-time, so that runtime initialization of socket
147 * locks is fast):
148 */
149static const char *const af_family_key_strings[AF_MAX+1] = {
150  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
151  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
152  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
153  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
154  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
155  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
156  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
157  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
158  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
159  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 160  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
161  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
162  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
163  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
164};
165static const char *const af_family_slock_key_strings[AF_MAX+1] = {
166  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
167  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
168  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
169  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
170  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
171  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
172  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
173  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
174  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
175  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
176  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
177  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
178  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
179  "slock-AF_NFC"   , "slock-AF_MAX"
180};
181static const char *const af_family_clock_key_strings[AF_MAX+1] = {
182  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
183  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
184  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
185  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
186  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
187  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
188  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
189  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
190  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
191  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
192  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
193  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
194  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
195  "clock-AF_NFC"   , "clock-AF_MAX"
196};
197
198/*
199 * sk_callback_lock locking rules are per-address-family,
200 * so split the lock classes by using a per-AF key:
201 */
202static struct lock_class_key af_callback_keys[AF_MAX];
203
 204/* Take into consideration the size of the struct sk_buff overhead when
 205 * determining these values, since it is non-constant across
 206 * platforms.  This makes socket queueing behavior and performance
 207 * independent of such differences.
208 */
209#define _SK_MEM_PACKETS		256
210#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
211#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
212#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
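
/*
 * Rough arithmetic behind the defaults above (a sketch; the exact figure
 * depends on SKB_TRUESIZE(), i.e. on the struct sk_buff/skb_shared_info
 * sizes of the platform):
 *
 *	SK_WMEM_MAX = _SK_MEM_OVERHEAD * _SK_MEM_PACKETS
 *	            = SKB_TRUESIZE(256) * 256
 *	            ~ (256 bytes of data + a few hundred bytes of metadata) * 256
 *
 * i.e. enough accounting headroom for roughly 256 small packets per socket,
 * typically landing in the low hundreds of kilobytes on 64-bit builds.
 */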
213
214/* Run time adjustable parameters. */
215__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
216__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
217__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
218__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
219
220/* Maximal space eaten by iovec or ancillary data plus some space */
221int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
222EXPORT_SYMBOL(sysctl_optmem_max);
223
224#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
225int net_cls_subsys_id = -1;
226EXPORT_SYMBOL_GPL(net_cls_subsys_id);
227#endif
228
229static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
230{
231	struct timeval tv;
232
233	if (optlen < sizeof(tv))
234		return -EINVAL;
235	if (copy_from_user(&tv, optval, sizeof(tv)))
236		return -EFAULT;
237	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
238		return -EDOM;
239
240	if (tv.tv_sec < 0) {
241		static int warned __read_mostly;
242
243		*timeo_p = 0;
244		if (warned < 10 && net_ratelimit()) {
245			warned++;
246			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
247			       "tries to set negative timeout\n",
248				current->comm, task_pid_nr(current));
249		}
250		return 0;
251	}
252	*timeo_p = MAX_SCHEDULE_TIMEOUT;
253	if (tv.tv_sec == 0 && tv.tv_usec == 0)
254		return 0;
255	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
256		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
257	return 0;
258}
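
/*
 * sock_set_timeout() is what ultimately services SO_RCVTIMEO/SO_SNDTIMEO.
 * A minimal user-space sketch of the call that feeds it (illustrative only;
 * "fd" is assumed to be an already-open socket):
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * The timeval is converted to jiffies above, rounding the microsecond part
 * up to the next tick, and an all-zero timeval means "wait forever"
 * (MAX_SCHEDULE_TIMEOUT).
 */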
259
260static void sock_warn_obsolete_bsdism(const char *name)
261{
262	static int warned;
263	static char warncomm[TASK_COMM_LEN];
264	if (strcmp(warncomm, current->comm) && warned < 5) {
265		strcpy(warncomm,  current->comm);
266		printk(KERN_WARNING "process `%s' is using obsolete "
267		       "%s SO_BSDCOMPAT\n", warncomm, name);
268		warned++;
269	}
270}
271
272static void sock_disable_timestamp(struct sock *sk, int flag)
273{
274	if (sock_flag(sk, flag)) {
275		sock_reset_flag(sk, flag);
276		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
277		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
278			net_disable_timestamp();
279		}
280	}
281}
282
283
284int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
285{
286	int err;
287	int skb_len;
288	unsigned long flags;
289	struct sk_buff_head *list = &sk->sk_receive_queue;
290
291	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 292	   the number of warnings when compiling with -W --ANK
293	 */
294	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
295	    (unsigned)sk->sk_rcvbuf) {
296		atomic_inc(&sk->sk_drops);
297		trace_sock_rcvqueue_full(sk, skb);
298		return -ENOMEM;
299	}
300
301	err = sk_filter(sk, skb);
302	if (err)
303		return err;
304
305	if (!sk_rmem_schedule(sk, skb->truesize)) {
306		atomic_inc(&sk->sk_drops);
307		return -ENOBUFS;
308	}
309
310	skb->dev = NULL;
311	skb_set_owner_r(skb, sk);
312
313	/* Cache the SKB length before we tack it onto the receive
314	 * queue.  Once it is added it no longer belongs to us and
315	 * may be freed by other threads of control pulling packets
316	 * from the queue.
317	 */
318	skb_len = skb->len;
319
320	/* we escape from rcu protected region, make sure we dont leak
321	 * a norefcounted dst
322	 */
323	skb_dst_force(skb);
324
325	spin_lock_irqsave(&list->lock, flags);
326	skb->dropcount = atomic_read(&sk->sk_drops);
327	__skb_queue_tail(list, skb);
328	spin_unlock_irqrestore(&list->lock, flags);
329
330	if (!sock_flag(sk, SOCK_DEAD))
331		sk->sk_data_ready(sk, skb_len);
332	return 0;
333}
334EXPORT_SYMBOL(sock_queue_rcv_skb);
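
/*
 * Protocols typically call sock_queue_rcv_skb() at the end of their receive
 * path and drop the skb themselves if it is refused. A hedged sketch of such
 * a caller (proto_queue_rcv() is hypothetical; per-protocol statistics are
 * omitted):
 *
 *	static int proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 */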
335
336int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
337{
338	int rc = NET_RX_SUCCESS;
339
340	if (sk_filter(sk, skb))
341		goto discard_and_relse;
342
343	skb->dev = NULL;
344
345	if (sk_rcvqueues_full(sk, skb)) {
346		atomic_inc(&sk->sk_drops);
347		goto discard_and_relse;
348	}
349	if (nested)
350		bh_lock_sock_nested(sk);
351	else
352		bh_lock_sock(sk);
353	if (!sock_owned_by_user(sk)) {
354		/*
355		 * trylock + unlock semantics:
356		 */
357		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
358
359		rc = sk_backlog_rcv(sk, skb);
360
361		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
362	} else if (sk_add_backlog(sk, skb)) {
363		bh_unlock_sock(sk);
364		atomic_inc(&sk->sk_drops);
365		goto discard_and_relse;
366	}
367
368	bh_unlock_sock(sk);
369out:
370	sock_put(sk);
371	return rc;
372discard_and_relse:
373	kfree_skb(skb);
374	goto out;
375}
376EXPORT_SYMBOL(sk_receive_skb);
377
378void sk_reset_txq(struct sock *sk)
379{
380	sk_tx_queue_clear(sk);
381}
382EXPORT_SYMBOL(sk_reset_txq);
383
384struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
385{
386	struct dst_entry *dst = __sk_dst_get(sk);
387
388	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
389		sk_tx_queue_clear(sk);
390		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
391		dst_release(dst);
392		return NULL;
393	}
394
395	return dst;
396}
397EXPORT_SYMBOL(__sk_dst_check);
398
399struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
400{
401	struct dst_entry *dst = sk_dst_get(sk);
402
403	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
404		sk_dst_reset(sk);
405		dst_release(dst);
406		return NULL;
407	}
408
409	return dst;
410}
411EXPORT_SYMBOL(sk_dst_check);
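
/*
 * A connected socket's output path usually revalidates its cached route with
 * sk_dst_check() and only performs a fresh lookup when the entry has gone
 * stale. A rough, protocol-agnostic sketch (the lookup step is whatever the
 * protocol normally uses, e.g. an ip_route_output_*() variant):
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (dst == NULL) {
 *		dst = <protocol-specific route lookup>;
 *		if (!IS_ERR(dst))
 *			sk_setup_caps(sk, dst);
 *	}
 */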
412
413static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
414{
415	int ret = -ENOPROTOOPT;
416#ifdef CONFIG_NETDEVICES
417	struct net *net = sock_net(sk);
418	char devname[IFNAMSIZ];
419	int index;
420
421	/* Sorry... */
422	ret = -EPERM;
423	if (!capable(CAP_NET_RAW))
424		goto out;
425
426	ret = -EINVAL;
427	if (optlen < 0)
428		goto out;
429
430	/* Bind this socket to a particular device like "eth0",
431	 * as specified in the passed interface name. If the
432	 * name is "" or the option length is zero the socket
433	 * is not bound.
434	 */
435	if (optlen > IFNAMSIZ - 1)
436		optlen = IFNAMSIZ - 1;
437	memset(devname, 0, sizeof(devname));
438
439	ret = -EFAULT;
440	if (copy_from_user(devname, optval, optlen))
441		goto out;
442
443	index = 0;
444	if (devname[0] != '\0') {
445		struct net_device *dev;
446
447		rcu_read_lock();
448		dev = dev_get_by_name_rcu(net, devname);
449		if (dev)
450			index = dev->ifindex;
451		rcu_read_unlock();
452		ret = -ENODEV;
453		if (!dev)
454			goto out;
455	}
456
457	lock_sock(sk);
458	sk->sk_bound_dev_if = index;
459	sk_dst_reset(sk);
460	release_sock(sk);
461
462	ret = 0;
463
464out:
465#endif
466
467	return ret;
468}
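
/*
 * From user space this corresponds to SO_BINDTODEVICE, which requires
 * CAP_NET_RAW. A minimal sketch (assumes "fd" is an open socket and that
 * "eth0" exists):
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       "eth0", strlen("eth0")) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 * Passing an empty name (or a zero option length) removes the binding again.
 */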
469
470static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
471{
472	if (valbool)
473		sock_set_flag(sk, bit);
474	else
475		sock_reset_flag(sk, bit);
476}
477
478/*
479 *	This is meant for all protocols to use and covers goings on
480 *	at the socket level. Everything here is generic.
481 */
482
483int sock_setsockopt(struct socket *sock, int level, int optname,
484		    char __user *optval, unsigned int optlen)
485{
486	struct sock *sk = sock->sk;
487	int val;
488	int valbool;
489	struct linger ling;
490	int ret = 0;
491
492	/*
493	 *	Options without arguments
494	 */
495
496	if (optname == SO_BINDTODEVICE)
497		return sock_bindtodevice(sk, optval, optlen);
498
499	if (optlen < sizeof(int))
500		return -EINVAL;
501
502	if (get_user(val, (int __user *)optval))
503		return -EFAULT;
504
505	valbool = val ? 1 : 0;
506
507	lock_sock(sk);
508
509	switch (optname) {
510	case SO_DEBUG:
511		if (val && !capable(CAP_NET_ADMIN))
512			ret = -EACCES;
513		else
514			sock_valbool_flag(sk, SOCK_DBG, valbool);
515		break;
516	case SO_REUSEADDR:
517		sk->sk_reuse = valbool;
518		break;
519	case SO_TYPE:
520	case SO_PROTOCOL:
521	case SO_DOMAIN:
522	case SO_ERROR:
523		ret = -ENOPROTOOPT;
524		break;
525	case SO_DONTROUTE:
526		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
527		break;
528	case SO_BROADCAST:
529		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
530		break;
531	case SO_SNDBUF:
 532		/* Don't error on this; BSD doesn't, and if you think
 533		   about it, this is right. Otherwise apps have to
534		   play 'guess the biggest size' games. RCVBUF/SNDBUF
535		   are treated in BSD as hints */
536
537		if (val > sysctl_wmem_max)
538			val = sysctl_wmem_max;
539set_sndbuf:
540		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
541		if ((val * 2) < SOCK_MIN_SNDBUF)
542			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
543		else
544			sk->sk_sndbuf = val * 2;
545
546		/*
547		 *	Wake up sending tasks if we
548		 *	upped the value.
549		 */
550		sk->sk_write_space(sk);
551		break;
552
553	case SO_SNDBUFFORCE:
554		if (!capable(CAP_NET_ADMIN)) {
555			ret = -EPERM;
556			break;
557		}
558		goto set_sndbuf;
559
560	case SO_RCVBUF:
 561		/* Don't error on this; BSD doesn't, and if you think
 562		   about it, this is right. Otherwise apps have to
563		   play 'guess the biggest size' games. RCVBUF/SNDBUF
564		   are treated in BSD as hints */
565
566		if (val > sysctl_rmem_max)
567			val = sysctl_rmem_max;
568set_rcvbuf:
569		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
570		/*
571		 * We double it on the way in to account for
572		 * "struct sk_buff" etc. overhead.   Applications
573		 * assume that the SO_RCVBUF setting they make will
574		 * allow that much actual data to be received on that
575		 * socket.
576		 *
577		 * Applications are unaware that "struct sk_buff" and
578		 * other overheads allocate from the receive buffer
579		 * during socket buffer allocation.
580		 *
581		 * And after considering the possible alternatives,
582		 * returning the value we actually used in getsockopt
583		 * is the most desirable behavior.
584		 */
585		if ((val * 2) < SOCK_MIN_RCVBUF)
586			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
587		else
588			sk->sk_rcvbuf = val * 2;
589		break;
590
591	case SO_RCVBUFFORCE:
592		if (!capable(CAP_NET_ADMIN)) {
593			ret = -EPERM;
594			break;
595		}
596		goto set_rcvbuf;
597
598	case SO_KEEPALIVE:
599#ifdef CONFIG_INET
600		if (sk->sk_protocol == IPPROTO_TCP)
601			tcp_set_keepalive(sk, valbool);
602#endif
603		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
604		break;
605
606	case SO_OOBINLINE:
607		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
608		break;
609
610	case SO_NO_CHECK:
611		sk->sk_no_check = valbool;
612		break;
613
614	case SO_PRIORITY:
615		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
616			sk->sk_priority = val;
617		else
618			ret = -EPERM;
619		break;
620
621	case SO_LINGER:
622		if (optlen < sizeof(ling)) {
623			ret = -EINVAL;	/* 1003.1g */
624			break;
625		}
626		if (copy_from_user(&ling, optval, sizeof(ling))) {
627			ret = -EFAULT;
628			break;
629		}
630		if (!ling.l_onoff)
631			sock_reset_flag(sk, SOCK_LINGER);
632		else {
633#if (BITS_PER_LONG == 32)
634			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
635				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
636			else
637#endif
638				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
639			sock_set_flag(sk, SOCK_LINGER);
640		}
641		break;
642
643	case SO_BSDCOMPAT:
644		sock_warn_obsolete_bsdism("setsockopt");
645		break;
646
647	case SO_PASSCRED:
648		if (valbool)
649			set_bit(SOCK_PASSCRED, &sock->flags);
650		else
651			clear_bit(SOCK_PASSCRED, &sock->flags);
652		break;
653
654	case SO_TIMESTAMP:
655	case SO_TIMESTAMPNS:
656		if (valbool)  {
657			if (optname == SO_TIMESTAMP)
658				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
659			else
660				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
661			sock_set_flag(sk, SOCK_RCVTSTAMP);
662			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
663		} else {
664			sock_reset_flag(sk, SOCK_RCVTSTAMP);
665			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
666		}
667		break;
668
669	case SO_TIMESTAMPING:
670		if (val & ~SOF_TIMESTAMPING_MASK) {
671			ret = -EINVAL;
672			break;
673		}
674		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
675				  val & SOF_TIMESTAMPING_TX_HARDWARE);
676		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
677				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
678		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
679				  val & SOF_TIMESTAMPING_RX_HARDWARE);
680		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
681			sock_enable_timestamp(sk,
682					      SOCK_TIMESTAMPING_RX_SOFTWARE);
683		else
684			sock_disable_timestamp(sk,
685					       SOCK_TIMESTAMPING_RX_SOFTWARE);
686		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
687				  val & SOF_TIMESTAMPING_SOFTWARE);
688		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
689				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
690		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
691				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
692		break;
693
694	case SO_RCVLOWAT:
695		if (val < 0)
696			val = INT_MAX;
697		sk->sk_rcvlowat = val ? : 1;
698		break;
699
700	case SO_RCVTIMEO:
701		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
702		break;
703
704	case SO_SNDTIMEO:
705		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
706		break;
707
708	case SO_ATTACH_FILTER:
709		ret = -EINVAL;
710		if (optlen == sizeof(struct sock_fprog)) {
711			struct sock_fprog fprog;
712
713			ret = -EFAULT;
714			if (copy_from_user(&fprog, optval, sizeof(fprog)))
715				break;
716
717			ret = sk_attach_filter(&fprog, sk);
718		}
719		break;
720
721	case SO_DETACH_FILTER:
722		ret = sk_detach_filter(sk);
723		break;
724
725	case SO_PASSSEC:
726		if (valbool)
727			set_bit(SOCK_PASSSEC, &sock->flags);
728		else
729			clear_bit(SOCK_PASSSEC, &sock->flags);
730		break;
731	case SO_MARK:
732		if (!capable(CAP_NET_ADMIN))
733			ret = -EPERM;
734		else
735			sk->sk_mark = val;
736		break;
737
 738		/* We implement SO_SNDLOWAT etc. to
739		   not be settable (1003.1g 5.3) */
740	case SO_RXQ_OVFL:
741		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
742		break;
743	default:
744		ret = -ENOPROTOOPT;
745		break;
746	}
747	release_sock(sk);
748	return ret;
749}
750EXPORT_SYMBOL(sock_setsockopt);
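
/*
 * Note the SO_RCVBUF/SO_SNDBUF handling above: the value supplied by the
 * application is doubled to cover struct sk_buff overhead, and that doubled
 * value is what getsockopt() later reports. A user-space sketch of the
 * observable effect (illustrative; "fd" is assumed to be an open socket):
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *
 * "got" is now typically 2 * req, unless clamped by sysctl_rmem_max
 * (net.core.rmem_max) or raised to SOCK_MIN_RCVBUF.
 */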
751
752
753void cred_to_ucred(struct pid *pid, const struct cred *cred,
754		   struct ucred *ucred)
755{
756	ucred->pid = pid_vnr(pid);
757	ucred->uid = ucred->gid = -1;
758	if (cred) {
759		struct user_namespace *current_ns = current_user_ns();
760
761		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
762		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
763	}
764}
765EXPORT_SYMBOL_GPL(cred_to_ucred);
766
767int sock_getsockopt(struct socket *sock, int level, int optname,
768		    char __user *optval, int __user *optlen)
769{
770	struct sock *sk = sock->sk;
771
772	union {
773		int val;
774		struct linger ling;
775		struct timeval tm;
776	} v;
777
778	int lv = sizeof(int);
779	int len;
780
781	if (get_user(len, optlen))
782		return -EFAULT;
783	if (len < 0)
784		return -EINVAL;
785
786	memset(&v, 0, sizeof(v));
787
788	switch (optname) {
789	case SO_DEBUG:
790		v.val = sock_flag(sk, SOCK_DBG);
791		break;
792
793	case SO_DONTROUTE:
794		v.val = sock_flag(sk, SOCK_LOCALROUTE);
795		break;
796
797	case SO_BROADCAST:
798		v.val = !!sock_flag(sk, SOCK_BROADCAST);
799		break;
800
801	case SO_SNDBUF:
802		v.val = sk->sk_sndbuf;
803		break;
804
805	case SO_RCVBUF:
806		v.val = sk->sk_rcvbuf;
807		break;
808
809	case SO_REUSEADDR:
810		v.val = sk->sk_reuse;
811		break;
812
813	case SO_KEEPALIVE:
814		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
815		break;
816
817	case SO_TYPE:
818		v.val = sk->sk_type;
819		break;
820
821	case SO_PROTOCOL:
822		v.val = sk->sk_protocol;
823		break;
824
825	case SO_DOMAIN:
826		v.val = sk->sk_family;
827		break;
828
829	case SO_ERROR:
830		v.val = -sock_error(sk);
831		if (v.val == 0)
832			v.val = xchg(&sk->sk_err_soft, 0);
833		break;
834
835	case SO_OOBINLINE:
836		v.val = !!sock_flag(sk, SOCK_URGINLINE);
837		break;
838
839	case SO_NO_CHECK:
840		v.val = sk->sk_no_check;
841		break;
842
843	case SO_PRIORITY:
844		v.val = sk->sk_priority;
845		break;
846
847	case SO_LINGER:
848		lv		= sizeof(v.ling);
849		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
850		v.ling.l_linger	= sk->sk_lingertime / HZ;
851		break;
852
853	case SO_BSDCOMPAT:
854		sock_warn_obsolete_bsdism("getsockopt");
855		break;
856
857	case SO_TIMESTAMP:
858		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
859				!sock_flag(sk, SOCK_RCVTSTAMPNS);
860		break;
861
862	case SO_TIMESTAMPNS:
863		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
864		break;
865
866	case SO_TIMESTAMPING:
867		v.val = 0;
868		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
869			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
870		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
871			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
872		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
873			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
874		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
875			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
876		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
877			v.val |= SOF_TIMESTAMPING_SOFTWARE;
878		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
879			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
880		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
881			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
882		break;
883
884	case SO_RCVTIMEO:
885		lv = sizeof(struct timeval);
886		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
887			v.tm.tv_sec = 0;
888			v.tm.tv_usec = 0;
889		} else {
890			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
891			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
892		}
893		break;
894
895	case SO_SNDTIMEO:
896		lv = sizeof(struct timeval);
897		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
898			v.tm.tv_sec = 0;
899			v.tm.tv_usec = 0;
900		} else {
901			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
902			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
903		}
904		break;
905
906	case SO_RCVLOWAT:
907		v.val = sk->sk_rcvlowat;
908		break;
909
910	case SO_SNDLOWAT:
911		v.val = 1;
912		break;
913
914	case SO_PASSCRED:
915		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
916		break;
917
918	case SO_PEERCRED:
919	{
920		struct ucred peercred;
921		if (len > sizeof(peercred))
922			len = sizeof(peercred);
923		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
924		if (copy_to_user(optval, &peercred, len))
925			return -EFAULT;
926		goto lenout;
927	}
928
929	case SO_PEERNAME:
930	{
931		char address[128];
932
933		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
934			return -ENOTCONN;
935		if (lv < len)
936			return -EINVAL;
937		if (copy_to_user(optval, address, len))
938			return -EFAULT;
939		goto lenout;
940	}
941
942	/* Dubious BSD thing... Probably nobody even uses it, but
943	 * the UNIX standard wants it for whatever reason... -DaveM
944	 */
945	case SO_ACCEPTCONN:
946		v.val = sk->sk_state == TCP_LISTEN;
947		break;
948
949	case SO_PASSSEC:
950		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
951		break;
952
953	case SO_PEERSEC:
954		return security_socket_getpeersec_stream(sock, optval, optlen, len);
955
956	case SO_MARK:
957		v.val = sk->sk_mark;
958		break;
959
960	case SO_RXQ_OVFL:
961		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
962		break;
963
964	default:
965		return -ENOPROTOOPT;
966	}
967
968	if (len > lv)
969		len = lv;
970	if (copy_to_user(optval, &v, len))
971		return -EFAULT;
972lenout:
973	if (put_user(len, optlen))
974		return -EFAULT;
975	return 0;
976}
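
/*
 * SO_ERROR above returns and clears the pending socket error (falling back
 * to sk_err_soft), which is how user space harvests the result of a
 * non-blocking connect(). A common sketch (assumes "fd" is a non-blocking
 * socket on which connect() returned EINPROGRESS and poll()/select() has
 * since reported writability):
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
 *		fprintf(stderr, "connect failed: %s\n", strerror(err));
 */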
977
978/*
979 * Initialize an sk_lock.
980 *
981 * (We also register the sk_lock with the lock validator.)
982 */
983static inline void sock_lock_init(struct sock *sk)
984{
985	sock_lock_init_class_and_name(sk,
986			af_family_slock_key_strings[sk->sk_family],
987			af_family_slock_keys + sk->sk_family,
988			af_family_key_strings[sk->sk_family],
989			af_family_keys + sk->sk_family);
990}
991
992/*
993 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 994 * even temporarily, because of RCU lookups. sk_node should also be left as is.
995 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
996 */
997static void sock_copy(struct sock *nsk, const struct sock *osk)
998{
999#ifdef CONFIG_SECURITY_NETWORK
1000	void *sptr = nsk->sk_security;
1001#endif
1002	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1003
1004	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1005	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1006
1007#ifdef CONFIG_SECURITY_NETWORK
1008	nsk->sk_security = sptr;
1009	security_sk_clone(osk, nsk);
1010#endif
1011}
1012
1013/*
1014 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1015 * nodes unmodified. Special care is taken when initializing the object to zero.
1016 */
1017static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1018{
1019	if (offsetof(struct sock, sk_node.next) != 0)
1020		memset(sk, 0, offsetof(struct sock, sk_node.next));
1021	memset(&sk->sk_node.pprev, 0,
1022	       size - offsetof(struct sock, sk_node.pprev));
1023}
1024
1025void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1026{
1027	unsigned long nulls1, nulls2;
1028
1029	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1030	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1031	if (nulls1 > nulls2)
1032		swap(nulls1, nulls2);
1033
1034	if (nulls1 != 0)
1035		memset((char *)sk, 0, nulls1);
1036	memset((char *)sk + nulls1 + sizeof(void *), 0,
1037	       nulls2 - nulls1 - sizeof(void *));
1038	memset((char *)sk + nulls2 + sizeof(void *), 0,
1039	       size - nulls2 - sizeof(void *));
1040}
1041EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1042
1043static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1044		int family)
1045{
1046	struct sock *sk;
1047	struct kmem_cache *slab;
1048
1049	slab = prot->slab;
1050	if (slab != NULL) {
1051		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1052		if (!sk)
1053			return sk;
1054		if (priority & __GFP_ZERO) {
1055			if (prot->clear_sk)
1056				prot->clear_sk(sk, prot->obj_size);
1057			else
1058				sk_prot_clear_nulls(sk, prot->obj_size);
1059		}
1060	} else
1061		sk = kmalloc(prot->obj_size, priority);
1062
1063	if (sk != NULL) {
1064		kmemcheck_annotate_bitfield(sk, flags);
1065
1066		if (security_sk_alloc(sk, family, priority))
1067			goto out_free;
1068
1069		if (!try_module_get(prot->owner))
1070			goto out_free_sec;
1071		sk_tx_queue_clear(sk);
1072	}
1073
1074	return sk;
1075
1076out_free_sec:
1077	security_sk_free(sk);
1078out_free:
1079	if (slab != NULL)
1080		kmem_cache_free(slab, sk);
1081	else
1082		kfree(sk);
1083	return NULL;
1084}
1085
1086static void sk_prot_free(struct proto *prot, struct sock *sk)
1087{
1088	struct kmem_cache *slab;
1089	struct module *owner;
1090
1091	owner = prot->owner;
1092	slab = prot->slab;
1093
1094	security_sk_free(sk);
1095	if (slab != NULL)
1096		kmem_cache_free(slab, sk);
1097	else
1098		kfree(sk);
1099	module_put(owner);
1100}
1101
1102#ifdef CONFIG_CGROUPS
1103void sock_update_classid(struct sock *sk)
1104{
1105	u32 classid;
1106
1107	rcu_read_lock();  /* doing current task, which cannot vanish. */
1108	classid = task_cls_classid(current);
1109	rcu_read_unlock();
1110	if (classid && classid != sk->sk_classid)
1111		sk->sk_classid = classid;
1112}
1113EXPORT_SYMBOL(sock_update_classid);
1114#endif
1115
1116/**
1117 *	sk_alloc - All socket objects are allocated here
1118 *	@net: the applicable net namespace
1119 *	@family: protocol family
1120 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1121 *	@prot: struct proto associated with this new sock instance
1122 */
1123struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1124		      struct proto *prot)
1125{
1126	struct sock *sk;
1127
1128	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1129	if (sk) {
1130		sk->sk_family = family;
1131		/*
1132		 * See comment in struct sock definition to understand
1133		 * why we need sk_prot_creator -acme
1134		 */
1135		sk->sk_prot = sk->sk_prot_creator = prot;
1136		sock_lock_init(sk);
1137		sock_net_set(sk, get_net(net));
1138		atomic_set(&sk->sk_wmem_alloc, 1);
1139
1140		sock_update_classid(sk);
1141	}
1142
1143	return sk;
1144}
1145EXPORT_SYMBOL(sk_alloc);
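
/*
 * Address families allocate their sockets here from their ->create() hook
 * and then let sock_init_data() wire up the generic state. A hedged sketch
 * of that pattern (PF_FOO and foo_proto are placeholders for a real family):
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto);
 *	if (sk == NULL)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *	... family-specific initialisation ...
 */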
1146
1147static void __sk_free(struct sock *sk)
1148{
1149	struct sk_filter *filter;
1150
1151	if (sk->sk_destruct)
1152		sk->sk_destruct(sk);
1153
1154	filter = rcu_dereference_check(sk->sk_filter,
1155				       atomic_read(&sk->sk_wmem_alloc) == 0);
1156	if (filter) {
1157		sk_filter_uncharge(sk, filter);
1158		RCU_INIT_POINTER(sk->sk_filter, NULL);
1159	}
1160
1161	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1162	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1163
1164	if (atomic_read(&sk->sk_omem_alloc))
1165		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1166		       __func__, atomic_read(&sk->sk_omem_alloc));
1167
1168	if (sk->sk_peer_cred)
1169		put_cred(sk->sk_peer_cred);
1170	put_pid(sk->sk_peer_pid);
1171	put_net(sock_net(sk));
1172	sk_prot_free(sk->sk_prot_creator, sk);
1173}
1174
1175void sk_free(struct sock *sk)
1176{
1177	/*
1178	 * We subtract one from sk_wmem_alloc so we can tell whether
1179	 * some packets are still in some tx queue.
1180	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1181	 */
1182	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1183		__sk_free(sk);
1184}
1185EXPORT_SYMBOL(sk_free);
1186
1187/*
1188 * The last sock_put should drop the reference to sk->sk_net. It has already
1189 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1190 * is not an option.
1191 * Take a reference to the socket to remove it from the hash while still
1192 * _alive_, and after that destroy it in the context of init_net.
1193 */
1194void sk_release_kernel(struct sock *sk)
1195{
1196	if (sk == NULL || sk->sk_socket == NULL)
1197		return;
1198
1199	sock_hold(sk);
1200	sock_release(sk->sk_socket);
1201	release_net(sock_net(sk));
1202	sock_net_set(sk, get_net(&init_net));
1203	sock_put(sk);
1204}
1205EXPORT_SYMBOL(sk_release_kernel);
1206
1207/**
1208 *	sk_clone_lock - clone a socket, and lock its clone
1209 *	@sk: the socket to clone
1210 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1211 *
1212 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1213 */
1214struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1215{
1216	struct sock *newsk;
1217
1218	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1219	if (newsk != NULL) {
1220		struct sk_filter *filter;
1221
1222		sock_copy(newsk, sk);
1223
1224		/* SANITY */
1225		get_net(sock_net(newsk));
1226		sk_node_init(&newsk->sk_node);
1227		sock_lock_init(newsk);
1228		bh_lock_sock(newsk);
1229		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1230		newsk->sk_backlog.len = 0;
1231
1232		atomic_set(&newsk->sk_rmem_alloc, 0);
1233		/*
1234		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1235		 */
1236		atomic_set(&newsk->sk_wmem_alloc, 1);
1237		atomic_set(&newsk->sk_omem_alloc, 0);
1238		skb_queue_head_init(&newsk->sk_receive_queue);
1239		skb_queue_head_init(&newsk->sk_write_queue);
1240#ifdef CONFIG_NET_DMA
1241		skb_queue_head_init(&newsk->sk_async_wait_queue);
1242#endif
1243
1244		spin_lock_init(&newsk->sk_dst_lock);
1245		rwlock_init(&newsk->sk_callback_lock);
1246		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1247				af_callback_keys + newsk->sk_family,
1248				af_family_clock_key_strings[newsk->sk_family]);
1249
1250		newsk->sk_dst_cache	= NULL;
1251		newsk->sk_wmem_queued	= 0;
1252		newsk->sk_forward_alloc = 0;
1253		newsk->sk_send_head	= NULL;
1254		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1255
1256		sock_reset_flag(newsk, SOCK_DONE);
1257		skb_queue_head_init(&newsk->sk_error_queue);
1258
1259		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1260		if (filter != NULL)
1261			sk_filter_charge(newsk, filter);
1262
1263		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1264			/* It is still a raw copy of the parent, so invalidate
1265			 * the destructor and do a plain sk_free() */
1266			newsk->sk_destruct = NULL;
1267			bh_unlock_sock(newsk);
1268			sk_free(newsk);
1269			newsk = NULL;
1270			goto out;
1271		}
1272
1273		newsk->sk_err	   = 0;
1274		newsk->sk_priority = 0;
1275		/*
1276		 * Before updating sk_refcnt, we must commit prior changes to memory
1277		 * (Documentation/RCU/rculist_nulls.txt for details)
1278		 */
1279		smp_wmb();
1280		atomic_set(&newsk->sk_refcnt, 2);
1281
1282		/*
1283		 * Increment the counter in the same struct proto as the master
1284		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1285		 * is the same as sk->sk_prot->socks, as this field was copied
1286		 * with memcpy).
1287		 *
1288		 * This _changes_ the previous behaviour, where
1289		 * tcp_create_openreq_child always was incrementing the
1290		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1291		 * to be taken into account in all callers. -acme
1292		 */
1293		sk_refcnt_debug_inc(newsk);
1294		sk_set_socket(newsk, NULL);
1295		newsk->sk_wq = NULL;
1296
1297		if (newsk->sk_prot->sockets_allocated)
1298			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1299
1300		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1301		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1302			net_enable_timestamp();
1303	}
1304out:
1305	return newsk;
1306}
1307EXPORT_SYMBOL_GPL(sk_clone_lock);
1308
1309void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1310{
1311	__sk_dst_set(sk, dst);
1312	sk->sk_route_caps = dst->dev->features;
1313	if (sk->sk_route_caps & NETIF_F_GSO)
1314		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1315	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1316	if (sk_can_gso(sk)) {
1317		if (dst->header_len) {
1318			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1319		} else {
1320			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1321			sk->sk_gso_max_size = dst->dev->gso_max_size;
1322		}
1323	}
1324}
1325EXPORT_SYMBOL_GPL(sk_setup_caps);
1326
1327void __init sk_init(void)
1328{
1329	if (totalram_pages <= 4096) {
1330		sysctl_wmem_max = 32767;
1331		sysctl_rmem_max = 32767;
1332		sysctl_wmem_default = 32767;
1333		sysctl_rmem_default = 32767;
1334	} else if (totalram_pages >= 131072) {
1335		sysctl_wmem_max = 131071;
1336		sysctl_rmem_max = 131071;
1337	}
1338}
1339
1340/*
1341 *	Simple resource managers for sockets.
1342 */
1343
1344
1345/*
1346 * Write buffer destructor automatically called from kfree_skb.
1347 */
1348void sock_wfree(struct sk_buff *skb)
1349{
1350	struct sock *sk = skb->sk;
1351	unsigned int len = skb->truesize;
1352
1353	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1354		/*
1355		 * Keep a reference on sk_wmem_alloc, this will be released
1356		 * after sk_write_space() call
1357		 */
1358		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1359		sk->sk_write_space(sk);
1360		len = 1;
1361	}
1362	/*
1363	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1364	 * could not do because of in-flight packets
1365	 */
1366	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1367		__sk_free(sk);
1368}
1369EXPORT_SYMBOL(sock_wfree);
1370
1371/*
1372 * Read buffer destructor automatically called from kfree_skb.
1373 */
1374void sock_rfree(struct sk_buff *skb)
1375{
1376	struct sock *sk = skb->sk;
1377	unsigned int len = skb->truesize;
1378
1379	atomic_sub(len, &sk->sk_rmem_alloc);
1380	sk_mem_uncharge(sk, len);
1381}
1382EXPORT_SYMBOL(sock_rfree);
1383
1384
1385int sock_i_uid(struct sock *sk)
1386{
1387	int uid;
1388
1389	read_lock_bh(&sk->sk_callback_lock);
1390	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1391	read_unlock_bh(&sk->sk_callback_lock);
1392	return uid;
1393}
1394EXPORT_SYMBOL(sock_i_uid);
1395
1396unsigned long sock_i_ino(struct sock *sk)
1397{
1398	unsigned long ino;
1399
1400	read_lock_bh(&sk->sk_callback_lock);
1401	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1402	read_unlock_bh(&sk->sk_callback_lock);
1403	return ino;
1404}
1405EXPORT_SYMBOL(sock_i_ino);
1406
1407/*
1408 * Allocate a skb from the socket's send buffer.
1409 */
1410struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1411			     gfp_t priority)
1412{
1413	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1414		struct sk_buff *skb = alloc_skb(size, priority);
1415		if (skb) {
1416			skb_set_owner_w(skb, sk);
1417			return skb;
1418		}
1419	}
1420	return NULL;
1421}
1422EXPORT_SYMBOL(sock_wmalloc);
1423
1424/*
1425 * Allocate a skb from the socket's receive buffer.
1426 */
1427struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1428			     gfp_t priority)
1429{
1430	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1431		struct sk_buff *skb = alloc_skb(size, priority);
1432		if (skb) {
1433			skb_set_owner_r(skb, sk);
1434			return skb;
1435		}
1436	}
1437	return NULL;
1438}
1439
1440/*
1441 * Allocate a memory block from the socket's option memory buffer.
1442 */
1443void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1444{
1445	if ((unsigned)size <= sysctl_optmem_max &&
1446	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1447		void *mem;
1448		/* First do the add, to avoid the race if kmalloc
1449		 * might sleep.
1450		 */
1451		atomic_add(size, &sk->sk_omem_alloc);
1452		mem = kmalloc(size, priority);
1453		if (mem)
1454			return mem;
1455		atomic_sub(size, &sk->sk_omem_alloc);
1456	}
1457	return NULL;
1458}
1459EXPORT_SYMBOL(sock_kmalloc);
1460
1461/*
1462 * Free an option memory block.
1463 */
1464void sock_kfree_s(struct sock *sk, void *mem, int size)
1465{
1466	kfree(mem);
1467	atomic_sub(size, &sk->sk_omem_alloc);
1468}
1469EXPORT_SYMBOL(sock_kfree_s);
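
/*
 * sock_kmalloc()/sock_kfree_s() are meant to be used as a pair so that the
 * per-socket option memory accounting in sk_omem_alloc stays balanced; the
 * caller must remember the size it allocated. Sketch (opt and opt_len are
 * hypothetical protocol-private fields):
 *
 *	opt = sock_kmalloc(sk, opt_len, GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	... use opt ...
 *	sock_kfree_s(sk, opt, opt_len);
 */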
1470
1471/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1472   I think these locks should be removed for datagram sockets.
1473 */
1474static long sock_wait_for_wmem(struct sock *sk, long timeo)
1475{
1476	DEFINE_WAIT(wait);
1477
1478	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1479	for (;;) {
1480		if (!timeo)
1481			break;
1482		if (signal_pending(current))
1483			break;
1484		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1485		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1486		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1487			break;
1488		if (sk->sk_shutdown & SEND_SHUTDOWN)
1489			break;
1490		if (sk->sk_err)
1491			break;
1492		timeo = schedule_timeout(timeo);
1493	}
1494	finish_wait(sk_sleep(sk), &wait);
1495	return timeo;
1496}
1497
1498
1499/*
1500 *	Generic send/receive buffer handlers
1501 */
1502
1503struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1504				     unsigned long data_len, int noblock,
1505				     int *errcode)
1506{
1507	struct sk_buff *skb;
1508	gfp_t gfp_mask;
1509	long timeo;
1510	int err;
1511
1512	gfp_mask = sk->sk_allocation;
1513	if (gfp_mask & __GFP_WAIT)
1514		gfp_mask |= __GFP_REPEAT;
1515
1516	timeo = sock_sndtimeo(sk, noblock);
1517	while (1) {
1518		err = sock_error(sk);
1519		if (err != 0)
1520			goto failure;
1521
1522		err = -EPIPE;
1523		if (sk->sk_shutdown & SEND_SHUTDOWN)
1524			goto failure;
1525
1526		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1527			skb = alloc_skb(header_len, gfp_mask);
1528			if (skb) {
1529				int npages;
1530				int i;
1531
1532				/* No pages, we're done... */
1533				if (!data_len)
1534					break;
1535
1536				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1537				skb->truesize += data_len;
1538				skb_shinfo(skb)->nr_frags = npages;
1539				for (i = 0; i < npages; i++) {
1540					struct page *page;
1541
1542					page = alloc_pages(sk->sk_allocation, 0);
1543					if (!page) {
1544						err = -ENOBUFS;
1545						skb_shinfo(skb)->nr_frags = i;
1546						kfree_skb(skb);
1547						goto failure;
1548					}
1549
1550					__skb_fill_page_desc(skb, i,
1551							page, 0,
1552							(data_len >= PAGE_SIZE ?
1553							 PAGE_SIZE :
1554							 data_len));
1555					data_len -= PAGE_SIZE;
1556				}
1557
1558				/* Full success... */
1559				break;
1560			}
1561			err = -ENOBUFS;
1562			goto failure;
1563		}
1564		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1565		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1566		err = -EAGAIN;
1567		if (!timeo)
1568			goto failure;
1569		if (signal_pending(current))
1570			goto interrupted;
1571		timeo = sock_wait_for_wmem(sk, timeo);
1572	}
1573
1574	skb_set_owner_w(skb, sk);
1575	return skb;
1576
1577interrupted:
1578	err = sock_intr_errno(timeo);
1579failure:
1580	*errcode = err;
1581	return NULL;
1582}
1583EXPORT_SYMBOL(sock_alloc_send_pskb);
1584
1585struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1586				    int noblock, int *errcode)
1587{
1588	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1589}
1590EXPORT_SYMBOL(sock_alloc_send_skb);
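
/*
 * Datagram protocols typically build their transmit skbs through
 * sock_alloc_send_skb(), which blocks (subject to the socket's send timeout)
 * until write space is available. A rough sendmsg-side sketch ("hlen" stands
 * for whatever headroom the protocol reserves; error handling abbreviated):
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 */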
1591
1592static void __lock_sock(struct sock *sk)
1593	__releases(&sk->sk_lock.slock)
1594	__acquires(&sk->sk_lock.slock)
1595{
1596	DEFINE_WAIT(wait);
1597
1598	for (;;) {
1599		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1600					TASK_UNINTERRUPTIBLE);
1601		spin_unlock_bh(&sk->sk_lock.slock);
1602		schedule();
1603		spin_lock_bh(&sk->sk_lock.slock);
1604		if (!sock_owned_by_user(sk))
1605			break;
1606	}
1607	finish_wait(&sk->sk_lock.wq, &wait);
1608}
1609
1610static void __release_sock(struct sock *sk)
1611	__releases(&sk->sk_lock.slock)
1612	__acquires(&sk->sk_lock.slock)
1613{
1614	struct sk_buff *skb = sk->sk_backlog.head;
1615
1616	do {
1617		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1618		bh_unlock_sock(sk);
1619
1620		do {
1621			struct sk_buff *next = skb->next;
1622
1623			WARN_ON_ONCE(skb_dst_is_noref(skb));
1624			skb->next = NULL;
1625			sk_backlog_rcv(sk, skb);
1626
1627			/*
1628			 * We are in process context here with softirqs
1629			 * disabled, use cond_resched_softirq() to preempt.
1630			 * This is safe to do because we've taken the backlog
1631			 * queue private:
1632			 */
1633			cond_resched_softirq();
1634
1635			skb = next;
1636		} while (skb != NULL);
1637
1638		bh_lock_sock(sk);
1639	} while ((skb = sk->sk_backlog.head) != NULL);
1640
1641	/*
1642	 * Doing the zeroing here guarantees we cannot loop forever
1643	 * while a wild producer attempts to flood us.
1644	 */
1645	sk->sk_backlog.len = 0;
1646}
1647
1648/**
1649 * sk_wait_data - wait for data to arrive at sk_receive_queue
1650 * @sk:    sock to wait on
1651 * @timeo: for how long
1652 *
1653 * Now socket state, including sk->sk_err, is changed only under the lock,
1654 * hence we may omit checks after joining the wait queue.
1655 * We check the receive queue before schedule() only as an optimization;
1656 * it is very likely that release_sock() added new data.
1657 */
1658int sk_wait_data(struct sock *sk, long *timeo)
1659{
1660	int rc;
1661	DEFINE_WAIT(wait);
1662
1663	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1664	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1665	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1666	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1667	finish_wait(sk_sleep(sk), &wait);
1668	return rc;
1669}
1670EXPORT_SYMBOL(sk_wait_data);
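
/*
 * sk_wait_data() is meant to be called with the socket lock held from a
 * protocol's recvmsg path; the release_sock()/lock_sock() pair inside
 * sk_wait_event() lets the backlog drain while we sleep. A hedged sketch of
 * the usual loop (skb and timeo are the caller's locals):
 *
 *	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */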
1671
1672/**
1673 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1674 *	@sk: socket
1675 *	@size: memory size to allocate
1676 *	@kind: allocation type
1677 *
1678 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1679 *	rmem allocation. This function assumes that protocols which have
1680 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1681 */
1682int __sk_mem_schedule(struct sock *sk, int size, int kind)
1683{
1684	struct proto *prot = sk->sk_prot;
1685	int amt = sk_mem_pages(size);
1686	long allocated;
1687
1688	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1689	allocated = atomic_long_add_return(amt, prot->memory_allocated);
1690
1691	/* Under limit. */
1692	if (allocated <= prot->sysctl_mem[0]) {
1693		if (prot->memory_pressure && *prot->memory_pressure)
1694			*prot->memory_pressure = 0;
1695		return 1;
1696	}
1697
1698	/* Under pressure. */
1699	if (allocated > prot->sysctl_mem[1])
1700		if (prot->enter_memory_pressure)
1701			prot->enter_memory_pressure(sk);
1702
1703	/* Over hard limit. */
1704	if (allocated > prot->sysctl_mem[2])
1705		goto suppress_allocation;
1706
1707	/* guarantee minimum buffer size under pressure */
1708	if (kind == SK_MEM_RECV) {
1709		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1710			return 1;
1711	} else { /* SK_MEM_SEND */
1712		if (sk->sk_type == SOCK_STREAM) {
1713			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1714				return 1;
1715		} else if (atomic_read(&sk->sk_wmem_alloc) <
1716			   prot->sysctl_wmem[0])
1717				return 1;
1718	}
1719
1720	if (prot->memory_pressure) {
1721		int alloc;
1722
1723		if (!*prot->memory_pressure)
1724			return 1;
1725		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1726		if (prot->sysctl_mem[2] > alloc *
1727		    sk_mem_pages(sk->sk_wmem_queued +
1728				 atomic_read(&sk->sk_rmem_alloc) +
1729				 sk->sk_forward_alloc))
1730			return 1;
1731	}
1732
1733suppress_allocation:
1734
1735	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1736		sk_stream_moderate_sndbuf(sk);
1737
1738		/* Fail only if socket is _under_ its sndbuf.
1739		 * In this case we cannot block, so that we have to fail.
1740		 */
1741		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1742			return 1;
1743	}
1744
1745	trace_sock_exceed_buf_limit(sk, prot, allocated);
1746
1747	/* Alas. Undo changes. */
1748	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1749	atomic_long_sub(amt, prot->memory_allocated);
1750	return 0;
1751}
1752EXPORT_SYMBOL(__sk_mem_schedule);
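
/*
 * The accounting above works in whole-page quanta: sk_mem_pages() rounds the
 * request up to SK_MEM_QUANTUM (one page) units before charging
 * prot->memory_allocated. For example, on a 4 KiB page system, scheduling a
 * 1500 byte skb charges one quantum:
 *
 *	amt = sk_mem_pages(1500);            rounds up to 1 page
 *	sk->sk_forward_alloc += 4096;        per-socket forward credit
 *	memory_allocated     += 1;           protocol-wide, counted in quanta
 *
 * and later charges are first satisfied from sk_forward_alloc before another
 * quantum is requested.
 */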
1753
1754/**
1755 *	__sk_mem_reclaim - reclaim memory_allocated
1756 *	@sk: socket
1757 */
1758void __sk_mem_reclaim(struct sock *sk)
1759{
1760	struct proto *prot = sk->sk_prot;
1761
1762	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1763		   prot->memory_allocated);
1764	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1765
1766	if (prot->memory_pressure && *prot->memory_pressure &&
1767	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1768		*prot->memory_pressure = 0;
1769}
1770EXPORT_SYMBOL(__sk_mem_reclaim);
1771
1772
1773/*
1774 * Set of default routines for initialising struct proto_ops when
1775 * the protocol does not support a particular function. In certain
1776 * cases where it makes no sense for a protocol to have a "do nothing"
1777 * function, some default processing is provided.
1778 */
1779
1780int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1781{
1782	return -EOPNOTSUPP;
1783}
1784EXPORT_SYMBOL(sock_no_bind);
1785
1786int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1787		    int len, int flags)
1788{
1789	return -EOPNOTSUPP;
1790}
1791EXPORT_SYMBOL(sock_no_connect);
1792
1793int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1794{
1795	return -EOPNOTSUPP;
1796}
1797EXPORT_SYMBOL(sock_no_socketpair);
1798
1799int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1800{
1801	return -EOPNOTSUPP;
1802}
1803EXPORT_SYMBOL(sock_no_accept);
1804
1805int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1806		    int *len, int peer)
1807{
1808	return -EOPNOTSUPP;
1809}
1810EXPORT_SYMBOL(sock_no_getname);
1811
1812unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1813{
1814	return 0;
1815}
1816EXPORT_SYMBOL(sock_no_poll);
1817
1818int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1819{
1820	return -EOPNOTSUPP;
1821}
1822EXPORT_SYMBOL(sock_no_ioctl);
1823
1824int sock_no_listen(struct socket *sock, int backlog)
1825{
1826	return -EOPNOTSUPP;
1827}
1828EXPORT_SYMBOL(sock_no_listen);
1829
1830int sock_no_shutdown(struct socket *sock, int how)
1831{
1832	return -EOPNOTSUPP;
1833}
1834EXPORT_SYMBOL(sock_no_shutdown);
1835
1836int sock_no_setsockopt(struct socket *sock, int level, int optname,
1837		    char __user *optval, unsigned int optlen)
1838{
1839	return -EOPNOTSUPP;
1840}
1841EXPORT_SYMBOL(sock_no_setsockopt);
1842
1843int sock_no_getsockopt(struct socket *sock, int level, int optname,
1844		    char __user *optval, int __user *optlen)
1845{
1846	return -EOPNOTSUPP;
1847}
1848EXPORT_SYMBOL(sock_no_getsockopt);
1849
1850int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1851		    size_t len)
1852{
1853	return -EOPNOTSUPP;
1854}
1855EXPORT_SYMBOL(sock_no_sendmsg);
1856
1857int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1858		    size_t len, int flags)
1859{
1860	return -EOPNOTSUPP;
1861}
1862EXPORT_SYMBOL(sock_no_recvmsg);
1863
1864int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1865{
1866	/* Mirror missing mmap method error code */
1867	return -ENODEV;
1868}
1869EXPORT_SYMBOL(sock_no_mmap);
1870
1871ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1872{
1873	ssize_t res;
1874	struct msghdr msg = {.msg_flags = flags};
1875	struct kvec iov;
1876	char *kaddr = kmap(page);
1877	iov.iov_base = kaddr + offset;
1878	iov.iov_len = size;
1879	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1880	kunmap(page);
1881	return res;
1882}
1883EXPORT_SYMBOL(sock_no_sendpage);
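
/*
 * The sock_no_*() stubs above are meant to fill the unsupported slots of a
 * family's proto_ops table. A hedged sketch (foo_ops, PF_FOO and foo_release
 * are hypothetical):
 *
 *	static const struct proto_ops foo_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.release	= foo_release,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */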
1884
1885/*
1886 *	Default Socket Callbacks
1887 */
1888
1889static void sock_def_wakeup(struct sock *sk)
1890{
1891	struct socket_wq *wq;
1892
1893	rcu_read_lock();
1894	wq = rcu_dereference(sk->sk_wq);
1895	if (wq_has_sleeper(wq))
1896		wake_up_interruptible_all(&wq->wait);
1897	rcu_read_unlock();
1898}
1899
1900static void sock_def_error_report(struct sock *sk)
1901{
1902	struct socket_wq *wq;
1903
1904	rcu_read_lock();
1905	wq = rcu_dereference(sk->sk_wq);
1906	if (wq_has_sleeper(wq))
1907		wake_up_interruptible_poll(&wq->wait, POLLERR);
1908	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1909	rcu_read_unlock();
1910}
1911
1912static void sock_def_readable(struct sock *sk, int len)
1913{
1914	struct socket_wq *wq;
1915
1916	rcu_read_lock();
1917	wq = rcu_dereference(sk->sk_wq);
1918	if (wq_has_sleeper(wq))
1919		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1920						POLLRDNORM | POLLRDBAND);
1921	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1922	rcu_read_unlock();
1923}
1924
1925static void sock_def_write_space(struct sock *sk)
1926{
1927	struct socket_wq *wq;
1928
1929	rcu_read_lock();
1930
1931	/* Do not wake up a writer until he can make "significant"
1932	 * progress.  --DaveM
1933	 */
1934	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1935		wq = rcu_dereference(sk->sk_wq);
1936		if (wq_has_sleeper(wq))
1937			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1938						POLLWRNORM | POLLWRBAND);
1939
1940		/* Should agree with poll, otherwise some programs break */
1941		if (sock_writeable(sk))
1942			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1943	}
1944
1945	rcu_read_unlock();
1946}
1947
1948static void sock_def_destruct(struct sock *sk)
1949{
1950	kfree(sk->sk_protinfo);
1951}
1952
1953void sk_send_sigurg(struct sock *sk)
1954{
1955	if (sk->sk_socket && sk->sk_socket->file)
1956		if (send_sigurg(&sk->sk_socket->file->f_owner))
1957			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1958}
1959EXPORT_SYMBOL(sk_send_sigurg);
1960
1961void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1962		    unsigned long expires)
1963{
1964	if (!mod_timer(timer, expires))
1965		sock_hold(sk);
1966}
1967EXPORT_SYMBOL(sk_reset_timer);
1968
1969void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1970{
1971	if (timer_pending(timer) && del_timer(timer))
1972		__sock_put(sk);
1973}
1974EXPORT_SYMBOL(sk_stop_timer);
1975
1976void sock_init_data(struct socket *sock, struct sock *sk)
1977{
1978	skb_queue_head_init(&sk->sk_receive_queue);
1979	skb_queue_head_init(&sk->sk_write_queue);
1980	skb_queue_head_init(&sk->sk_error_queue);
1981#ifdef CONFIG_NET_DMA
1982	skb_queue_head_init(&sk->sk_async_wait_queue);
1983#endif
1984
1985	sk->sk_send_head	=	NULL;
1986
1987	init_timer(&sk->sk_timer);
1988
1989	sk->sk_allocation	=	GFP_KERNEL;
1990	sk->sk_rcvbuf		=	sysctl_rmem_default;
1991	sk->sk_sndbuf		=	sysctl_wmem_default;
1992	sk->sk_state		=	TCP_CLOSE;
1993	sk_set_socket(sk, sock);
1994
1995	sock_set_flag(sk, SOCK_ZAPPED);
1996
1997	if (sock) {
1998		sk->sk_type	=	sock->type;
1999		sk->sk_wq	=	sock->wq;
2000		sock->sk	=	sk;
2001	} else
2002		sk->sk_wq	=	NULL;
2003
2004	spin_lock_init(&sk->sk_dst_lock);
2005	rwlock_init(&sk->sk_callback_lock);
2006	lockdep_set_class_and_name(&sk->sk_callback_lock,
2007			af_callback_keys + sk->sk_family,
2008			af_family_clock_key_strings[sk->sk_family]);
2009
2010	sk->sk_state_change	=	sock_def_wakeup;
2011	sk->sk_data_ready	=	sock_def_readable;
2012	sk->sk_write_space	=	sock_def_write_space;
2013	sk->sk_error_report	=	sock_def_error_report;
2014	sk->sk_destruct		=	sock_def_destruct;
2015
2016	sk->sk_sndmsg_page	=	NULL;
2017	sk->sk_sndmsg_off	=	0;
2018
2019	sk->sk_peer_pid 	=	NULL;
2020	sk->sk_peer_cred	=	NULL;
2021	sk->sk_write_pending	=	0;
2022	sk->sk_rcvlowat		=	1;
2023	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2024	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2025
2026	sk->sk_stamp = ktime_set(-1L, 0);
2027
2028	/*
2029	 * Before updating sk_refcnt, we must commit prior changes to memory
2030	 * (Documentation/RCU/rculist_nulls.txt for details)
2031	 */
2032	smp_wmb();
2033	atomic_set(&sk->sk_refcnt, 1);
2034	atomic_set(&sk->sk_drops, 0);
2035}
2036EXPORT_SYMBOL(sock_init_data);
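
/*
 * Illustrative (hypothetical) use from a protocol's socket creation path:
 * initialise the generic fields first, then override whichever of the
 * default callbacks the protocol needs, e.g.:
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct		= foo_sock_destruct;
 *	sk->sk_write_space	= foo_write_space;
 */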
2037
2038void lock_sock_nested(struct sock *sk, int subclass)
2039{
2040	might_sleep();
2041	spin_lock_bh(&sk->sk_lock.slock);
2042	if (sk->sk_lock.owned)
2043		__lock_sock(sk);
2044	sk->sk_lock.owned = 1;
2045	spin_unlock(&sk->sk_lock.slock);
2046	/*
2047	 * The sk_lock has mutex_lock() semantics here:
2048	 */
2049	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2050	local_bh_enable();
2051}
2052EXPORT_SYMBOL(lock_sock_nested);
2053
2054void release_sock(struct sock *sk)
2055{
2056	/*
2057	 * The sk_lock has mutex_unlock() semantics:
2058	 */
2059	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2060
2061	spin_lock_bh(&sk->sk_lock.slock);
2062	if (sk->sk_backlog.tail)
2063		__release_sock(sk);
2064	sk->sk_lock.owned = 0;
2065	if (waitqueue_active(&sk->sk_lock.wq))
2066		wake_up(&sk->sk_lock.wq);
2067	spin_unlock_bh(&sk->sk_lock.slock);
2068}
2069EXPORT_SYMBOL(release_sock);
2070
2071/**
2072 * lock_sock_fast - fast version of lock_sock
2073 * @sk: socket
2074 *
2075 * This version should be used for very small sections, where the process
2076 * won't block; a usage sketch follows the function below. Returns false if
2077 * the fast path is taken (sk_lock.slock locked, owned = 0, BH disabled) and
2078 * true if the slow path is taken (sk_lock.slock unlocked, owned = 1, BH
2079 * enabled).
2080 */
2081bool lock_sock_fast(struct sock *sk)
2082{
2083	might_sleep();
2084	spin_lock_bh(&sk->sk_lock.slock);
2085
2086	if (!sk->sk_lock.owned)
2087		/*
2088		 * Note : we return with BH disabled; unlock_sock_fast() re-enables it.
2089		 */
2090		return false;
2091
2092	__lock_sock(sk);
2093	sk->sk_lock.owned = 1;
2094	spin_unlock(&sk->sk_lock.slock);
2095	/*
2096	 * The sk_lock has mutex_lock() semantics here:
2097	 */
2098	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2099	local_bh_enable();
2100	return true;
2101}
2102EXPORT_SYMBOL(lock_sock_fast);
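
/*
 * Usage sketch (illustrative): the flag returned by lock_sock_fast() must be
 * handed back to unlock_sock_fast(), which either drops the spinlock and
 * re-enables BH (fast path) or calls release_sock() (slow path):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */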
2103
2104int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2105{
2106	struct timeval tv;
2107	if (!sock_flag(sk, SOCK_TIMESTAMP))
2108		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2109	tv = ktime_to_timeval(sk->sk_stamp);
2110	if (tv.tv_sec == -1)
2111		return -ENOENT;
2112	if (tv.tv_sec == 0) {
2113		sk->sk_stamp = ktime_get_real();
2114		tv = ktime_to_timeval(sk->sk_stamp);
2115	}
2116	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2117}
2118EXPORT_SYMBOL(sock_get_timestamp);
2119
2120int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2121{
2122	struct timespec ts;
2123	if (!sock_flag(sk, SOCK_TIMESTAMP))
2124		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2125	ts = ktime_to_timespec(sk->sk_stamp);
2126	if (ts.tv_sec == -1)
2127		return -ENOENT;
2128	if (ts.tv_sec == 0) {
2129		sk->sk_stamp = ktime_get_real();
2130		ts = ktime_to_timespec(sk->sk_stamp);
2131	}
2132	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2133}
2134EXPORT_SYMBOL(sock_get_timestampns);
2135
2136void sock_enable_timestamp(struct sock *sk, int flag)
2137{
2138	if (!sock_flag(sk, flag)) {
2139		sock_set_flag(sk, flag);
2140		/*
2141		 * we just set one of the two flags which require net
2142		 * time stamping, but time stamping might have been on
2143		 * already because of the other one
2144		 */
2145		if (!sock_flag(sk,
2146				flag == SOCK_TIMESTAMP ?
2147				SOCK_TIMESTAMPING_RX_SOFTWARE :
2148				SOCK_TIMESTAMP))
2149			net_enable_timestamp();
2150	}
2151}
2152
2153/*
2154 *	Get a socket option on a socket.
2155 *
2156 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2157 *	asynchronous errors should be reported by getsockopt. We assume
2158 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2159 */
2160int sock_common_getsockopt(struct socket *sock, int level, int optname,
2161			   char __user *optval, int __user *optlen)
2162{
2163	struct sock *sk = sock->sk;
2164
2165	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2166}
2167EXPORT_SYMBOL(sock_common_getsockopt);
2168
2169#ifdef CONFIG_COMPAT
2170int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2171				  char __user *optval, int __user *optlen)
2172{
2173	struct sock *sk = sock->sk;
2174
2175	if (sk->sk_prot->compat_getsockopt != NULL)
2176		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2177						      optval, optlen);
2178	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2179}
2180EXPORT_SYMBOL(compat_sock_common_getsockopt);
2181#endif
2182
2183int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2184			struct msghdr *msg, size_t size, int flags)
2185{
2186	struct sock *sk = sock->sk;
2187	int addr_len = 0;
2188	int err;
2189
2190	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2191				   flags & ~MSG_DONTWAIT, &addr_len);
2192	if (err >= 0)
2193		msg->msg_namelen = addr_len;
2194	return err;
2195}
2196EXPORT_SYMBOL(sock_common_recvmsg);
2197
2198/*
2199 *	Set socket options on a socket.
2200 */
2201int sock_common_setsockopt(struct socket *sock, int level, int optname,
2202			   char __user *optval, unsigned int optlen)
2203{
2204	struct sock *sk = sock->sk;
2205
2206	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2207}
2208EXPORT_SYMBOL(sock_common_setsockopt);
2209
2210#ifdef CONFIG_COMPAT
2211int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2212				  char __user *optval, unsigned int optlen)
2213{
2214	struct sock *sk = sock->sk;
2215
2216	if (sk->sk_prot->compat_setsockopt != NULL)
2217		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2218						      optval, optlen);
2219	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2220}
2221EXPORT_SYMBOL(compat_sock_common_setsockopt);
2222#endif
2223
2224void sk_common_release(struct sock *sk)
2225{
2226	if (sk->sk_prot->destroy)
2227		sk->sk_prot->destroy(sk);
2228
2229	/*
2230	 * Observation: when sk_common_release() is called, processes have
2231	 * no access to the socket, but the network stack still does.
2232	 * Step one, detach it from networking:
2233	 *
2234	 * A. Remove from hash tables.
2235	 */
2236
2237	sk->sk_prot->unhash(sk);
2238
2239	/*
2240	 * At this point the socket cannot receive new packets, but it is possible
2241	 * that some packets are still in flight, because a CPU running the receive
2242	 * path did its hash table lookup before we unhashed the socket. They will
2243	 * reach the receive queue and be purged by the socket destructor.
2244	 *
2245	 * We also still have packets pending on the receive queue and, probably,
2246	 * our own packets waiting in device queues. sock_destroy will drain the
2247	 * receive queue, but transmitted packets will delay socket destruction
2248	 * until the last reference is released.
2249	 */
2250
2251	sock_orphan(sk);
2252
2253	xfrm_sk_free_policy(sk);
2254
2255	sk_refcnt_debug_release(sk);
2256	sock_put(sk);
2257}
2258EXPORT_SYMBOL(sk_common_release);
2259
2260static DEFINE_RWLOCK(proto_list_lock);
2261static LIST_HEAD(proto_list);
2262
2263#ifdef CONFIG_PROC_FS
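/*
 * Per-cpu "sockets in use" counters, one slot per registered protocol (and
 * kept per network namespace when CONFIG_NET_NS is set); sock_prot_inuse_get()
 * sums them for the "sockets" column of /proc/net/protocols.
 */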
2264#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2265struct prot_inuse {
2266	int val[PROTO_INUSE_NR];
2267};
2268
2269static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2270
2271#ifdef CONFIG_NET_NS
2272void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2273{
2274	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2275}
2276EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2277
2278int sock_prot_inuse_get(struct net *net, struct proto *prot)
2279{
2280	int cpu, idx = prot->inuse_idx;
2281	int res = 0;
2282
2283	for_each_possible_cpu(cpu)
2284		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2285
2286	return res >= 0 ? res : 0;
2287}
2288EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2289
2290static int __net_init sock_inuse_init_net(struct net *net)
2291{
2292	net->core.inuse = alloc_percpu(struct prot_inuse);
2293	return net->core.inuse ? 0 : -ENOMEM;
2294}
2295
2296static void __net_exit sock_inuse_exit_net(struct net *net)
2297{
2298	free_percpu(net->core.inuse);
2299}
2300
2301static struct pernet_operations net_inuse_ops = {
2302	.init = sock_inuse_init_net,
2303	.exit = sock_inuse_exit_net,
2304};
2305
2306static __init int net_inuse_init(void)
2307{
2308	if (register_pernet_subsys(&net_inuse_ops))
2309		panic("Cannot initialize net inuse counters");
2310
2311	return 0;
2312}
2313
2314core_initcall(net_inuse_init);
2315#else
2316static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2317
2318void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2319{
2320	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2321}
2322EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2323
2324int sock_prot_inuse_get(struct net *net, struct proto *prot)
2325{
2326	int cpu, idx = prot->inuse_idx;
2327	int res = 0;
2328
2329	for_each_possible_cpu(cpu)
2330		res += per_cpu(prot_inuse, cpu).val[idx];
2331
2332	return res >= 0 ? res : 0;
2333}
2334EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2335#endif
2336
2337static void assign_proto_idx(struct proto *prot)
2338{
2339	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2340
2341	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2342		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2343		return;
2344	}
2345
2346	set_bit(prot->inuse_idx, proto_inuse_idx);
2347}
2348
2349static void release_proto_idx(struct proto *prot)
2350{
2351	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2352		clear_bit(prot->inuse_idx, proto_inuse_idx);
2353}
2354#else
2355static inline void assign_proto_idx(struct proto *prot)
2356{
2357}
2358
2359static inline void release_proto_idx(struct proto *prot)
2360{
2361}
2362#endif
2363
2364int proto_register(struct proto *prot, int alloc_slab)
2365{
2366	if (alloc_slab) {
2367		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2368					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2369					NULL);
2370
2371		if (prot->slab == NULL) {
2372			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2373			       prot->name);
2374			goto out;
2375		}
2376
2377		if (prot->rsk_prot != NULL) {
2378			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2379			if (prot->rsk_prot->slab_name == NULL)
2380				goto out_free_sock_slab;
2381
2382			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2383								 prot->rsk_prot->obj_size, 0,
2384								 SLAB_HWCACHE_ALIGN, NULL);
2385
2386			if (prot->rsk_prot->slab == NULL) {
2387				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2388				       prot->name);
2389				goto out_free_request_sock_slab_name;
2390			}
2391		}
2392
2393		if (prot->twsk_prot != NULL) {
2394			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2395
2396			if (prot->twsk_prot->twsk_slab_name == NULL)
2397				goto out_free_request_sock_slab;
2398
2399			prot->twsk_prot->twsk_slab =
2400				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2401						  prot->twsk_prot->twsk_obj_size,
2402						  0,
2403						  SLAB_HWCACHE_ALIGN |
2404							prot->slab_flags,
2405						  NULL);
2406			if (prot->twsk_prot->twsk_slab == NULL)
2407				goto out_free_timewait_sock_slab_name;
2408		}
2409	}
2410
2411	write_lock(&proto_list_lock);
2412	list_add(&prot->node, &proto_list);
2413	assign_proto_idx(prot);
2414	write_unlock(&proto_list_lock);
2415	return 0;
2416
2417out_free_timewait_sock_slab_name:
2418	kfree(prot->twsk_prot->twsk_slab_name);
2419out_free_request_sock_slab:
2420	if (prot->rsk_prot && prot->rsk_prot->slab) {
2421		kmem_cache_destroy(prot->rsk_prot->slab);
2422		prot->rsk_prot->slab = NULL;
2423	}
2424out_free_request_sock_slab_name:
2425	if (prot->rsk_prot)
2426		kfree(prot->rsk_prot->slab_name);
2427out_free_sock_slab:
2428	kmem_cache_destroy(prot->slab);
2429	prot->slab = NULL;
2430out:
2431	return -ENOBUFS;
2432}
2433EXPORT_SYMBOL(proto_register);
2434
2435void proto_unregister(struct proto *prot)
2436{
2437	write_lock(&proto_list_lock);
2438	release_proto_idx(prot);
2439	list_del(&prot->node);
2440	write_unlock(&proto_list_lock);
2441
2442	if (prot->slab != NULL) {
2443		kmem_cache_destroy(prot->slab);
2444		prot->slab = NULL;
2445	}
2446
2447	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2448		kmem_cache_destroy(prot->rsk_prot->slab);
2449		kfree(prot->rsk_prot->slab_name);
2450		prot->rsk_prot->slab = NULL;
2451	}
2452
2453	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2454		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2455		kfree(prot->twsk_prot->twsk_slab_name);
2456		prot->twsk_prot->twsk_slab = NULL;
2457	}
2458}
2459EXPORT_SYMBOL(proto_unregister);
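
/*
 * Illustrative (hypothetical) registration from a protocol module; the names
 * below are examples only:
 *
 *	static struct proto foo_proto = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_proto, 1);	(1 => create a dedicated slab)
 *	...
 *	proto_unregister(&foo_proto);
 */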
2460
2461#ifdef CONFIG_PROC_FS
2462static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2463	__acquires(proto_list_lock)
2464{
2465	read_lock(&proto_list_lock);
2466	return seq_list_start_head(&proto_list, *pos);
2467}
2468
2469static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2470{
2471	return seq_list_next(v, &proto_list, pos);
2472}
2473
2474static void proto_seq_stop(struct seq_file *seq, void *v)
2475	__releases(proto_list_lock)
2476{
2477	read_unlock(&proto_list_lock);
2478}
2479
2480static char proto_method_implemented(const void *method)
2481{
2482	return method == NULL ? 'n' : 'y';
2483}
2484
2485static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2486{
2487	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2488			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2489		   proto->name,
2490		   proto->obj_size,
2491		   sock_prot_inuse_get(seq_file_net(seq), proto),
2492		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
2493		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2494		   proto->max_header,
2495		   proto->slab == NULL ? "no" : "yes",
2496		   module_name(proto->owner),
2497		   proto_method_implemented(proto->close),
2498		   proto_method_implemented(proto->connect),
2499		   proto_method_implemented(proto->disconnect),
2500		   proto_method_implemented(proto->accept),
2501		   proto_method_implemented(proto->ioctl),
2502		   proto_method_implemented(proto->init),
2503		   proto_method_implemented(proto->destroy),
2504		   proto_method_implemented(proto->shutdown),
2505		   proto_method_implemented(proto->setsockopt),
2506		   proto_method_implemented(proto->getsockopt),
2507		   proto_method_implemented(proto->sendmsg),
2508		   proto_method_implemented(proto->recvmsg),
2509		   proto_method_implemented(proto->sendpage),
2510		   proto_method_implemented(proto->bind),
2511		   proto_method_implemented(proto->backlog_rcv),
2512		   proto_method_implemented(proto->hash),
2513		   proto_method_implemented(proto->unhash),
2514		   proto_method_implemented(proto->get_port),
2515		   proto_method_implemented(proto->enter_memory_pressure));
2516}
2517
2518static int proto_seq_show(struct seq_file *seq, void *v)
2519{
2520	if (v == &proto_list)
2521		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2522			   "protocol",
2523			   "size",
2524			   "sockets",
2525			   "memory",
2526			   "press",
2527			   "maxhdr",
2528			   "slab",
2529			   "module",
2530			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2531	else
2532		proto_seq_printf(seq, list_entry(v, struct proto, node));
2533	return 0;
2534}
2535
2536static const struct seq_operations proto_seq_ops = {
2537	.start  = proto_seq_start,
2538	.next   = proto_seq_next,
2539	.stop   = proto_seq_stop,
2540	.show   = proto_seq_show,
2541};
2542
2543static int proto_seq_open(struct inode *inode, struct file *file)
2544{
2545	return seq_open_net(inode, file, &proto_seq_ops,
2546			    sizeof(struct seq_net_private));
2547}
2548
2549static const struct file_operations proto_seq_fops = {
2550	.owner		= THIS_MODULE,
2551	.open		= proto_seq_open,
2552	.read		= seq_read,
2553	.llseek		= seq_lseek,
2554	.release	= seq_release_net,
2555};
2556
2557static __net_init int proto_init_net(struct net *net)
2558{
2559	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2560		return -ENOMEM;
2561
2562	return 0;
2563}
2564
2565static __net_exit void proto_exit_net(struct net *net)
2566{
2567	proc_net_remove(net, "protocols");
2568}
2569
2570
2571static __net_initdata struct pernet_operations proto_net_ops = {
2572	.init = proto_init_net,
2573	.exit = proto_exit_net,
2574};
2575
2576static int __init proto_init(void)
2577{
2578	return register_pernet_subsys(&proto_net_ops);
2579}
2580
2581subsys_initcall(proto_init);
2582
2583#endif /* PROC_FS */
2584