sock.c revision 5bc1421e34ecfe0bd4b26dc3232b7d5e25179144
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
121#include <net/net_namespace.h>
122#include <net/request_sock.h>
123#include <net/sock.h>
124#include <linux/net_tstamp.h>
125#include <net/xfrm.h>
126#include <linux/ipsec.h>
127#include <net/cls_cgroup.h>
128#include <net/netprio_cgroup.h>
129
130#include <linux/filter.h>
131
132#include <trace/events/sock.h>
133
134#ifdef CONFIG_INET
135#include <net/tcp.h>
136#endif
137
138/*
139 * Each address family might have different locking rules, so we have
140 * one slock key per address family:
141 */
142static struct lock_class_key af_family_keys[AF_MAX];
143static struct lock_class_key af_family_slock_keys[AF_MAX];
144
145/*
146 * Make lock validator output more readable. (We pre-construct these
147 * strings at build time, so that runtime initialization of socket
148 * locks is fast):
149 */
150static const char *const af_family_key_strings[AF_MAX+1] = {
151  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
152  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
153  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
154  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
155  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
156  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
157  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
158  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
159  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
160  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
161  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
162  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
163  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
164  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
165};
166static const char *const af_family_slock_key_strings[AF_MAX+1] = {
167  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
168  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
169  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
170  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
171  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
172  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
173  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
174  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
175  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
176  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
177  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
178  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
179  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
180  "slock-AF_NFC"   , "slock-AF_MAX"
181};
182static const char *const af_family_clock_key_strings[AF_MAX+1] = {
183  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
184  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
185  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
186  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
187  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
188  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
189  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
190  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
191  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
192  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
193  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
194  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
195  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
196  "clock-AF_NFC"   , "clock-AF_MAX"
197};
198
199/*
200 * sk_callback_lock locking rules are per-address-family,
201 * so split the lock classes by using a per-AF key:
202 */
203static struct lock_class_key af_callback_keys[AF_MAX];
204
205/* Take into consideration the size of the struct sk_buff overhead in the
206 * determination of these values, since that is non-constant across
207 * platforms.  This makes socket queueing behavior and performance
208 * not depend upon such differences.
209 */
210#define _SK_MEM_PACKETS		256
211#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
212#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
213#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
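
/*
 * For illustration: SKB_TRUESIZE(256) is 256 bytes of payload plus the
 * aligned sizes of struct sk_buff and struct skb_shared_info, so the
 * defaults above budget for roughly _SK_MEM_PACKETS (256) such small
 * packets per socket; the resulting byte value varies with architecture
 * and struct layout rather than being a fixed constant.
 */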
214
215/* Run time adjustable parameters. */
216__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
217__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
218__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
219__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
220
221/* Maximal space eaten by iovec or ancillary data plus some space */
222int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
223EXPORT_SYMBOL(sysctl_optmem_max);
224
225#if defined(CONFIG_CGROUPS)
226#if !defined(CONFIG_NET_CLS_CGROUP)
227int net_cls_subsys_id = -1;
228EXPORT_SYMBOL_GPL(net_cls_subsys_id);
229#endif
230#if !defined(CONFIG_NETPRIO_CGROUP)
231int net_prio_subsys_id = -1;
232EXPORT_SYMBOL_GPL(net_prio_subsys_id);
233#endif
234#endif
235
236static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
237{
238	struct timeval tv;
239
240	if (optlen < sizeof(tv))
241		return -EINVAL;
242	if (copy_from_user(&tv, optval, sizeof(tv)))
243		return -EFAULT;
244	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
245		return -EDOM;
246
247	if (tv.tv_sec < 0) {
248		static int warned __read_mostly;
249
250		*timeo_p = 0;
251		if (warned < 10 && net_ratelimit()) {
252			warned++;
253			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
254			       "tries to set negative timeout\n",
255				current->comm, task_pid_nr(current));
256		}
257		return 0;
258	}
259	*timeo_p = MAX_SCHEDULE_TIMEOUT;
260	if (tv.tv_sec == 0 && tv.tv_usec == 0)
261		return 0;
262	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
263		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
264	return 0;
265}
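
/*
 * For illustration, with HZ == 1000 a timeout of
 * { .tv_sec = 1, .tv_usec = 500000 } becomes
 *
 *	1 * HZ + (500000 + 999) / 1000 = 1000 + 500 = 1500 jiffies
 *
 * The rounding term ensures that any non-zero microsecond remainder
 * adds at least one tick, so small positive timeouts never round down
 * to zero jiffies.
 */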
266
267static void sock_warn_obsolete_bsdism(const char *name)
268{
269	static int warned;
270	static char warncomm[TASK_COMM_LEN];
271	if (strcmp(warncomm, current->comm) && warned < 5) {
272		strcpy(warncomm,  current->comm);
273		printk(KERN_WARNING "process `%s' is using obsolete "
274		       "%s SO_BSDCOMPAT\n", warncomm, name);
275		warned++;
276	}
277}
278
279static void sock_disable_timestamp(struct sock *sk, int flag)
280{
281	if (sock_flag(sk, flag)) {
282		sock_reset_flag(sk, flag);
283		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
284		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
285			net_disable_timestamp();
286		}
287	}
288}
289
290
291int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
292{
293	int err;
294	int skb_len;
295	unsigned long flags;
296	struct sk_buff_head *list = &sk->sk_receive_queue;
297
298	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
299	   number of warnings when compiling with -W --ANK
300	 */
301	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
302	    (unsigned)sk->sk_rcvbuf) {
303		atomic_inc(&sk->sk_drops);
304		trace_sock_rcvqueue_full(sk, skb);
305		return -ENOMEM;
306	}
307
308	err = sk_filter(sk, skb);
309	if (err)
310		return err;
311
312	if (!sk_rmem_schedule(sk, skb->truesize)) {
313		atomic_inc(&sk->sk_drops);
314		return -ENOBUFS;
315	}
316
317	skb->dev = NULL;
318	skb_set_owner_r(skb, sk);
319
320	/* Cache the SKB length before we tack it onto the receive
321	 * queue.  Once it is added it no longer belongs to us and
322	 * may be freed by other threads of control pulling packets
323	 * from the queue.
324	 */
325	skb_len = skb->len;
326
327	/* We escape from the RCU-protected region, so make sure we don't
328	 * leak a non-refcounted dst
329	 */
330	skb_dst_force(skb);
331
332	spin_lock_irqsave(&list->lock, flags);
333	skb->dropcount = atomic_read(&sk->sk_drops);
334	__skb_queue_tail(list, skb);
335	spin_unlock_irqrestore(&list->lock, flags);
336
337	if (!sock_flag(sk, SOCK_DEAD))
338		sk->sk_data_ready(sk, skb_len);
339	return 0;
340}
341EXPORT_SYMBOL(sock_queue_rcv_skb);
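
/*
 * A rough usage sketch (my_proto_rcv() and my_proto_lookup_sock() are
 * hypothetical; the lookup is assumed to return a referenced sock): a
 * simple datagram protocol's input path hands packets to the owning
 * socket with sock_queue_rcv_skb(), freeing the skb itself on failure
 * since this function does not consume the buffer on error:
 *
 *	static int my_proto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = my_proto_lookup_sock(skb);
 *
 *		if (sk == NULL)
 *			goto drop;
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			sock_put(sk);
 *			goto drop;
 *		}
 *		sock_put(sk);
 *		return NET_RX_SUCCESS;
 *	drop:
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 */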
342
343int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
344{
345	int rc = NET_RX_SUCCESS;
346
347	if (sk_filter(sk, skb))
348		goto discard_and_relse;
349
350	skb->dev = NULL;
351
352	if (sk_rcvqueues_full(sk, skb)) {
353		atomic_inc(&sk->sk_drops);
354		goto discard_and_relse;
355	}
356	if (nested)
357		bh_lock_sock_nested(sk);
358	else
359		bh_lock_sock(sk);
360	if (!sock_owned_by_user(sk)) {
361		/*
362		 * trylock + unlock semantics:
363		 */
364		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
365
366		rc = sk_backlog_rcv(sk, skb);
367
368		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
369	} else if (sk_add_backlog(sk, skb)) {
370		bh_unlock_sock(sk);
371		atomic_inc(&sk->sk_drops);
372		goto discard_and_relse;
373	}
374
375	bh_unlock_sock(sk);
376out:
377	sock_put(sk);
378	return rc;
379discard_and_relse:
380	kfree_skb(skb);
381	goto out;
382}
383EXPORT_SYMBOL(sk_receive_skb);
384
385void sk_reset_txq(struct sock *sk)
386{
387	sk_tx_queue_clear(sk);
388}
389EXPORT_SYMBOL(sk_reset_txq);
390
391struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
392{
393	struct dst_entry *dst = __sk_dst_get(sk);
394
395	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
396		sk_tx_queue_clear(sk);
397		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
398		dst_release(dst);
399		return NULL;
400	}
401
402	return dst;
403}
404EXPORT_SYMBOL(__sk_dst_check);
405
406struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
407{
408	struct dst_entry *dst = sk_dst_get(sk);
409
410	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
411		sk_dst_reset(sk);
412		dst_release(dst);
413		return NULL;
414	}
415
416	return dst;
417}
418EXPORT_SYMBOL(sk_dst_check);
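
/*
 * A rough usage sketch (my_proto_route_output() is hypothetical): a
 * transmit path typically revalidates its cached route with
 * __sk_dst_check() and re-resolves it when the cache was invalidated.
 * IPv4 callers pass 0 as the cookie; IPv6 typically passes a per-route
 * serial number.
 *
 *	struct dst_entry *dst = __sk_dst_check(sk, 0);
 *
 *	if (dst == NULL) {
 *		dst = my_proto_route_output(sk);
 *		if (IS_ERR(dst))
 *			return PTR_ERR(dst);
 *		sk_setup_caps(sk, dst);
 *	}
 */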
419
420static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
421{
422	int ret = -ENOPROTOOPT;
423#ifdef CONFIG_NETDEVICES
424	struct net *net = sock_net(sk);
425	char devname[IFNAMSIZ];
426	int index;
427
428	/* Sorry... */
429	ret = -EPERM;
430	if (!capable(CAP_NET_RAW))
431		goto out;
432
433	ret = -EINVAL;
434	if (optlen < 0)
435		goto out;
436
437	/* Bind this socket to a particular device like "eth0",
438	 * as specified in the passed interface name. If the
439	 * name is "" or the option length is zero the socket
440	 * is not bound.
441	 */
442	if (optlen > IFNAMSIZ - 1)
443		optlen = IFNAMSIZ - 1;
444	memset(devname, 0, sizeof(devname));
445
446	ret = -EFAULT;
447	if (copy_from_user(devname, optval, optlen))
448		goto out;
449
450	index = 0;
451	if (devname[0] != '\0') {
452		struct net_device *dev;
453
454		rcu_read_lock();
455		dev = dev_get_by_name_rcu(net, devname);
456		if (dev)
457			index = dev->ifindex;
458		rcu_read_unlock();
459		ret = -ENODEV;
460		if (!dev)
461			goto out;
462	}
463
464	lock_sock(sk);
465	sk->sk_bound_dev_if = index;
466	sk_dst_reset(sk);
467	release_sock(sk);
468
469	ret = 0;
470
471out:
472#endif
473
474	return ret;
475}
476
477static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
478{
479	if (valbool)
480		sock_set_flag(sk, bit);
481	else
482		sock_reset_flag(sk, bit);
483}
484
485/*
486 *	This is meant for all protocols to use and covers goings on
487 *	at the socket level. Everything here is generic.
488 */
489
490int sock_setsockopt(struct socket *sock, int level, int optname,
491		    char __user *optval, unsigned int optlen)
492{
493	struct sock *sk = sock->sk;
494	int val;
495	int valbool;
496	struct linger ling;
497	int ret = 0;
498
499	/*
500	 *	Options without arguments
501	 */
502
503	if (optname == SO_BINDTODEVICE)
504		return sock_bindtodevice(sk, optval, optlen);
505
506	if (optlen < sizeof(int))
507		return -EINVAL;
508
509	if (get_user(val, (int __user *)optval))
510		return -EFAULT;
511
512	valbool = val ? 1 : 0;
513
514	lock_sock(sk);
515
516	switch (optname) {
517	case SO_DEBUG:
518		if (val && !capable(CAP_NET_ADMIN))
519			ret = -EACCES;
520		else
521			sock_valbool_flag(sk, SOCK_DBG, valbool);
522		break;
523	case SO_REUSEADDR:
524		sk->sk_reuse = valbool;
525		break;
526	case SO_TYPE:
527	case SO_PROTOCOL:
528	case SO_DOMAIN:
529	case SO_ERROR:
530		ret = -ENOPROTOOPT;
531		break;
532	case SO_DONTROUTE:
533		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
534		break;
535	case SO_BROADCAST:
536		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
537		break;
538	case SO_SNDBUF:
539		/* Don't return an error on this; BSD doesn't, and if you
540		   think about it, this is right. Otherwise apps have to
541		   play 'guess the biggest size' games. RCVBUF/SNDBUF
542		   are treated in BSD as hints */
543
544		if (val > sysctl_wmem_max)
545			val = sysctl_wmem_max;
546set_sndbuf:
547		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
548		if ((val * 2) < SOCK_MIN_SNDBUF)
549			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
550		else
551			sk->sk_sndbuf = val * 2;
552
553		/*
554		 *	Wake up sending tasks if we
555		 *	upped the value.
556		 */
557		sk->sk_write_space(sk);
558		break;
559
560	case SO_SNDBUFFORCE:
561		if (!capable(CAP_NET_ADMIN)) {
562			ret = -EPERM;
563			break;
564		}
565		goto set_sndbuf;
566
567	case SO_RCVBUF:
568		/* Don't return an error on this; BSD doesn't, and if you
569		   think about it, this is right. Otherwise apps have to
570		   play 'guess the biggest size' games. RCVBUF/SNDBUF
571		   are treated in BSD as hints */
572
573		if (val > sysctl_rmem_max)
574			val = sysctl_rmem_max;
575set_rcvbuf:
576		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
577		/*
578		 * We double it on the way in to account for
579		 * "struct sk_buff" etc. overhead.   Applications
580		 * assume that the SO_RCVBUF setting they make will
581		 * allow that much actual data to be received on that
582		 * socket.
583		 *
584		 * Applications are unaware that "struct sk_buff" and
585		 * other overheads allocate from the receive buffer
586		 * during socket buffer allocation.
587		 *
588		 * And after considering the possible alternatives,
589		 * returning the value we actually used in getsockopt
590		 * is the most desirable behavior.
591		 */
592		if ((val * 2) < SOCK_MIN_RCVBUF)
593			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
594		else
595			sk->sk_rcvbuf = val * 2;
596		break;
597
598	case SO_RCVBUFFORCE:
599		if (!capable(CAP_NET_ADMIN)) {
600			ret = -EPERM;
601			break;
602		}
603		goto set_rcvbuf;
604
605	case SO_KEEPALIVE:
606#ifdef CONFIG_INET
607		if (sk->sk_protocol == IPPROTO_TCP)
608			tcp_set_keepalive(sk, valbool);
609#endif
610		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
611		break;
612
613	case SO_OOBINLINE:
614		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
615		break;
616
617	case SO_NO_CHECK:
618		sk->sk_no_check = valbool;
619		break;
620
621	case SO_PRIORITY:
622		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
623			sk->sk_priority = val;
624		else
625			ret = -EPERM;
626		break;
627
628	case SO_LINGER:
629		if (optlen < sizeof(ling)) {
630			ret = -EINVAL;	/* 1003.1g */
631			break;
632		}
633		if (copy_from_user(&ling, optval, sizeof(ling))) {
634			ret = -EFAULT;
635			break;
636		}
637		if (!ling.l_onoff)
638			sock_reset_flag(sk, SOCK_LINGER);
639		else {
640#if (BITS_PER_LONG == 32)
641			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
642				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
643			else
644#endif
645				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
646			sock_set_flag(sk, SOCK_LINGER);
647		}
648		break;
649
650	case SO_BSDCOMPAT:
651		sock_warn_obsolete_bsdism("setsockopt");
652		break;
653
654	case SO_PASSCRED:
655		if (valbool)
656			set_bit(SOCK_PASSCRED, &sock->flags);
657		else
658			clear_bit(SOCK_PASSCRED, &sock->flags);
659		break;
660
661	case SO_TIMESTAMP:
662	case SO_TIMESTAMPNS:
663		if (valbool)  {
664			if (optname == SO_TIMESTAMP)
665				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
666			else
667				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
668			sock_set_flag(sk, SOCK_RCVTSTAMP);
669			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
670		} else {
671			sock_reset_flag(sk, SOCK_RCVTSTAMP);
672			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
673		}
674		break;
675
676	case SO_TIMESTAMPING:
677		if (val & ~SOF_TIMESTAMPING_MASK) {
678			ret = -EINVAL;
679			break;
680		}
681		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
682				  val & SOF_TIMESTAMPING_TX_HARDWARE);
683		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
684				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
685		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
686				  val & SOF_TIMESTAMPING_RX_HARDWARE);
687		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
688			sock_enable_timestamp(sk,
689					      SOCK_TIMESTAMPING_RX_SOFTWARE);
690		else
691			sock_disable_timestamp(sk,
692					       SOCK_TIMESTAMPING_RX_SOFTWARE);
693		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
694				  val & SOF_TIMESTAMPING_SOFTWARE);
695		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
696				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
697		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
698				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
699		break;
700
701	case SO_RCVLOWAT:
702		if (val < 0)
703			val = INT_MAX;
704		sk->sk_rcvlowat = val ? : 1;
705		break;
706
707	case SO_RCVTIMEO:
708		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
709		break;
710
711	case SO_SNDTIMEO:
712		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
713		break;
714
715	case SO_ATTACH_FILTER:
716		ret = -EINVAL;
717		if (optlen == sizeof(struct sock_fprog)) {
718			struct sock_fprog fprog;
719
720			ret = -EFAULT;
721			if (copy_from_user(&fprog, optval, sizeof(fprog)))
722				break;
723
724			ret = sk_attach_filter(&fprog, sk);
725		}
726		break;
727
728	case SO_DETACH_FILTER:
729		ret = sk_detach_filter(sk);
730		break;
731
732	case SO_PASSSEC:
733		if (valbool)
734			set_bit(SOCK_PASSSEC, &sock->flags);
735		else
736			clear_bit(SOCK_PASSSEC, &sock->flags);
737		break;
738	case SO_MARK:
739		if (!capable(CAP_NET_ADMIN))
740			ret = -EPERM;
741		else
742			sk->sk_mark = val;
743		break;
744
745		/* We implement the SO_SNDLOWAT etc to
746		   not be settable (1003.1g 5.3) */
747	case SO_RXQ_OVFL:
748		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
749		break;
750
751	case SO_WIFI_STATUS:
752		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
753		break;
754
755	default:
756		ret = -ENOPROTOOPT;
757		break;
758	}
759	release_sock(sk);
760	return ret;
761}
762EXPORT_SYMBOL(sock_setsockopt);
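
/*
 * A rough user-space illustration of the SO_RCVBUF doubling performed
 * above: the requested value is first capped at rmem_max, and the
 * doubled result is never set below SOCK_MIN_RCVBUF, so reading the
 * option back returns twice what was asked for.
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * getsockopt() now reports out == 131072, i.e. 2 * 65536, assuming the
 * request is within /proc/sys/net/core/rmem_max.  The same doubling
 * applies to SO_SNDBUF via the set_sndbuf path above.
 */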
763
764
765void cred_to_ucred(struct pid *pid, const struct cred *cred,
766		   struct ucred *ucred)
767{
768	ucred->pid = pid_vnr(pid);
769	ucred->uid = ucred->gid = -1;
770	if (cred) {
771		struct user_namespace *current_ns = current_user_ns();
772
773		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
774		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
775	}
776}
777EXPORT_SYMBOL_GPL(cred_to_ucred);
778
779int sock_getsockopt(struct socket *sock, int level, int optname,
780		    char __user *optval, int __user *optlen)
781{
782	struct sock *sk = sock->sk;
783
784	union {
785		int val;
786		struct linger ling;
787		struct timeval tm;
788	} v;
789
790	int lv = sizeof(int);
791	int len;
792
793	if (get_user(len, optlen))
794		return -EFAULT;
795	if (len < 0)
796		return -EINVAL;
797
798	memset(&v, 0, sizeof(v));
799
800	switch (optname) {
801	case SO_DEBUG:
802		v.val = sock_flag(sk, SOCK_DBG);
803		break;
804
805	case SO_DONTROUTE:
806		v.val = sock_flag(sk, SOCK_LOCALROUTE);
807		break;
808
809	case SO_BROADCAST:
810		v.val = !!sock_flag(sk, SOCK_BROADCAST);
811		break;
812
813	case SO_SNDBUF:
814		v.val = sk->sk_sndbuf;
815		break;
816
817	case SO_RCVBUF:
818		v.val = sk->sk_rcvbuf;
819		break;
820
821	case SO_REUSEADDR:
822		v.val = sk->sk_reuse;
823		break;
824
825	case SO_KEEPALIVE:
826		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
827		break;
828
829	case SO_TYPE:
830		v.val = sk->sk_type;
831		break;
832
833	case SO_PROTOCOL:
834		v.val = sk->sk_protocol;
835		break;
836
837	case SO_DOMAIN:
838		v.val = sk->sk_family;
839		break;
840
841	case SO_ERROR:
842		v.val = -sock_error(sk);
843		if (v.val == 0)
844			v.val = xchg(&sk->sk_err_soft, 0);
845		break;
846
847	case SO_OOBINLINE:
848		v.val = !!sock_flag(sk, SOCK_URGINLINE);
849		break;
850
851	case SO_NO_CHECK:
852		v.val = sk->sk_no_check;
853		break;
854
855	case SO_PRIORITY:
856		v.val = sk->sk_priority;
857		break;
858
859	case SO_LINGER:
860		lv		= sizeof(v.ling);
861		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
862		v.ling.l_linger	= sk->sk_lingertime / HZ;
863		break;
864
865	case SO_BSDCOMPAT:
866		sock_warn_obsolete_bsdism("getsockopt");
867		break;
868
869	case SO_TIMESTAMP:
870		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
871				!sock_flag(sk, SOCK_RCVTSTAMPNS);
872		break;
873
874	case SO_TIMESTAMPNS:
875		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
876		break;
877
878	case SO_TIMESTAMPING:
879		v.val = 0;
880		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
881			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
882		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
883			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
884		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
885			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
886		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
887			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
888		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
889			v.val |= SOF_TIMESTAMPING_SOFTWARE;
890		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
891			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
892		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
893			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
894		break;
895
896	case SO_RCVTIMEO:
897		lv = sizeof(struct timeval);
898		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
899			v.tm.tv_sec = 0;
900			v.tm.tv_usec = 0;
901		} else {
902			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
903			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
904		}
905		break;
906
907	case SO_SNDTIMEO:
908		lv = sizeof(struct timeval);
909		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
910			v.tm.tv_sec = 0;
911			v.tm.tv_usec = 0;
912		} else {
913			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
914			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
915		}
916		break;
917
918	case SO_RCVLOWAT:
919		v.val = sk->sk_rcvlowat;
920		break;
921
922	case SO_SNDLOWAT:
923		v.val = 1;
924		break;
925
926	case SO_PASSCRED:
927		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
928		break;
929
930	case SO_PEERCRED:
931	{
932		struct ucred peercred;
933		if (len > sizeof(peercred))
934			len = sizeof(peercred);
935		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
936		if (copy_to_user(optval, &peercred, len))
937			return -EFAULT;
938		goto lenout;
939	}
940
941	case SO_PEERNAME:
942	{
943		char address[128];
944
945		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
946			return -ENOTCONN;
947		if (lv < len)
948			return -EINVAL;
949		if (copy_to_user(optval, address, len))
950			return -EFAULT;
951		goto lenout;
952	}
953
954	/* Dubious BSD thing... Probably nobody even uses it, but
955	 * the UNIX standard wants it for whatever reason... -DaveM
956	 */
957	case SO_ACCEPTCONN:
958		v.val = sk->sk_state == TCP_LISTEN;
959		break;
960
961	case SO_PASSSEC:
962		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
963		break;
964
965	case SO_PEERSEC:
966		return security_socket_getpeersec_stream(sock, optval, optlen, len);
967
968	case SO_MARK:
969		v.val = sk->sk_mark;
970		break;
971
972	case SO_RXQ_OVFL:
973		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
974		break;
975
976	case SO_WIFI_STATUS:
977		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
978		break;
979
980	default:
981		return -ENOPROTOOPT;
982	}
983
984	if (len > lv)
985		len = lv;
986	if (copy_to_user(optval, &v, len))
987		return -EFAULT;
988lenout:
989	if (put_user(len, optlen))
990		return -EFAULT;
991	return 0;
992}
993
994/*
995 * Initialize an sk_lock.
996 *
997 * (We also register the sk_lock with the lock validator.)
998 */
999static inline void sock_lock_init(struct sock *sk)
1000{
1001	sock_lock_init_class_and_name(sk,
1002			af_family_slock_key_strings[sk->sk_family],
1003			af_family_slock_keys + sk->sk_family,
1004			af_family_key_strings[sk->sk_family],
1005			af_family_keys + sk->sk_family);
1006}
1007
1008/*
1009 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1010 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1011 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1012 */
1013static void sock_copy(struct sock *nsk, const struct sock *osk)
1014{
1015#ifdef CONFIG_SECURITY_NETWORK
1016	void *sptr = nsk->sk_security;
1017#endif
1018	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1019
1020	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1021	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1022
1023#ifdef CONFIG_SECURITY_NETWORK
1024	nsk->sk_security = sptr;
1025	security_sk_clone(osk, nsk);
1026#endif
1027}
1028
1029/*
1030 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1031 * nodes unmodified. Special care is taken when initializing the object to zero.
1032 */
1033static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1034{
1035	if (offsetof(struct sock, sk_node.next) != 0)
1036		memset(sk, 0, offsetof(struct sock, sk_node.next));
1037	memset(&sk->sk_node.pprev, 0,
1038	       size - offsetof(struct sock, sk_node.pprev));
1039}
1040
1041void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1042{
1043	unsigned long nulls1, nulls2;
1044
1045	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1046	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1047	if (nulls1 > nulls2)
1048		swap(nulls1, nulls2);
1049
1050	if (nulls1 != 0)
1051		memset((char *)sk, 0, nulls1);
1052	memset((char *)sk + nulls1 + sizeof(void *), 0,
1053	       nulls2 - nulls1 - sizeof(void *));
1054	memset((char *)sk + nulls2 + sizeof(void *), 0,
1055	       size - nulls2 - sizeof(void *));
1056}
1057EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1058
1059static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1060		int family)
1061{
1062	struct sock *sk;
1063	struct kmem_cache *slab;
1064
1065	slab = prot->slab;
1066	if (slab != NULL) {
1067		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1068		if (!sk)
1069			return sk;
1070		if (priority & __GFP_ZERO) {
1071			if (prot->clear_sk)
1072				prot->clear_sk(sk, prot->obj_size);
1073			else
1074				sk_prot_clear_nulls(sk, prot->obj_size);
1075		}
1076	} else
1077		sk = kmalloc(prot->obj_size, priority);
1078
1079	if (sk != NULL) {
1080		kmemcheck_annotate_bitfield(sk, flags);
1081
1082		if (security_sk_alloc(sk, family, priority))
1083			goto out_free;
1084
1085		if (!try_module_get(prot->owner))
1086			goto out_free_sec;
1087		sk_tx_queue_clear(sk);
1088	}
1089
1090	return sk;
1091
1092out_free_sec:
1093	security_sk_free(sk);
1094out_free:
1095	if (slab != NULL)
1096		kmem_cache_free(slab, sk);
1097	else
1098		kfree(sk);
1099	return NULL;
1100}
1101
1102static void sk_prot_free(struct proto *prot, struct sock *sk)
1103{
1104	struct kmem_cache *slab;
1105	struct module *owner;
1106
1107	owner = prot->owner;
1108	slab = prot->slab;
1109
1110	security_sk_free(sk);
1111	if (slab != NULL)
1112		kmem_cache_free(slab, sk);
1113	else
1114		kfree(sk);
1115	module_put(owner);
1116}
1117
1118#ifdef CONFIG_CGROUPS
1119void sock_update_classid(struct sock *sk)
1120{
1121	u32 classid;
1122
1123	rcu_read_lock();  /* doing current task, which cannot vanish. */
1124	classid = task_cls_classid(current);
1125	rcu_read_unlock();
1126	if (classid && classid != sk->sk_classid)
1127		sk->sk_classid = classid;
1128}
1129EXPORT_SYMBOL(sock_update_classid);
1130
1131void sock_update_netprioidx(struct sock *sk)
1132{
1133	struct cgroup_netprio_state *state;
1134	if (in_interrupt())
1135		return;
1136	rcu_read_lock();
1137	state = task_netprio_state(current);
1138	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
1139	rcu_read_unlock();
1140}
1141EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1142#endif
1143
1144/**
1145 *	sk_alloc - All socket objects are allocated here
1146 *	@net: the applicable net namespace
1147 *	@family: protocol family
1148 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1149 *	@prot: struct proto associated with this new sock instance
1150 */
1151struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1152		      struct proto *prot)
1153{
1154	struct sock *sk;
1155
1156	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1157	if (sk) {
1158		sk->sk_family = family;
1159		/*
1160		 * See comment in struct sock definition to understand
1161		 * why we need sk_prot_creator -acme
1162		 */
1163		sk->sk_prot = sk->sk_prot_creator = prot;
1164		sock_lock_init(sk);
1165		sock_net_set(sk, get_net(net));
1166		atomic_set(&sk->sk_wmem_alloc, 1);
1167
1168		sock_update_classid(sk);
1169		sock_update_netprioidx(sk);
1170	}
1171
1172	return sk;
1173}
1174EXPORT_SYMBOL(sk_alloc);
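
/*
 * A rough usage sketch (PF_MYPROTO, my_proto, my_proto_ops and
 * my_sock_destruct are hypothetical): a protocol family's create()
 * handler normally pairs sk_alloc() with sock_init_data() and then
 * installs its own callbacks:
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &my_proto);
 *		if (sk == NULL)
 *			return -ENOMEM;
 *
 *		sock->ops = &my_proto_ops;
 *		sock_init_data(sock, sk);
 *		sk->sk_destruct = my_sock_destruct;
 *		return 0;
 *	}
 */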
1175
1176static void __sk_free(struct sock *sk)
1177{
1178	struct sk_filter *filter;
1179
1180	if (sk->sk_destruct)
1181		sk->sk_destruct(sk);
1182
1183	filter = rcu_dereference_check(sk->sk_filter,
1184				       atomic_read(&sk->sk_wmem_alloc) == 0);
1185	if (filter) {
1186		sk_filter_uncharge(sk, filter);
1187		RCU_INIT_POINTER(sk->sk_filter, NULL);
1188	}
1189
1190	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1191	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1192
1193	if (atomic_read(&sk->sk_omem_alloc))
1194		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1195		       __func__, atomic_read(&sk->sk_omem_alloc));
1196
1197	if (sk->sk_peer_cred)
1198		put_cred(sk->sk_peer_cred);
1199	put_pid(sk->sk_peer_pid);
1200	put_net(sock_net(sk));
1201	sk_prot_free(sk->sk_prot_creator, sk);
1202}
1203
1204void sk_free(struct sock *sk)
1205{
1206	/*
1207	 * We subtract one from sk_wmem_alloc and can tell whether
1208	 * some packets are still in some tx queue.
1209	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1210	 */
1211	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1212		__sk_free(sk);
1213}
1214EXPORT_SYMBOL(sk_free);
1215
1216/*
1217 * The last sock_put should drop a reference to sk->sk_net. It has already
1218 * been dropped in sk_change_net. Taking a reference to a stopping namespace
1219 * is not an option.
1220 * Take a reference to the socket to remove it from the hash while it is still
1221 * _alive_, and after that destroy it in the context of init_net.
1222 */
1223void sk_release_kernel(struct sock *sk)
1224{
1225	if (sk == NULL || sk->sk_socket == NULL)
1226		return;
1227
1228	sock_hold(sk);
1229	sock_release(sk->sk_socket);
1230	release_net(sock_net(sk));
1231	sock_net_set(sk, get_net(&init_net));
1232	sock_put(sk);
1233}
1234EXPORT_SYMBOL(sk_release_kernel);
1235
1236/**
1237 *	sk_clone_lock - clone a socket, and lock its clone
1238 *	@sk: the socket to clone
1239 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1240 *
1241 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1242 */
1243struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1244{
1245	struct sock *newsk;
1246
1247	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1248	if (newsk != NULL) {
1249		struct sk_filter *filter;
1250
1251		sock_copy(newsk, sk);
1252
1253		/* SANITY */
1254		get_net(sock_net(newsk));
1255		sk_node_init(&newsk->sk_node);
1256		sock_lock_init(newsk);
1257		bh_lock_sock(newsk);
1258		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1259		newsk->sk_backlog.len = 0;
1260
1261		atomic_set(&newsk->sk_rmem_alloc, 0);
1262		/*
1263		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1264		 */
1265		atomic_set(&newsk->sk_wmem_alloc, 1);
1266		atomic_set(&newsk->sk_omem_alloc, 0);
1267		skb_queue_head_init(&newsk->sk_receive_queue);
1268		skb_queue_head_init(&newsk->sk_write_queue);
1269#ifdef CONFIG_NET_DMA
1270		skb_queue_head_init(&newsk->sk_async_wait_queue);
1271#endif
1272
1273		spin_lock_init(&newsk->sk_dst_lock);
1274		rwlock_init(&newsk->sk_callback_lock);
1275		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1276				af_callback_keys + newsk->sk_family,
1277				af_family_clock_key_strings[newsk->sk_family]);
1278
1279		newsk->sk_dst_cache	= NULL;
1280		newsk->sk_wmem_queued	= 0;
1281		newsk->sk_forward_alloc = 0;
1282		newsk->sk_send_head	= NULL;
1283		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1284
1285		sock_reset_flag(newsk, SOCK_DONE);
1286		skb_queue_head_init(&newsk->sk_error_queue);
1287
1288		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1289		if (filter != NULL)
1290			sk_filter_charge(newsk, filter);
1291
1292		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1293			/* It is still a raw copy of the parent, so invalidate
1294			 * its destructor and do a plain sk_free() */
1295			newsk->sk_destruct = NULL;
1296			bh_unlock_sock(newsk);
1297			sk_free(newsk);
1298			newsk = NULL;
1299			goto out;
1300		}
1301
1302		newsk->sk_err	   = 0;
1303		newsk->sk_priority = 0;
1304		/*
1305		 * Before updating sk_refcnt, we must commit prior changes to memory
1306		 * (Documentation/RCU/rculist_nulls.txt for details)
1307		 */
1308		smp_wmb();
1309		atomic_set(&newsk->sk_refcnt, 2);
1310
1311		/*
1312		 * Increment the counter in the same struct proto as the master
1313		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1314		 * is the same as sk->sk_prot->socks, as this field was copied
1315		 * with memcpy).
1316		 *
1317		 * This _changes_ the previous behaviour, where
1318		 * tcp_create_openreq_child always incremented the
1319		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1320		 * to be taken into account in all callers. -acme
1321		 */
1322		sk_refcnt_debug_inc(newsk);
1323		sk_set_socket(newsk, NULL);
1324		newsk->sk_wq = NULL;
1325
1326		if (newsk->sk_prot->sockets_allocated)
1327			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1328
1329		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1330		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1331			net_enable_timestamp();
1332	}
1333out:
1334	return newsk;
1335}
1336EXPORT_SYMBOL_GPL(sk_clone_lock);
1337
1338void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1339{
1340	__sk_dst_set(sk, dst);
1341	sk->sk_route_caps = dst->dev->features;
1342	if (sk->sk_route_caps & NETIF_F_GSO)
1343		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1344	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1345	if (sk_can_gso(sk)) {
1346		if (dst->header_len) {
1347			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1348		} else {
1349			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1350			sk->sk_gso_max_size = dst->dev->gso_max_size;
1351		}
1352	}
1353}
1354EXPORT_SYMBOL_GPL(sk_setup_caps);
1355
1356void __init sk_init(void)
1357{
1358	if (totalram_pages <= 4096) {
1359		sysctl_wmem_max = 32767;
1360		sysctl_rmem_max = 32767;
1361		sysctl_wmem_default = 32767;
1362		sysctl_rmem_default = 32767;
1363	} else if (totalram_pages >= 131072) {
1364		sysctl_wmem_max = 131071;
1365		sysctl_rmem_max = 131071;
1366	}
1367}
1368
1369/*
1370 *	Simple resource managers for sockets.
1371 */
1372
1373
1374/*
1375 * Write buffer destructor automatically called from kfree_skb.
1376 */
1377void sock_wfree(struct sk_buff *skb)
1378{
1379	struct sock *sk = skb->sk;
1380	unsigned int len = skb->truesize;
1381
1382	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1383		/*
1384		 * Keep a reference on sk_wmem_alloc; it will be released
1385		 * after the sk_write_space() call
1386		 */
1387		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1388		sk->sk_write_space(sk);
1389		len = 1;
1390	}
1391	/*
1392	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1393	 * could not do because of in-flight packets
1394	 */
1395	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1396		__sk_free(sk);
1397}
1398EXPORT_SYMBOL(sock_wfree);
1399
1400/*
1401 * Read buffer destructor automatically called from kfree_skb.
1402 */
1403void sock_rfree(struct sk_buff *skb)
1404{
1405	struct sock *sk = skb->sk;
1406	unsigned int len = skb->truesize;
1407
1408	atomic_sub(len, &sk->sk_rmem_alloc);
1409	sk_mem_uncharge(sk, len);
1410}
1411EXPORT_SYMBOL(sock_rfree);
1412
1413
1414int sock_i_uid(struct sock *sk)
1415{
1416	int uid;
1417
1418	read_lock_bh(&sk->sk_callback_lock);
1419	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1420	read_unlock_bh(&sk->sk_callback_lock);
1421	return uid;
1422}
1423EXPORT_SYMBOL(sock_i_uid);
1424
1425unsigned long sock_i_ino(struct sock *sk)
1426{
1427	unsigned long ino;
1428
1429	read_lock_bh(&sk->sk_callback_lock);
1430	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1431	read_unlock_bh(&sk->sk_callback_lock);
1432	return ino;
1433}
1434EXPORT_SYMBOL(sock_i_ino);
1435
1436/*
1437 * Allocate a skb from the socket's send buffer.
1438 */
1439struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1440			     gfp_t priority)
1441{
1442	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1443		struct sk_buff *skb = alloc_skb(size, priority);
1444		if (skb) {
1445			skb_set_owner_w(skb, sk);
1446			return skb;
1447		}
1448	}
1449	return NULL;
1450}
1451EXPORT_SYMBOL(sock_wmalloc);
1452
1453/*
1454 * Allocate a skb from the socket's receive buffer.
1455 */
1456struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1457			     gfp_t priority)
1458{
1459	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1460		struct sk_buff *skb = alloc_skb(size, priority);
1461		if (skb) {
1462			skb_set_owner_r(skb, sk);
1463			return skb;
1464		}
1465	}
1466	return NULL;
1467}
1468
1469/*
1470 * Allocate a memory block from the socket's option memory buffer.
1471 */
1472void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1473{
1474	if ((unsigned)size <= sysctl_optmem_max &&
1475	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1476		void *mem;
1477		/* First do the add, to avoid the race if kmalloc
1478		 * might sleep.
1479		 */
1480		atomic_add(size, &sk->sk_omem_alloc);
1481		mem = kmalloc(size, priority);
1482		if (mem)
1483			return mem;
1484		atomic_sub(size, &sk->sk_omem_alloc);
1485	}
1486	return NULL;
1487}
1488EXPORT_SYMBOL(sock_kmalloc);
1489
1490/*
1491 * Free an option memory block.
1492 */
1493void sock_kfree_s(struct sock *sk, void *mem, int size)
1494{
1495	kfree(mem);
1496	atomic_sub(size, &sk->sk_omem_alloc);
1497}
1498EXPORT_SYMBOL(sock_kfree_s);
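
/*
 * A rough usage sketch (struct my_opt is hypothetical): option data is
 * allocated and freed in matching pairs so that sk_omem_alloc stays
 * balanced; the size passed to sock_kfree_s() must equal the size that
 * was passed to sock_kmalloc().
 *
 *	struct my_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */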
1499
1500/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1501   I think these locks should be removed for datagram sockets.
1502 */
1503static long sock_wait_for_wmem(struct sock *sk, long timeo)
1504{
1505	DEFINE_WAIT(wait);
1506
1507	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1508	for (;;) {
1509		if (!timeo)
1510			break;
1511		if (signal_pending(current))
1512			break;
1513		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1514		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1515		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1516			break;
1517		if (sk->sk_shutdown & SEND_SHUTDOWN)
1518			break;
1519		if (sk->sk_err)
1520			break;
1521		timeo = schedule_timeout(timeo);
1522	}
1523	finish_wait(sk_sleep(sk), &wait);
1524	return timeo;
1525}
1526
1527
1528/*
1529 *	Generic send/receive buffer handlers
1530 */
1531
1532struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1533				     unsigned long data_len, int noblock,
1534				     int *errcode)
1535{
1536	struct sk_buff *skb;
1537	gfp_t gfp_mask;
1538	long timeo;
1539	int err;
1540
1541	gfp_mask = sk->sk_allocation;
1542	if (gfp_mask & __GFP_WAIT)
1543		gfp_mask |= __GFP_REPEAT;
1544
1545	timeo = sock_sndtimeo(sk, noblock);
1546	while (1) {
1547		err = sock_error(sk);
1548		if (err != 0)
1549			goto failure;
1550
1551		err = -EPIPE;
1552		if (sk->sk_shutdown & SEND_SHUTDOWN)
1553			goto failure;
1554
1555		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1556			skb = alloc_skb(header_len, gfp_mask);
1557			if (skb) {
1558				int npages;
1559				int i;
1560
1561				/* No pages, we're done... */
1562				if (!data_len)
1563					break;
1564
1565				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1566				skb->truesize += data_len;
1567				skb_shinfo(skb)->nr_frags = npages;
1568				for (i = 0; i < npages; i++) {
1569					struct page *page;
1570
1571					page = alloc_pages(sk->sk_allocation, 0);
1572					if (!page) {
1573						err = -ENOBUFS;
1574						skb_shinfo(skb)->nr_frags = i;
1575						kfree_skb(skb);
1576						goto failure;
1577					}
1578
1579					__skb_fill_page_desc(skb, i,
1580							page, 0,
1581							(data_len >= PAGE_SIZE ?
1582							 PAGE_SIZE :
1583							 data_len));
1584					data_len -= PAGE_SIZE;
1585				}
1586
1587				/* Full success... */
1588				break;
1589			}
1590			err = -ENOBUFS;
1591			goto failure;
1592		}
1593		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1594		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1595		err = -EAGAIN;
1596		if (!timeo)
1597			goto failure;
1598		if (signal_pending(current))
1599			goto interrupted;
1600		timeo = sock_wait_for_wmem(sk, timeo);
1601	}
1602
1603	skb_set_owner_w(skb, sk);
1604	return skb;
1605
1606interrupted:
1607	err = sock_intr_errno(timeo);
1608failure:
1609	*errcode = err;
1610	return NULL;
1611}
1612EXPORT_SYMBOL(sock_alloc_send_pskb);
1613
1614struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1615				    int noblock, int *errcode)
1616{
1617	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1618}
1619EXPORT_SYMBOL(sock_alloc_send_skb);
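
/*
 * A rough usage sketch (my_header_len is hypothetical): a datagram
 * protocol's sendmsg() typically allocates its buffer here so that
 * blocking, send-buffer accounting and error handling are dealt with
 * in one place:
 *
 *	skb = sock_alloc_send_skb(sk, len + my_header_len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		return err;
 *
 *	skb_reserve(skb, my_header_len);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 *	if (err < 0) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 */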
1620
1621static void __lock_sock(struct sock *sk)
1622	__releases(&sk->sk_lock.slock)
1623	__acquires(&sk->sk_lock.slock)
1624{
1625	DEFINE_WAIT(wait);
1626
1627	for (;;) {
1628		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1629					TASK_UNINTERRUPTIBLE);
1630		spin_unlock_bh(&sk->sk_lock.slock);
1631		schedule();
1632		spin_lock_bh(&sk->sk_lock.slock);
1633		if (!sock_owned_by_user(sk))
1634			break;
1635	}
1636	finish_wait(&sk->sk_lock.wq, &wait);
1637}
1638
1639static void __release_sock(struct sock *sk)
1640	__releases(&sk->sk_lock.slock)
1641	__acquires(&sk->sk_lock.slock)
1642{
1643	struct sk_buff *skb = sk->sk_backlog.head;
1644
1645	do {
1646		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1647		bh_unlock_sock(sk);
1648
1649		do {
1650			struct sk_buff *next = skb->next;
1651
1652			WARN_ON_ONCE(skb_dst_is_noref(skb));
1653			skb->next = NULL;
1654			sk_backlog_rcv(sk, skb);
1655
1656			/*
1657			 * We are in process context here with softirqs
1658			 * disabled; use cond_resched_softirq() to preempt.
1659			 * This is safe to do because we've taken the backlog
1660			 * queue private:
1661			 */
1662			cond_resched_softirq();
1663
1664			skb = next;
1665		} while (skb != NULL);
1666
1667		bh_lock_sock(sk);
1668	} while ((skb = sk->sk_backlog.head) != NULL);
1669
1670	/*
1671	 * Doing the zeroing here guarantees we cannot loop forever
1672	 * while a wild producer attempts to flood us.
1673	 */
1674	sk->sk_backlog.len = 0;
1675}
1676
1677/**
1678 * sk_wait_data - wait for data to arrive at sk_receive_queue
1679 * @sk:    sock to wait on
1680 * @timeo: for how long
1681 *
1682 * Now socket state including sk->sk_err is changed only under the lock,
1683 * hence we may omit checks after joining the wait queue.
1684 * We check the receive queue before schedule() only as an optimization;
1685 * it is very likely that release_sock() added new data.
1686 */
1687int sk_wait_data(struct sock *sk, long *timeo)
1688{
1689	int rc;
1690	DEFINE_WAIT(wait);
1691
1692	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1693	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1694	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1695	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1696	finish_wait(sk_sleep(sk), &wait);
1697	return rc;
1698}
1699EXPORT_SYMBOL(sk_wait_data);
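
/*
 * A rough usage sketch: a blocking recvmsg() implementation calls
 * sk_wait_data() with the socket lock held; the helper drops and
 * re-takes the lock around the actual sleep (via sk_wait_event()),
 * so the receive queue must be re-checked on every iteration:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	lock_sock(sk);
 *	while ((skb = skb_peek(&sk->sk_receive_queue)) == NULL) {
 *		err = -EAGAIN;
 *		if (!timeo)
 *			goto out;
 *		err = sock_intr_errno(timeo);
 *		if (signal_pending(current))
 *			goto out;
 *		sk_wait_data(sk, &timeo);
 *	}
 */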
1700
1701/**
1702 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1703 *	@sk: socket
1704 *	@size: memory size to allocate
1705 *	@kind: allocation type
1706 *
1707 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1708 *	rmem allocation. This function assumes that protocols which have
1709 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1710 */
1711int __sk_mem_schedule(struct sock *sk, int size, int kind)
1712{
1713	struct proto *prot = sk->sk_prot;
1714	int amt = sk_mem_pages(size);
1715	long allocated;
1716
1717	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1718	allocated = atomic_long_add_return(amt, prot->memory_allocated);
1719
1720	/* Under limit. */
1721	if (allocated <= prot->sysctl_mem[0]) {
1722		if (prot->memory_pressure && *prot->memory_pressure)
1723			*prot->memory_pressure = 0;
1724		return 1;
1725	}
1726
1727	/* Under pressure. */
1728	if (allocated > prot->sysctl_mem[1])
1729		if (prot->enter_memory_pressure)
1730			prot->enter_memory_pressure(sk);
1731
1732	/* Over hard limit. */
1733	if (allocated > prot->sysctl_mem[2])
1734		goto suppress_allocation;
1735
1736	/* guarantee minimum buffer size under pressure */
1737	if (kind == SK_MEM_RECV) {
1738		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1739			return 1;
1740	} else { /* SK_MEM_SEND */
1741		if (sk->sk_type == SOCK_STREAM) {
1742			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1743				return 1;
1744		} else if (atomic_read(&sk->sk_wmem_alloc) <
1745			   prot->sysctl_wmem[0])
1746				return 1;
1747	}
1748
1749	if (prot->memory_pressure) {
1750		int alloc;
1751
1752		if (!*prot->memory_pressure)
1753			return 1;
1754		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1755		if (prot->sysctl_mem[2] > alloc *
1756		    sk_mem_pages(sk->sk_wmem_queued +
1757				 atomic_read(&sk->sk_rmem_alloc) +
1758				 sk->sk_forward_alloc))
1759			return 1;
1760	}
1761
1762suppress_allocation:
1763
1764	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1765		sk_stream_moderate_sndbuf(sk);
1766
1767		/* Fail only if socket is _under_ its sndbuf.
1768		 * In this case we cannot block, so we have to fail.
1769		 */
1770		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1771			return 1;
1772	}
1773
1774	trace_sock_exceed_buf_limit(sk, prot, allocated);
1775
1776	/* Alas. Undo changes. */
1777	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1778	atomic_long_sub(amt, prot->memory_allocated);
1779	return 0;
1780}
1781EXPORT_SYMBOL(__sk_mem_schedule);
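
/*
 * For illustration: protocols normally reach this function through the
 * sk_wmem_schedule()/sk_rmem_schedule() helpers in net/sock.h, which
 * only fall through to __sk_mem_schedule() once a request no longer
 * fits within the already-reserved sk_forward_alloc, roughly:
 *
 *	if (size <= sk->sk_forward_alloc)
 *		return 1;
 *	return __sk_mem_schedule(sk, size, SK_MEM_RECV);
 */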
1782
1783/**
1784 *	__sk_mem_reclaim - reclaim memory_allocated
1785 *	@sk: socket
1786 */
1787void __sk_mem_reclaim(struct sock *sk)
1788{
1789	struct proto *prot = sk->sk_prot;
1790
1791	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1792		   prot->memory_allocated);
1793	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1794
1795	if (prot->memory_pressure && *prot->memory_pressure &&
1796	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1797		*prot->memory_pressure = 0;
1798}
1799EXPORT_SYMBOL(__sk_mem_reclaim);
1800
1801
1802/*
1803 * Set of default routines for initialising struct proto_ops when
1804 * the protocol does not support a particular function. In certain
1805 * cases where it makes no sense for a protocol to have a "do nothing"
1806 * function, some default processing is provided.
1807 */
1808
1809int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1810{
1811	return -EOPNOTSUPP;
1812}
1813EXPORT_SYMBOL(sock_no_bind);
1814
1815int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1816		    int len, int flags)
1817{
1818	return -EOPNOTSUPP;
1819}
1820EXPORT_SYMBOL(sock_no_connect);
1821
1822int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1823{
1824	return -EOPNOTSUPP;
1825}
1826EXPORT_SYMBOL(sock_no_socketpair);
1827
1828int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1829{
1830	return -EOPNOTSUPP;
1831}
1832EXPORT_SYMBOL(sock_no_accept);
1833
1834int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1835		    int *len, int peer)
1836{
1837	return -EOPNOTSUPP;
1838}
1839EXPORT_SYMBOL(sock_no_getname);
1840
1841unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1842{
1843	return 0;
1844}
1845EXPORT_SYMBOL(sock_no_poll);
1846
1847int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1848{
1849	return -EOPNOTSUPP;
1850}
1851EXPORT_SYMBOL(sock_no_ioctl);
1852
1853int sock_no_listen(struct socket *sock, int backlog)
1854{
1855	return -EOPNOTSUPP;
1856}
1857EXPORT_SYMBOL(sock_no_listen);
1858
1859int sock_no_shutdown(struct socket *sock, int how)
1860{
1861	return -EOPNOTSUPP;
1862}
1863EXPORT_SYMBOL(sock_no_shutdown);
1864
1865int sock_no_setsockopt(struct socket *sock, int level, int optname,
1866		    char __user *optval, unsigned int optlen)
1867{
1868	return -EOPNOTSUPP;
1869}
1870EXPORT_SYMBOL(sock_no_setsockopt);
1871
1872int sock_no_getsockopt(struct socket *sock, int level, int optname,
1873		    char __user *optval, int __user *optlen)
1874{
1875	return -EOPNOTSUPP;
1876}
1877EXPORT_SYMBOL(sock_no_getsockopt);
1878
1879int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1880		    size_t len)
1881{
1882	return -EOPNOTSUPP;
1883}
1884EXPORT_SYMBOL(sock_no_sendmsg);
1885
1886int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1887		    size_t len, int flags)
1888{
1889	return -EOPNOTSUPP;
1890}
1891EXPORT_SYMBOL(sock_no_recvmsg);
1892
1893int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1894{
1895	/* Mirror missing mmap method error code */
1896	return -ENODEV;
1897}
1898EXPORT_SYMBOL(sock_no_mmap);
1899
1900ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1901{
1902	ssize_t res;
1903	struct msghdr msg = {.msg_flags = flags};
1904	struct kvec iov;
1905	char *kaddr = kmap(page);
1906	iov.iov_base = kaddr + offset;
1907	iov.iov_len = size;
1908	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1909	kunmap(page);
1910	return res;
1911}
1912EXPORT_SYMBOL(sock_no_sendpage);
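
/*
 * A rough usage sketch (the my_* names are hypothetical): a protocol
 * that supports only a subset of the socket calls wires the remaining
 * proto_ops slots to these sock_no_*() stubs instead of leaving them
 * NULL:
 *
 *	static const struct proto_ops my_proto_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.release	= my_release,
 *		.bind		= my_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= my_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= my_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */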
1913
1914/*
1915 *	Default Socket Callbacks
1916 */
1917
1918static void sock_def_wakeup(struct sock *sk)
1919{
1920	struct socket_wq *wq;
1921
1922	rcu_read_lock();
1923	wq = rcu_dereference(sk->sk_wq);
1924	if (wq_has_sleeper(wq))
1925		wake_up_interruptible_all(&wq->wait);
1926	rcu_read_unlock();
1927}
1928
1929static void sock_def_error_report(struct sock *sk)
1930{
1931	struct socket_wq *wq;
1932
1933	rcu_read_lock();
1934	wq = rcu_dereference(sk->sk_wq);
1935	if (wq_has_sleeper(wq))
1936		wake_up_interruptible_poll(&wq->wait, POLLERR);
1937	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1938	rcu_read_unlock();
1939}
1940
1941static void sock_def_readable(struct sock *sk, int len)
1942{
1943	struct socket_wq *wq;
1944
1945	rcu_read_lock();
1946	wq = rcu_dereference(sk->sk_wq);
1947	if (wq_has_sleeper(wq))
1948		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1949						POLLRDNORM | POLLRDBAND);
1950	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1951	rcu_read_unlock();
1952}
1953
1954static void sock_def_write_space(struct sock *sk)
1955{
1956	struct socket_wq *wq;
1957
1958	rcu_read_lock();
1959
1960	/* Do not wake up a writer until he can make "significant"
1961	 * progress.  --DaveM
1962	 */
1963	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1964		wq = rcu_dereference(sk->sk_wq);
1965		if (wq_has_sleeper(wq))
1966			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1967						POLLWRNORM | POLLWRBAND);
1968
1969		/* Should agree with poll, otherwise some programs break */
1970		if (sock_writeable(sk))
1971			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1972	}
1973
1974	rcu_read_unlock();
1975}
1976
1977static void sock_def_destruct(struct sock *sk)
1978{
1979	kfree(sk->sk_protinfo);
1980}
1981
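/*
 * sk_send_sigurg - tell the socket's owner that urgent data arrived: deliver
 * SIGURG through the owning file's f_owner and raise a POLL_PRI async event.
 */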
1982void sk_send_sigurg(struct sock *sk)
1983{
1984	if (sk->sk_socket && sk->sk_socket->file)
1985		if (send_sigurg(&sk->sk_socket->file->f_owner))
1986			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1987}
1988EXPORT_SYMBOL(sk_send_sigurg);
1989
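/*
 * Socket timer helpers: sk_reset_timer() (re)arms a timer and takes a
 * reference on the sock if the timer was not already pending, while
 * sk_stop_timer() drops that reference when it actually deletes a pending
 * timer. This keeps the sock alive for as long as its timer may still fire.
 * Typical protocol usage (illustrative sketch; the expiry value is made up):
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */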
1990void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1991		    unsigned long expires)
1992{
1993	if (!mod_timer(timer, expires))
1994		sock_hold(sk);
1995}
1996EXPORT_SYMBOL(sk_reset_timer);
1997
1998void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1999{
2000	if (timer_pending(timer) && del_timer(timer))
2001		__sock_put(sk);
2002}
2003EXPORT_SYMBOL(sk_stop_timer);
2004
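/**
 * sock_init_data - initialise the generic fields of a struct sock
 * @sock: struct socket to attach the sock to, or NULL
 * @sk: the sock being initialised
 *
 * Sets up the receive/write/error queues, pulls the default buffer sizes from
 * the rmem/wmem sysctls, installs the sock_def_*() callbacks, sets infinite
 * send/receive timeouts and finally sets the reference count to one. Protocol
 * families call this from their socket creation path before filling in
 * protocol-specific state.
 */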
2005void sock_init_data(struct socket *sock, struct sock *sk)
2006{
2007	skb_queue_head_init(&sk->sk_receive_queue);
2008	skb_queue_head_init(&sk->sk_write_queue);
2009	skb_queue_head_init(&sk->sk_error_queue);
2010#ifdef CONFIG_NET_DMA
2011	skb_queue_head_init(&sk->sk_async_wait_queue);
2012#endif
2013
2014	sk->sk_send_head	=	NULL;
2015
2016	init_timer(&sk->sk_timer);
2017
2018	sk->sk_allocation	=	GFP_KERNEL;
2019	sk->sk_rcvbuf		=	sysctl_rmem_default;
2020	sk->sk_sndbuf		=	sysctl_wmem_default;
2021	sk->sk_state		=	TCP_CLOSE;
2022	sk_set_socket(sk, sock);
2023
2024	sock_set_flag(sk, SOCK_ZAPPED);
2025
2026	if (sock) {
2027		sk->sk_type	=	sock->type;
2028		sk->sk_wq	=	sock->wq;
2029		sock->sk	=	sk;
2030	} else
2031		sk->sk_wq	=	NULL;
2032
2033	spin_lock_init(&sk->sk_dst_lock);
2034	rwlock_init(&sk->sk_callback_lock);
2035	lockdep_set_class_and_name(&sk->sk_callback_lock,
2036			af_callback_keys + sk->sk_family,
2037			af_family_clock_key_strings[sk->sk_family]);
2038
2039	sk->sk_state_change	=	sock_def_wakeup;
2040	sk->sk_data_ready	=	sock_def_readable;
2041	sk->sk_write_space	=	sock_def_write_space;
2042	sk->sk_error_report	=	sock_def_error_report;
2043	sk->sk_destruct		=	sock_def_destruct;
2044
2045	sk->sk_sndmsg_page	=	NULL;
2046	sk->sk_sndmsg_off	=	0;
2047
2048	sk->sk_peer_pid 	=	NULL;
2049	sk->sk_peer_cred	=	NULL;
2050	sk->sk_write_pending	=	0;
2051	sk->sk_rcvlowat		=	1;
2052	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2053	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2054
2055	sk->sk_stamp = ktime_set(-1L, 0);
2056
2057	/*
2058	 * Before updating sk_refcnt, we must commit prior changes to memory
2059	 * (see Documentation/RCU/rculist_nulls.txt for details)
2060	 */
2061	smp_wmb();
2062	atomic_set(&sk->sk_refcnt, 1);
2063	atomic_set(&sk->sk_drops, 0);
2064}
2065EXPORT_SYMBOL(sock_init_data);
2066
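/*
 * lock_sock_nested()/release_sock() give the socket lock mutex-like,
 * process-context semantics: the "owned" flag is taken under sk_lock.slock,
 * and packets that were queued on the backlog while the lock was owned are
 * processed by __release_sock() when it is released. Callers normally go
 * through the lock_sock() wrapper (subclass 0), e.g.:
 *
 *	lock_sock(sk);
 *	... touch protocol state that softirqs must not race with ...
 *	release_sock(sk);
 */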
2067void lock_sock_nested(struct sock *sk, int subclass)
2068{
2069	might_sleep();
2070	spin_lock_bh(&sk->sk_lock.slock);
2071	if (sk->sk_lock.owned)
2072		__lock_sock(sk);
2073	sk->sk_lock.owned = 1;
2074	spin_unlock(&sk->sk_lock.slock);
2075	/*
2076	 * The sk_lock has mutex_lock() semantics here:
2077	 */
2078	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2079	local_bh_enable();
2080}
2081EXPORT_SYMBOL(lock_sock_nested);
2082
2083void release_sock(struct sock *sk)
2084{
2085	/*
2086	 * The sk_lock has mutex_unlock() semantics:
2087	 */
2088	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2089
2090	spin_lock_bh(&sk->sk_lock.slock);
2091	if (sk->sk_backlog.tail)
2092		__release_sock(sk);
2093	sk->sk_lock.owned = 0;
2094	if (waitqueue_active(&sk->sk_lock.wq))
2095		wake_up(&sk->sk_lock.wq);
2096	spin_unlock_bh(&sk->sk_lock.slock);
2097}
2098EXPORT_SYMBOL(release_sock);
2099
2100/**
2101 * lock_sock_fast - fast version of lock_sock
2102 * @sk: socket
2103 *
2104 * This version should be used for very small sections, where the process won't block.
2105 * Returns false if the fast path is taken:
2106 *   sk_lock.slock locked, owned = 0, BH disabled
2107 * Returns true if the slow path is taken:
2108 *   sk_lock.slock unlocked, owned = 1, BH enabled
2109 */
2110bool lock_sock_fast(struct sock *sk)
2111{
2112	might_sleep();
2113	spin_lock_bh(&sk->sk_lock.slock);
2114
2115	if (!sk->sk_lock.owned)
2116		/*
2117		 * Note: fast path returns with BH disabled and sk_lock.slock held.
2118		 */
2119		return false;
2120
2121	__lock_sock(sk);
2122	sk->sk_lock.owned = 1;
2123	spin_unlock(&sk->sk_lock.slock);
2124	/*
2125	 * The sk_lock has mutex_lock() semantics here:
2126	 */
2127	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2128	local_bh_enable();
2129	return true;
2130}
2131EXPORT_SYMBOL(lock_sock_fast);
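/*
 * lock_sock_fast() is paired with unlock_sock_fast(), which must be given the
 * returned slow-path indicator so that the matching unlock is performed:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */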
2132
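/*
 * sock_get_timestamp()/sock_get_timestampns() back the SIOCGSTAMP and
 * SIOCGSTAMPNS ioctls: they enable timestamping on first use, return -ENOENT
 * while no packet has been timestamped yet (sk_stamp still holds the -1
 * sentinel set by sock_init_data()) and otherwise copy the last packet's
 * receive time to user space, e.g. for an ioctl(fd, SIOCGSTAMP, &tv) issued
 * after recvmsg().
 */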
2133int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2134{
2135	struct timeval tv;
2136	if (!sock_flag(sk, SOCK_TIMESTAMP))
2137		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2138	tv = ktime_to_timeval(sk->sk_stamp);
2139	if (tv.tv_sec == -1)
2140		return -ENOENT;
2141	if (tv.tv_sec == 0) {
2142		sk->sk_stamp = ktime_get_real();
2143		tv = ktime_to_timeval(sk->sk_stamp);
2144	}
2145	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2146}
2147EXPORT_SYMBOL(sock_get_timestamp);
2148
2149int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2150{
2151	struct timespec ts;
2152	if (!sock_flag(sk, SOCK_TIMESTAMP))
2153		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2154	ts = ktime_to_timespec(sk->sk_stamp);
2155	if (ts.tv_sec == -1)
2156		return -ENOENT;
2157	if (ts.tv_sec == 0) {
2158		sk->sk_stamp = ktime_get_real();
2159		ts = ktime_to_timespec(sk->sk_stamp);
2160	}
2161	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2162}
2163EXPORT_SYMBOL(sock_get_timestampns);
2164
2165void sock_enable_timestamp(struct sock *sk, int flag)
2166{
2167	if (!sock_flag(sk, flag)) {
2168		sock_set_flag(sk, flag);
2169		/*
2170		 * We just set one of the two flags which require net
2171		 * time stamping, but time stamping might already have been
2172		 * enabled because of the other one.
2173		 */
2174		if (!sock_flag(sk,
2175				flag == SOCK_TIMESTAMP ?
2176				SOCK_TIMESTAMPING_RX_SOFTWARE :
2177				SOCK_TIMESTAMP))
2178			net_enable_timestamp();
2179	}
2180}
2181
2182/*
2183 *	Get a socket option on a socket.
2184 *
2185 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2186 *	asynchronous errors should be reported by getsockopt. We assume
2187 *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2188 */
2189int sock_common_getsockopt(struct socket *sock, int level, int optname,
2190			   char __user *optval, int __user *optlen)
2191{
2192	struct sock *sk = sock->sk;
2193
2194	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2195}
2196EXPORT_SYMBOL(sock_common_getsockopt);
2197
2198#ifdef CONFIG_COMPAT
2199int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2200				  char __user *optval, int __user *optlen)
2201{
2202	struct sock *sk = sock->sk;
2203
2204	if (sk->sk_prot->compat_getsockopt != NULL)
2205		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2206						      optval, optlen);
2207	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2208}
2209EXPORT_SYMBOL(compat_sock_common_getsockopt);
2210#endif
2211
2212int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2213			struct msghdr *msg, size_t size, int flags)
2214{
2215	struct sock *sk = sock->sk;
2216	int addr_len = 0;
2217	int err;
2218
2219	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2220				   flags & ~MSG_DONTWAIT, &addr_len);
2221	if (err >= 0)
2222		msg->msg_namelen = addr_len;
2223	return err;
2224}
2225EXPORT_SYMBOL(sock_common_recvmsg);
2226
2227/*
2228 *	Set socket options on an inet socket.
2229 */
2230int sock_common_setsockopt(struct socket *sock, int level, int optname,
2231			   char __user *optval, unsigned int optlen)
2232{
2233	struct sock *sk = sock->sk;
2234
2235	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2236}
2237EXPORT_SYMBOL(sock_common_setsockopt);
2238
2239#ifdef CONFIG_COMPAT
2240int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2241				  char __user *optval, unsigned int optlen)
2242{
2243	struct sock *sk = sock->sk;
2244
2245	if (sk->sk_prot->compat_setsockopt != NULL)
2246		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2247						      optval, optlen);
2248	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2249}
2250EXPORT_SYMBOL(compat_sock_common_setsockopt);
2251#endif
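/*
 * The sock_common_{get,set}sockopt() and sock_common_recvmsg() helpers above
 * simply forward to the sk->sk_prot handlers. Protocols that keep all of
 * their option and receive handling at the struct proto level plug them
 * straight into their proto_ops, e.g. (illustrative sketch;
 * "example_stream_ops" is a made-up name):
 *
 *	static const struct proto_ops example_stream_ops = {
 *		...
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *	};
 */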
2252
2253void sk_common_release(struct sock *sk)
2254{
2255	if (sk->sk_prot->destroy)
2256		sk->sk_prot->destroy(sk);
2257
2258	/*
2259	 * Observation: when sk_common_release() is called, user processes have
2260	 * no access to the socket any more, but the network stack still does.
2261	 * Step one, detach it from networking:
2262	 *
2263	 * A. Remove from hash tables.
2264	 */
2265
2266	sk->sk_prot->unhash(sk);
2267
2268	/*
2269	 * At this point the socket cannot receive new packets, but some may
2270	 * still be in flight because another CPU did its hash table lookup
2271	 * before we unhashed the socket. Those packets will reach the receive
2272	 * queue and will be purged by the socket destructor.
2273	 *
2274	 * We may also still have packets pending on the receive queue and,
2275	 * probably, our own packets waiting in device queues. The destructor
2276	 * will drain the receive queue, but transmitted packets will delay
2277	 * socket destruction until the last reference is released.
2278	 */
2279
2280	sock_orphan(sk);
2281
2282	xfrm_sk_free_policy(sk);
2283
2284	sk_refcnt_debug_release(sk);
2285	sock_put(sk);
2286}
2287EXPORT_SYMBOL(sk_common_release);
2288
2289static DEFINE_RWLOCK(proto_list_lock);
2290static LIST_HEAD(proto_list);
2291
2292#ifdef CONFIG_PROC_FS
2293#define PROTO_INUSE_NR	64	/* should be enough for now */
2294struct prot_inuse {
2295	int val[PROTO_INUSE_NR];
2296};
2297
2298static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2299
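/*
 * Per-cpu "sockets in use" counters (kept per network namespace when
 * CONFIG_NET_NS is set), indexed by the slot assign_proto_idx() hands out at
 * registration time. Protocol hash/unhash code bumps them with
 * sock_prot_inuse_add(net, prot, +1/-1); sock_prot_inuse_get() sums them for
 * the "sockets" column of /proc/net/protocols.
 */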
2300#ifdef CONFIG_NET_NS
2301void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2302{
2303	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2304}
2305EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2306
2307int sock_prot_inuse_get(struct net *net, struct proto *prot)
2308{
2309	int cpu, idx = prot->inuse_idx;
2310	int res = 0;
2311
2312	for_each_possible_cpu(cpu)
2313		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2314
2315	return res >= 0 ? res : 0;
2316}
2317EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2318
2319static int __net_init sock_inuse_init_net(struct net *net)
2320{
2321	net->core.inuse = alloc_percpu(struct prot_inuse);
2322	return net->core.inuse ? 0 : -ENOMEM;
2323}
2324
2325static void __net_exit sock_inuse_exit_net(struct net *net)
2326{
2327	free_percpu(net->core.inuse);
2328}
2329
2330static struct pernet_operations net_inuse_ops = {
2331	.init = sock_inuse_init_net,
2332	.exit = sock_inuse_exit_net,
2333};
2334
2335static __init int net_inuse_init(void)
2336{
2337	if (register_pernet_subsys(&net_inuse_ops))
2338		panic("Cannot initialize net inuse counters");
2339
2340	return 0;
2341}
2342
2343core_initcall(net_inuse_init);
2344#else
2345static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2346
2347void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2348{
2349	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2350}
2351EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2352
2353int sock_prot_inuse_get(struct net *net, struct proto *prot)
2354{
2355	int cpu, idx = prot->inuse_idx;
2356	int res = 0;
2357
2358	for_each_possible_cpu(cpu)
2359		res += per_cpu(prot_inuse, cpu).val[idx];
2360
2361	return res >= 0 ? res : 0;
2362}
2363EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2364#endif
2365
2366static void assign_proto_idx(struct proto *prot)
2367{
2368	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2369
2370	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2371		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2372		return;
2373	}
2374
2375	set_bit(prot->inuse_idx, proto_inuse_idx);
2376}
2377
2378static void release_proto_idx(struct proto *prot)
2379{
2380	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2381		clear_bit(prot->inuse_idx, proto_inuse_idx);
2382}
2383#else
2384static inline void assign_proto_idx(struct proto *prot)
2385{
2386}
2387
2388static inline void release_proto_idx(struct proto *prot)
2389{
2390}
2391#endif
2392
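/*
 * proto_register()/proto_unregister() add a protocol to and remove it from
 * the global proto_list and, when alloc_slab is non-zero, create (and later
 * destroy) the slab caches for its sock, request_sock and timewait_sock
 * objects. A minimal caller looks roughly like this (illustrative sketch;
 * "example_prot" and "struct example_sock" are made-up names):
 *
 *	static struct proto example_prot = {
 *		.name     = "EXAMPLE",
 *		.owner    = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_prot, 1);
 *	...
 *	proto_unregister(&example_prot);
 */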
2393int proto_register(struct proto *prot, int alloc_slab)
2394{
2395	if (alloc_slab) {
2396		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2397					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2398					NULL);
2399
2400		if (prot->slab == NULL) {
2401			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2402			       prot->name);
2403			goto out;
2404		}
2405
2406		if (prot->rsk_prot != NULL) {
2407			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2408			if (prot->rsk_prot->slab_name == NULL)
2409				goto out_free_sock_slab;
2410
2411			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2412								 prot->rsk_prot->obj_size, 0,
2413								 SLAB_HWCACHE_ALIGN, NULL);
2414
2415			if (prot->rsk_prot->slab == NULL) {
2416				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2417				       prot->name);
2418				goto out_free_request_sock_slab_name;
2419			}
2420		}
2421
2422		if (prot->twsk_prot != NULL) {
2423			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2424
2425			if (prot->twsk_prot->twsk_slab_name == NULL)
2426				goto out_free_request_sock_slab;
2427
2428			prot->twsk_prot->twsk_slab =
2429				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2430						  prot->twsk_prot->twsk_obj_size,
2431						  0,
2432						  SLAB_HWCACHE_ALIGN |
2433							prot->slab_flags,
2434						  NULL);
2435			if (prot->twsk_prot->twsk_slab == NULL)
2436				goto out_free_timewait_sock_slab_name;
2437		}
2438	}
2439
2440	write_lock(&proto_list_lock);
2441	list_add(&prot->node, &proto_list);
2442	assign_proto_idx(prot);
2443	write_unlock(&proto_list_lock);
2444	return 0;
2445
2446out_free_timewait_sock_slab_name:
2447	kfree(prot->twsk_prot->twsk_slab_name);
2448out_free_request_sock_slab:
2449	if (prot->rsk_prot && prot->rsk_prot->slab) {
2450		kmem_cache_destroy(prot->rsk_prot->slab);
2451		prot->rsk_prot->slab = NULL;
2452	}
2453out_free_request_sock_slab_name:
2454	if (prot->rsk_prot)
2455		kfree(prot->rsk_prot->slab_name);
2456out_free_sock_slab:
2457	kmem_cache_destroy(prot->slab);
2458	prot->slab = NULL;
2459out:
2460	return -ENOBUFS;
2461}
2462EXPORT_SYMBOL(proto_register);
2463
2464void proto_unregister(struct proto *prot)
2465{
2466	write_lock(&proto_list_lock);
2467	release_proto_idx(prot);
2468	list_del(&prot->node);
2469	write_unlock(&proto_list_lock);
2470
2471	if (prot->slab != NULL) {
2472		kmem_cache_destroy(prot->slab);
2473		prot->slab = NULL;
2474	}
2475
2476	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2477		kmem_cache_destroy(prot->rsk_prot->slab);
2478		kfree(prot->rsk_prot->slab_name);
2479		prot->rsk_prot->slab = NULL;
2480	}
2481
2482	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2483		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2484		kfree(prot->twsk_prot->twsk_slab_name);
2485		prot->twsk_prot->twsk_slab = NULL;
2486	}
2487}
2488EXPORT_SYMBOL(proto_unregister);
2489
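/*
 * Everything below implements /proc/net/protocols: one header line followed
 * by one line per registered protocol showing its object size, in-use socket
 * count, memory accounting state and a y/n flag for each optional struct
 * proto method (the "cl co di ac ..." columns).
 */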
2490#ifdef CONFIG_PROC_FS
2491static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2492	__acquires(proto_list_lock)
2493{
2494	read_lock(&proto_list_lock);
2495	return seq_list_start_head(&proto_list, *pos);
2496}
2497
2498static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2499{
2500	return seq_list_next(v, &proto_list, pos);
2501}
2502
2503static void proto_seq_stop(struct seq_file *seq, void *v)
2504	__releases(proto_list_lock)
2505{
2506	read_unlock(&proto_list_lock);
2507}
2508
2509static char proto_method_implemented(const void *method)
2510{
2511	return method == NULL ? 'n' : 'y';
2512}
2513
2514static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2515{
2516	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2517			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2518		   proto->name,
2519		   proto->obj_size,
2520		   sock_prot_inuse_get(seq_file_net(seq), proto),
2521		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
2522		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2523		   proto->max_header,
2524		   proto->slab == NULL ? "no" : "yes",
2525		   module_name(proto->owner),
2526		   proto_method_implemented(proto->close),
2527		   proto_method_implemented(proto->connect),
2528		   proto_method_implemented(proto->disconnect),
2529		   proto_method_implemented(proto->accept),
2530		   proto_method_implemented(proto->ioctl),
2531		   proto_method_implemented(proto->init),
2532		   proto_method_implemented(proto->destroy),
2533		   proto_method_implemented(proto->shutdown),
2534		   proto_method_implemented(proto->setsockopt),
2535		   proto_method_implemented(proto->getsockopt),
2536		   proto_method_implemented(proto->sendmsg),
2537		   proto_method_implemented(proto->recvmsg),
2538		   proto_method_implemented(proto->sendpage),
2539		   proto_method_implemented(proto->bind),
2540		   proto_method_implemented(proto->backlog_rcv),
2541		   proto_method_implemented(proto->hash),
2542		   proto_method_implemented(proto->unhash),
2543		   proto_method_implemented(proto->get_port),
2544		   proto_method_implemented(proto->enter_memory_pressure));
2545}
2546
2547static int proto_seq_show(struct seq_file *seq, void *v)
2548{
2549	if (v == &proto_list)
2550		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2551			   "protocol",
2552			   "size",
2553			   "sockets",
2554			   "memory",
2555			   "press",
2556			   "maxhdr",
2557			   "slab",
2558			   "module",
2559			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2560	else
2561		proto_seq_printf(seq, list_entry(v, struct proto, node));
2562	return 0;
2563}
2564
2565static const struct seq_operations proto_seq_ops = {
2566	.start  = proto_seq_start,
2567	.next   = proto_seq_next,
2568	.stop   = proto_seq_stop,
2569	.show   = proto_seq_show,
2570};
2571
2572static int proto_seq_open(struct inode *inode, struct file *file)
2573{
2574	return seq_open_net(inode, file, &proto_seq_ops,
2575			    sizeof(struct seq_net_private));
2576}
2577
2578static const struct file_operations proto_seq_fops = {
2579	.owner		= THIS_MODULE,
2580	.open		= proto_seq_open,
2581	.read		= seq_read,
2582	.llseek		= seq_lseek,
2583	.release	= seq_release_net,
2584};
2585
2586static __net_init int proto_init_net(struct net *net)
2587{
2588	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2589		return -ENOMEM;
2590
2591	return 0;
2592}
2593
2594static __net_exit void proto_exit_net(struct net *net)
2595{
2596	proc_net_remove(net, "protocols");
2597}
2598
2599
2600static __net_initdata struct pernet_operations proto_net_ops = {
2601	.init = proto_init_net,
2602	.exit = proto_exit_net,
2603};
2604
2605static int __init proto_init(void)
2606{
2607	return register_pernet_subsys(&proto_net_ops);
2608}
2609
2610subsys_initcall(proto_init);
2611
2612#endif /* PROC_FS */
2613