sock.c revision cb75a36c8a1ab68e2dbfbe172f12c792b0c6dba8
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116
117#include <asm/uaccess.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/net_namespace.h>
123#include <net/request_sock.h>
124#include <net/sock.h>
125#include <linux/net_tstamp.h>
126#include <net/xfrm.h>
127#include <linux/ipsec.h>
128#include <net/cls_cgroup.h>
129#include <net/netprio_cgroup.h>
130
131#include <linux/filter.h>
132
133#include <trace/events/sock.h>
134
135#ifdef CONFIG_INET
136#include <net/tcp.h>
137#endif
138
139static DEFINE_MUTEX(proto_list_mutex);
140static LIST_HEAD(proto_list);
141
142#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
143int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
144{
145	struct proto *proto;
146	int ret = 0;
147
148	mutex_lock(&proto_list_mutex);
149	list_for_each_entry(proto, &proto_list, node) {
150		if (proto->init_cgroup) {
151			ret = proto->init_cgroup(cgrp, ss);
152			if (ret)
153				goto out;
154		}
155	}
156
157	mutex_unlock(&proto_list_mutex);
158	return ret;
159out:
160	list_for_each_entry_continue_reverse(proto, &proto_list, node)
161		if (proto->destroy_cgroup)
162			proto->destroy_cgroup(cgrp);
163	mutex_unlock(&proto_list_mutex);
164	return ret;
165}
166
167void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
168{
169	struct proto *proto;
170
171	mutex_lock(&proto_list_mutex);
172	list_for_each_entry_reverse(proto, &proto_list, node)
173		if (proto->destroy_cgroup)
174			proto->destroy_cgroup(cgrp);
175	mutex_unlock(&proto_list_mutex);
176}
177#endif
178
179/*
180 * Each address family might have different locking rules, so we have
181 * one slock key per address family:
182 */
183static struct lock_class_key af_family_keys[AF_MAX];
184static struct lock_class_key af_family_slock_keys[AF_MAX];
185
186struct static_key memcg_socket_limit_enabled;
187EXPORT_SYMBOL(memcg_socket_limit_enabled);
188
189/*
190 * Make lock validator output more readable. (we pre-construct these
191 * strings at build time, so that runtime initialization of socket
192 * locks is fast):
193 */
194static const char *const af_family_key_strings[AF_MAX+1] = {
195  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
196  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
197  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
198  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
199  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
200  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
201  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
202  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
203  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
204  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
205  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
206  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
207  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
208  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
209};
210static const char *const af_family_slock_key_strings[AF_MAX+1] = {
211  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
212  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
213  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
214  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
215  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
216  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
217  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
218  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
219  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
220  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
221  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
222  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
223  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
224  "slock-AF_NFC"   , "slock-AF_MAX"
225};
226static const char *const af_family_clock_key_strings[AF_MAX+1] = {
227  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
228  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
229  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
230  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
231  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
232  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
233  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
234  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
235  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
236  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
237  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
238  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
239  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
240  "clock-AF_NFC"   , "clock-AF_MAX"
241};
242
243/*
244 * sk_callback_lock locking rules are per-address-family,
245 * so split the lock classes by using a per-AF key:
246 */
247static struct lock_class_key af_callback_keys[AF_MAX];
248
249/* Take into consideration the size of the struct sk_buff overhead in the
250 * determination of these values, since that is non-constant across
251 * platforms.  This makes socket queueing behavior and performance
252 * not depend upon such differences.
253 */
254#define _SK_MEM_PACKETS		256
255#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
256#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
257#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
258
259/* Run time adjustable parameters. */
260__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
261__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
262__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
263__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
264
265/* Maximal space eaten by iovec or ancillary data plus some space */
266int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
267EXPORT_SYMBOL(sysctl_optmem_max);
268
269#if defined(CONFIG_CGROUPS)
270#if !defined(CONFIG_NET_CLS_CGROUP)
271int net_cls_subsys_id = -1;
272EXPORT_SYMBOL_GPL(net_cls_subsys_id);
273#endif
274#if !defined(CONFIG_NETPRIO_CGROUP)
275int net_prio_subsys_id = -1;
276EXPORT_SYMBOL_GPL(net_prio_subsys_id);
277#endif
278#endif
279
280static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
281{
282	struct timeval tv;
283
284	if (optlen < sizeof(tv))
285		return -EINVAL;
286	if (copy_from_user(&tv, optval, sizeof(tv)))
287		return -EFAULT;
288	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
289		return -EDOM;
290
291	if (tv.tv_sec < 0) {
292		static int warned __read_mostly;
293
294		*timeo_p = 0;
295		if (warned < 10 && net_ratelimit()) {
296			warned++;
297			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
298			       "tries to set negative timeout\n",
299				current->comm, task_pid_nr(current));
300		}
301		return 0;
302	}
303	*timeo_p = MAX_SCHEDULE_TIMEOUT;
304	if (tv.tv_sec == 0 && tv.tv_usec == 0)
305		return 0;
306	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
307		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
308	return 0;
309}
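
/* Worked example (illustrative, not part of the original file; assumes
 * HZ = 1000): a userspace timeval of { .tv_sec = 2, .tv_usec = 500000 }
 * passed via SO_RCVTIMEO gives
 *
 *	*timeo_p = 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies,
 *
 * i.e. the microsecond part is rounded up to the next tick. An all-zero
 * timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT), and a negative
 * tv_sec is clamped to a zero timeout with a rate-limited warning.
 */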
310
311static void sock_warn_obsolete_bsdism(const char *name)
312{
313	static int warned;
314	static char warncomm[TASK_COMM_LEN];
315	if (strcmp(warncomm, current->comm) && warned < 5) {
316		strcpy(warncomm,  current->comm);
317		printk(KERN_WARNING "process `%s' is using obsolete "
318		       "%s SO_BSDCOMPAT\n", warncomm, name);
319		warned++;
320	}
321}
322
323#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
324
325static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
326{
327	if (sk->sk_flags & flags) {
328		sk->sk_flags &= ~flags;
329		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
330			net_disable_timestamp();
331	}
332}
333
334
335int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
336{
337	int err;
338	int skb_len;
339	unsigned long flags;
340	struct sk_buff_head *list = &sk->sk_receive_queue;
341
342	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
343		atomic_inc(&sk->sk_drops);
344		trace_sock_rcvqueue_full(sk, skb);
345		return -ENOMEM;
346	}
347
348	err = sk_filter(sk, skb);
349	if (err)
350		return err;
351
352	if (!sk_rmem_schedule(sk, skb->truesize)) {
353		atomic_inc(&sk->sk_drops);
354		return -ENOBUFS;
355	}
356
357	skb->dev = NULL;
358	skb_set_owner_r(skb, sk);
359
360	/* Cache the SKB length before we tack it onto the receive
361	 * queue.  Once it is added it no longer belongs to us and
362	 * may be freed by other threads of control pulling packets
363	 * from the queue.
364	 */
365	skb_len = skb->len;
366
367	/* We escape from the RCU-protected region, so make sure we don't leak
368	 * a non-refcounted dst.
369	 */
370	skb_dst_force(skb);
371
372	spin_lock_irqsave(&list->lock, flags);
373	skb->dropcount = atomic_read(&sk->sk_drops);
374	__skb_queue_tail(list, skb);
375	spin_unlock_irqrestore(&list->lock, flags);
376
377	if (!sock_flag(sk, SOCK_DEAD))
378		sk->sk_data_ready(sk, skb_len);
379	return 0;
380}
381EXPORT_SYMBOL(sock_queue_rcv_skb);
382
383int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
384{
385	int rc = NET_RX_SUCCESS;
386
387	if (sk_filter(sk, skb))
388		goto discard_and_relse;
389
390	skb->dev = NULL;
391
392	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
393		atomic_inc(&sk->sk_drops);
394		goto discard_and_relse;
395	}
396	if (nested)
397		bh_lock_sock_nested(sk);
398	else
399		bh_lock_sock(sk);
400	if (!sock_owned_by_user(sk)) {
401		/*
402		 * trylock + unlock semantics:
403		 */
404		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
405
406		rc = sk_backlog_rcv(sk, skb);
407
408		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
409	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
410		bh_unlock_sock(sk);
411		atomic_inc(&sk->sk_drops);
412		goto discard_and_relse;
413	}
414
415	bh_unlock_sock(sk);
416out:
417	sock_put(sk);
418	return rc;
419discard_and_relse:
420	kfree_skb(skb);
421	goto out;
422}
423EXPORT_SYMBOL(sk_receive_skb);
424
425void sk_reset_txq(struct sock *sk)
426{
427	sk_tx_queue_clear(sk);
428}
429EXPORT_SYMBOL(sk_reset_txq);
430
431struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
432{
433	struct dst_entry *dst = __sk_dst_get(sk);
434
435	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
436		sk_tx_queue_clear(sk);
437		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
438		dst_release(dst);
439		return NULL;
440	}
441
442	return dst;
443}
444EXPORT_SYMBOL(__sk_dst_check);
445
446struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
447{
448	struct dst_entry *dst = sk_dst_get(sk);
449
450	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
451		sk_dst_reset(sk);
452		dst_release(dst);
453		return NULL;
454	}
455
456	return dst;
457}
458EXPORT_SYMBOL(sk_dst_check);
459
460static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
461{
462	int ret = -ENOPROTOOPT;
463#ifdef CONFIG_NETDEVICES
464	struct net *net = sock_net(sk);
465	char devname[IFNAMSIZ];
466	int index;
467
468	/* Sorry... */
469	ret = -EPERM;
470	if (!capable(CAP_NET_RAW))
471		goto out;
472
473	ret = -EINVAL;
474	if (optlen < 0)
475		goto out;
476
477	/* Bind this socket to a particular device like "eth0",
478	 * as specified in the passed interface name. If the
479	 * name is "" or the option length is zero the socket
480	 * is not bound.
481	 */
482	if (optlen > IFNAMSIZ - 1)
483		optlen = IFNAMSIZ - 1;
484	memset(devname, 0, sizeof(devname));
485
486	ret = -EFAULT;
487	if (copy_from_user(devname, optval, optlen))
488		goto out;
489
490	index = 0;
491	if (devname[0] != '\0') {
492		struct net_device *dev;
493
494		rcu_read_lock();
495		dev = dev_get_by_name_rcu(net, devname);
496		if (dev)
497			index = dev->ifindex;
498		rcu_read_unlock();
499		ret = -ENODEV;
500		if (!dev)
501			goto out;
502	}
503
504	lock_sock(sk);
505	sk->sk_bound_dev_if = index;
506	sk_dst_reset(sk);
507	release_sock(sk);
508
509	ret = 0;
510
511out:
512#endif
513
514	return ret;
515}
516
517static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
518{
519	if (valbool)
520		sock_set_flag(sk, bit);
521	else
522		sock_reset_flag(sk, bit);
523}
524
525/*
526 *	This is meant for all protocols to use and covers goings on
527 *	at the socket level. Everything here is generic.
528 */
529
530int sock_setsockopt(struct socket *sock, int level, int optname,
531		    char __user *optval, unsigned int optlen)
532{
533	struct sock *sk = sock->sk;
534	int val;
535	int valbool;
536	struct linger ling;
537	int ret = 0;
538
539	/*
540	 *	Options without arguments
541	 */
542
543	if (optname == SO_BINDTODEVICE)
544		return sock_bindtodevice(sk, optval, optlen);
545
546	if (optlen < sizeof(int))
547		return -EINVAL;
548
549	if (get_user(val, (int __user *)optval))
550		return -EFAULT;
551
552	valbool = val ? 1 : 0;
553
554	lock_sock(sk);
555
556	switch (optname) {
557	case SO_DEBUG:
558		if (val && !capable(CAP_NET_ADMIN))
559			ret = -EACCES;
560		else
561			sock_valbool_flag(sk, SOCK_DBG, valbool);
562		break;
563	case SO_REUSEADDR:
564		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
565		break;
566	case SO_TYPE:
567	case SO_PROTOCOL:
568	case SO_DOMAIN:
569	case SO_ERROR:
570		ret = -ENOPROTOOPT;
571		break;
572	case SO_DONTROUTE:
573		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
574		break;
575	case SO_BROADCAST:
576		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
577		break;
578	case SO_SNDBUF:
579		/* Don't error on this; BSD doesn't, and if you think
580		 * about it, this is right. Otherwise apps have to
581		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
582		 * are treated in BSD as hints.
583		 */
584		val = min_t(u32, val, sysctl_wmem_max);
585set_sndbuf:
586		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
587		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
588		/* Wake up sending tasks if we upped the value. */
589		sk->sk_write_space(sk);
590		break;
591
592	case SO_SNDBUFFORCE:
593		if (!capable(CAP_NET_ADMIN)) {
594			ret = -EPERM;
595			break;
596		}
597		goto set_sndbuf;
598
599	case SO_RCVBUF:
600		/* Don't error on this; BSD doesn't, and if you think
601		 * about it, this is right. Otherwise apps have to
602		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
603		 * are treated in BSD as hints.
604		 */
605		val = min_t(u32, val, sysctl_rmem_max);
606set_rcvbuf:
607		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
608		/*
609		 * We double it on the way in to account for
610		 * "struct sk_buff" etc. overhead.   Applications
611		 * assume that the SO_RCVBUF setting they make will
612		 * allow that much actual data to be received on that
613		 * socket.
614		 *
615		 * Applications are unaware that "struct sk_buff" and
616		 * other overheads allocate from the receive buffer
617		 * during socket buffer allocation.
618		 *
619		 * And after considering the possible alternatives,
620		 * returning the value we actually used in getsockopt
621		 * is the most desirable behavior.
622		 */
623		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
624		break;
625
626	case SO_RCVBUFFORCE:
627		if (!capable(CAP_NET_ADMIN)) {
628			ret = -EPERM;
629			break;
630		}
631		goto set_rcvbuf;
632
633	case SO_KEEPALIVE:
634#ifdef CONFIG_INET
635		if (sk->sk_protocol == IPPROTO_TCP)
636			tcp_set_keepalive(sk, valbool);
637#endif
638		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
639		break;
640
641	case SO_OOBINLINE:
642		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
643		break;
644
645	case SO_NO_CHECK:
646		sk->sk_no_check = valbool;
647		break;
648
649	case SO_PRIORITY:
650		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
651			sk->sk_priority = val;
652		else
653			ret = -EPERM;
654		break;
655
656	case SO_LINGER:
657		if (optlen < sizeof(ling)) {
658			ret = -EINVAL;	/* 1003.1g */
659			break;
660		}
661		if (copy_from_user(&ling, optval, sizeof(ling))) {
662			ret = -EFAULT;
663			break;
664		}
665		if (!ling.l_onoff)
666			sock_reset_flag(sk, SOCK_LINGER);
667		else {
668#if (BITS_PER_LONG == 32)
669			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
670				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
671			else
672#endif
673				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
674			sock_set_flag(sk, SOCK_LINGER);
675		}
676		break;
677
678	case SO_BSDCOMPAT:
679		sock_warn_obsolete_bsdism("setsockopt");
680		break;
681
682	case SO_PASSCRED:
683		if (valbool)
684			set_bit(SOCK_PASSCRED, &sock->flags);
685		else
686			clear_bit(SOCK_PASSCRED, &sock->flags);
687		break;
688
689	case SO_TIMESTAMP:
690	case SO_TIMESTAMPNS:
691		if (valbool)  {
692			if (optname == SO_TIMESTAMP)
693				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
694			else
695				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
696			sock_set_flag(sk, SOCK_RCVTSTAMP);
697			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
698		} else {
699			sock_reset_flag(sk, SOCK_RCVTSTAMP);
700			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
701		}
702		break;
703
704	case SO_TIMESTAMPING:
705		if (val & ~SOF_TIMESTAMPING_MASK) {
706			ret = -EINVAL;
707			break;
708		}
709		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
710				  val & SOF_TIMESTAMPING_TX_HARDWARE);
711		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
712				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
713		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
714				  val & SOF_TIMESTAMPING_RX_HARDWARE);
715		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
716			sock_enable_timestamp(sk,
717					      SOCK_TIMESTAMPING_RX_SOFTWARE);
718		else
719			sock_disable_timestamp(sk,
720					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
721		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
722				  val & SOF_TIMESTAMPING_SOFTWARE);
723		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
724				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
725		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
726				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
727		break;
728
729	case SO_RCVLOWAT:
730		if (val < 0)
731			val = INT_MAX;
732		sk->sk_rcvlowat = val ? : 1;
733		break;
734
735	case SO_RCVTIMEO:
736		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
737		break;
738
739	case SO_SNDTIMEO:
740		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
741		break;
742
743	case SO_ATTACH_FILTER:
744		ret = -EINVAL;
745		if (optlen == sizeof(struct sock_fprog)) {
746			struct sock_fprog fprog;
747
748			ret = -EFAULT;
749			if (copy_from_user(&fprog, optval, sizeof(fprog)))
750				break;
751
752			ret = sk_attach_filter(&fprog, sk);
753		}
754		break;
755
756	case SO_DETACH_FILTER:
757		ret = sk_detach_filter(sk);
758		break;
759
760	case SO_PASSSEC:
761		if (valbool)
762			set_bit(SOCK_PASSSEC, &sock->flags);
763		else
764			clear_bit(SOCK_PASSSEC, &sock->flags);
765		break;
766	case SO_MARK:
767		if (!capable(CAP_NET_ADMIN))
768			ret = -EPERM;
769		else
770			sk->sk_mark = val;
771		break;
772
773		/* We implement the SO_SNDLOWAT etc to
774		   not be settable (1003.1g 5.3) */
775	case SO_RXQ_OVFL:
776		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
777		break;
778
779	case SO_WIFI_STATUS:
780		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
781		break;
782
783	case SO_PEEK_OFF:
784		if (sock->ops->set_peek_off)
785			sock->ops->set_peek_off(sk, val);
786		else
787			ret = -EOPNOTSUPP;
788		break;
789
790	case SO_NOFCS:
791		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
792		break;
793
794	default:
795		ret = -ENOPROTOOPT;
796		break;
797	}
798	release_sock(sk);
799	return ret;
800}
801EXPORT_SYMBOL(sock_setsockopt);
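
/* Usage sketch (illustrative, not part of the original file): the RCVBUF/
 * SNDBUF doubling above is visible to userspace, e.g.
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val now reads back as 131072 (2 * 65536), assuming 65536 does
 *	// not exceed sysctl_rmem_max; the extra space accounts for
 *	// struct sk_buff overhead as described in the SO_RCVBUF case.
 */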
802
803
804void cred_to_ucred(struct pid *pid, const struct cred *cred,
805		   struct ucred *ucred)
806{
807	ucred->pid = pid_vnr(pid);
808	ucred->uid = ucred->gid = -1;
809	if (cred) {
810		struct user_namespace *current_ns = current_user_ns();
811
812		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
813		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
814	}
815}
816EXPORT_SYMBOL_GPL(cred_to_ucred);
817
818int sock_getsockopt(struct socket *sock, int level, int optname,
819		    char __user *optval, int __user *optlen)
820{
821	struct sock *sk = sock->sk;
822
823	union {
824		int val;
825		struct linger ling;
826		struct timeval tm;
827	} v;
828
829	int lv = sizeof(int);
830	int len;
831
832	if (get_user(len, optlen))
833		return -EFAULT;
834	if (len < 0)
835		return -EINVAL;
836
837	memset(&v, 0, sizeof(v));
838
839	switch (optname) {
840	case SO_DEBUG:
841		v.val = sock_flag(sk, SOCK_DBG);
842		break;
843
844	case SO_DONTROUTE:
845		v.val = sock_flag(sk, SOCK_LOCALROUTE);
846		break;
847
848	case SO_BROADCAST:
849		v.val = !!sock_flag(sk, SOCK_BROADCAST);
850		break;
851
852	case SO_SNDBUF:
853		v.val = sk->sk_sndbuf;
854		break;
855
856	case SO_RCVBUF:
857		v.val = sk->sk_rcvbuf;
858		break;
859
860	case SO_REUSEADDR:
861		v.val = sk->sk_reuse;
862		break;
863
864	case SO_KEEPALIVE:
865		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
866		break;
867
868	case SO_TYPE:
869		v.val = sk->sk_type;
870		break;
871
872	case SO_PROTOCOL:
873		v.val = sk->sk_protocol;
874		break;
875
876	case SO_DOMAIN:
877		v.val = sk->sk_family;
878		break;
879
880	case SO_ERROR:
881		v.val = -sock_error(sk);
882		if (v.val == 0)
883			v.val = xchg(&sk->sk_err_soft, 0);
884		break;
885
886	case SO_OOBINLINE:
887		v.val = !!sock_flag(sk, SOCK_URGINLINE);
888		break;
889
890	case SO_NO_CHECK:
891		v.val = sk->sk_no_check;
892		break;
893
894	case SO_PRIORITY:
895		v.val = sk->sk_priority;
896		break;
897
898	case SO_LINGER:
899		lv		= sizeof(v.ling);
900		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
901		v.ling.l_linger	= sk->sk_lingertime / HZ;
902		break;
903
904	case SO_BSDCOMPAT:
905		sock_warn_obsolete_bsdism("getsockopt");
906		break;
907
908	case SO_TIMESTAMP:
909		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
910				!sock_flag(sk, SOCK_RCVTSTAMPNS);
911		break;
912
913	case SO_TIMESTAMPNS:
914		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
915		break;
916
917	case SO_TIMESTAMPING:
918		v.val = 0;
919		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
920			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
921		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
922			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
923		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
924			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
925		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
926			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
927		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
928			v.val |= SOF_TIMESTAMPING_SOFTWARE;
929		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
930			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
931		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
932			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
933		break;
934
935	case SO_RCVTIMEO:
936		lv = sizeof(struct timeval);
937		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
938			v.tm.tv_sec = 0;
939			v.tm.tv_usec = 0;
940		} else {
941			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
942			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
943		}
944		break;
945
946	case SO_SNDTIMEO:
947		lv = sizeof(struct timeval);
948		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
949			v.tm.tv_sec = 0;
950			v.tm.tv_usec = 0;
951		} else {
952			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
953			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
954		}
955		break;
956
957	case SO_RCVLOWAT:
958		v.val = sk->sk_rcvlowat;
959		break;
960
961	case SO_SNDLOWAT:
962		v.val = 1;
963		break;
964
965	case SO_PASSCRED:
966		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
967		break;
968
969	case SO_PEERCRED:
970	{
971		struct ucred peercred;
972		if (len > sizeof(peercred))
973			len = sizeof(peercred);
974		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
975		if (copy_to_user(optval, &peercred, len))
976			return -EFAULT;
977		goto lenout;
978	}
979
980	case SO_PEERNAME:
981	{
982		char address[128];
983
984		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
985			return -ENOTCONN;
986		if (lv < len)
987			return -EINVAL;
988		if (copy_to_user(optval, address, len))
989			return -EFAULT;
990		goto lenout;
991	}
992
993	/* Dubious BSD thing... Probably nobody even uses it, but
994	 * the UNIX standard wants it for whatever reason... -DaveM
995	 */
996	case SO_ACCEPTCONN:
997		v.val = sk->sk_state == TCP_LISTEN;
998		break;
999
1000	case SO_PASSSEC:
1001		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1002		break;
1003
1004	case SO_PEERSEC:
1005		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1006
1007	case SO_MARK:
1008		v.val = sk->sk_mark;
1009		break;
1010
1011	case SO_RXQ_OVFL:
1012		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1013		break;
1014
1015	case SO_WIFI_STATUS:
1016		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1017		break;
1018
1019	case SO_PEEK_OFF:
1020		if (!sock->ops->set_peek_off)
1021			return -EOPNOTSUPP;
1022
1023		v.val = sk->sk_peek_off;
1024		break;
1025	case SO_NOFCS:
1026		v.val = !!sock_flag(sk, SOCK_NOFCS);
1027		break;
1028	default:
1029		return -ENOPROTOOPT;
1030	}
1031
1032	if (len > lv)
1033		len = lv;
1034	if (copy_to_user(optval, &v, len))
1035		return -EFAULT;
1036lenout:
1037	if (put_user(len, optlen))
1038		return -EFAULT;
1039	return 0;
1040}
1041
1042/*
1043 * Initialize an sk_lock.
1044 *
1045 * (We also register the sk_lock with the lock validator.)
1046 */
1047static inline void sock_lock_init(struct sock *sk)
1048{
1049	sock_lock_init_class_and_name(sk,
1050			af_family_slock_key_strings[sk->sk_family],
1051			af_family_slock_keys + sk->sk_family,
1052			af_family_key_strings[sk->sk_family],
1053			af_family_keys + sk->sk_family);
1054}
1055
1056/*
1057 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1058 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1059 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1060 */
1061static void sock_copy(struct sock *nsk, const struct sock *osk)
1062{
1063#ifdef CONFIG_SECURITY_NETWORK
1064	void *sptr = nsk->sk_security;
1065#endif
1066	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1067
1068	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1069	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1070
1071#ifdef CONFIG_SECURITY_NETWORK
1072	nsk->sk_security = sptr;
1073	security_sk_clone(osk, nsk);
1074#endif
1075}
1076
1077/*
1078 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls nodes
1079 * unmodified. Special care is taken when initializing the object to zero.
1080 */
1081static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1082{
1083	if (offsetof(struct sock, sk_node.next) != 0)
1084		memset(sk, 0, offsetof(struct sock, sk_node.next));
1085	memset(&sk->sk_node.pprev, 0,
1086	       size - offsetof(struct sock, sk_node.pprev));
1087}
1088
1089void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1090{
1091	unsigned long nulls1, nulls2;
1092
1093	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1094	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1095	if (nulls1 > nulls2)
1096		swap(nulls1, nulls2);
1097
1098	if (nulls1 != 0)
1099		memset((char *)sk, 0, nulls1);
1100	memset((char *)sk + nulls1 + sizeof(void *), 0,
1101	       nulls2 - nulls1 - sizeof(void *));
1102	memset((char *)sk + nulls2 + sizeof(void *), 0,
1103	       size - nulls2 - sizeof(void *));
1104}
1105EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1106
1107static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1108		int family)
1109{
1110	struct sock *sk;
1111	struct kmem_cache *slab;
1112
1113	slab = prot->slab;
1114	if (slab != NULL) {
1115		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1116		if (!sk)
1117			return sk;
1118		if (priority & __GFP_ZERO) {
1119			if (prot->clear_sk)
1120				prot->clear_sk(sk, prot->obj_size);
1121			else
1122				sk_prot_clear_nulls(sk, prot->obj_size);
1123		}
1124	} else
1125		sk = kmalloc(prot->obj_size, priority);
1126
1127	if (sk != NULL) {
1128		kmemcheck_annotate_bitfield(sk, flags);
1129
1130		if (security_sk_alloc(sk, family, priority))
1131			goto out_free;
1132
1133		if (!try_module_get(prot->owner))
1134			goto out_free_sec;
1135		sk_tx_queue_clear(sk);
1136	}
1137
1138	return sk;
1139
1140out_free_sec:
1141	security_sk_free(sk);
1142out_free:
1143	if (slab != NULL)
1144		kmem_cache_free(slab, sk);
1145	else
1146		kfree(sk);
1147	return NULL;
1148}
1149
1150static void sk_prot_free(struct proto *prot, struct sock *sk)
1151{
1152	struct kmem_cache *slab;
1153	struct module *owner;
1154
1155	owner = prot->owner;
1156	slab = prot->slab;
1157
1158	security_sk_free(sk);
1159	if (slab != NULL)
1160		kmem_cache_free(slab, sk);
1161	else
1162		kfree(sk);
1163	module_put(owner);
1164}
1165
1166#ifdef CONFIG_CGROUPS
1167void sock_update_classid(struct sock *sk)
1168{
1169	u32 classid;
1170
1171	rcu_read_lock();  /* doing current task, which cannot vanish. */
1172	classid = task_cls_classid(current);
1173	rcu_read_unlock();
1174	if (classid && classid != sk->sk_classid)
1175		sk->sk_classid = classid;
1176}
1177EXPORT_SYMBOL(sock_update_classid);
1178
1179void sock_update_netprioidx(struct sock *sk)
1180{
1181	if (in_interrupt())
1182		return;
1183
1184	sk->sk_cgrp_prioidx = task_netprioidx(current);
1185}
1186EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1187#endif
1188
1189/**
1190 *	sk_alloc - All socket objects are allocated here
1191 *	@net: the applicable net namespace
1192 *	@family: protocol family
1193 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1194 *	@prot: struct proto associated with this new sock instance
1195 */
1196struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1197		      struct proto *prot)
1198{
1199	struct sock *sk;
1200
1201	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1202	if (sk) {
1203		sk->sk_family = family;
1204		/*
1205		 * See comment in struct sock definition to understand
1206		 * why we need sk_prot_creator -acme
1207		 */
1208		sk->sk_prot = sk->sk_prot_creator = prot;
1209		sock_lock_init(sk);
1210		sock_net_set(sk, get_net(net));
1211		atomic_set(&sk->sk_wmem_alloc, 1);
1212
1213		sock_update_classid(sk);
1214		sock_update_netprioidx(sk);
1215	}
1216
1217	return sk;
1218}
1219EXPORT_SYMBOL(sk_alloc);
1220
1221static void __sk_free(struct sock *sk)
1222{
1223	struct sk_filter *filter;
1224
1225	if (sk->sk_destruct)
1226		sk->sk_destruct(sk);
1227
1228	filter = rcu_dereference_check(sk->sk_filter,
1229				       atomic_read(&sk->sk_wmem_alloc) == 0);
1230	if (filter) {
1231		sk_filter_uncharge(sk, filter);
1232		RCU_INIT_POINTER(sk->sk_filter, NULL);
1233	}
1234
1235	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1236
1237	if (atomic_read(&sk->sk_omem_alloc))
1238		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1239		       __func__, atomic_read(&sk->sk_omem_alloc));
1240
1241	if (sk->sk_peer_cred)
1242		put_cred(sk->sk_peer_cred);
1243	put_pid(sk->sk_peer_pid);
1244	put_net(sock_net(sk));
1245	sk_prot_free(sk->sk_prot_creator, sk);
1246}
1247
1248void sk_free(struct sock *sk)
1249{
1250	/*
1251	 * We subtract one from sk_wmem_alloc so we can tell whether
1252	 * some packets are still in a tx queue.
1253	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1254	 */
1255	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1256		__sk_free(sk);
1257}
1258EXPORT_SYMBOL(sk_free);
1259
1260/*
1261 * The last sock_put should drop the reference to sk->sk_net. It has already
1262 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1263 * is not an option.
1264 * Take a reference to the socket to remove it from the hash while still _alive_,
1265 * and then destroy it in the context of init_net.
1266 */
1267void sk_release_kernel(struct sock *sk)
1268{
1269	if (sk == NULL || sk->sk_socket == NULL)
1270		return;
1271
1272	sock_hold(sk);
1273	sock_release(sk->sk_socket);
1274	release_net(sock_net(sk));
1275	sock_net_set(sk, get_net(&init_net));
1276	sock_put(sk);
1277}
1278EXPORT_SYMBOL(sk_release_kernel);
1279
1280static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1281{
1282	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1283		sock_update_memcg(newsk);
1284}
1285
1286/**
1287 *	sk_clone_lock - clone a socket, and lock its clone
1288 *	@sk: the socket to clone
1289 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1290 *
1291 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1292 */
1293struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1294{
1295	struct sock *newsk;
1296
1297	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1298	if (newsk != NULL) {
1299		struct sk_filter *filter;
1300
1301		sock_copy(newsk, sk);
1302
1303		/* SANITY */
1304		get_net(sock_net(newsk));
1305		sk_node_init(&newsk->sk_node);
1306		sock_lock_init(newsk);
1307		bh_lock_sock(newsk);
1308		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1309		newsk->sk_backlog.len = 0;
1310
1311		atomic_set(&newsk->sk_rmem_alloc, 0);
1312		/*
1313		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1314		 */
1315		atomic_set(&newsk->sk_wmem_alloc, 1);
1316		atomic_set(&newsk->sk_omem_alloc, 0);
1317		skb_queue_head_init(&newsk->sk_receive_queue);
1318		skb_queue_head_init(&newsk->sk_write_queue);
1319#ifdef CONFIG_NET_DMA
1320		skb_queue_head_init(&newsk->sk_async_wait_queue);
1321#endif
1322
1323		spin_lock_init(&newsk->sk_dst_lock);
1324		rwlock_init(&newsk->sk_callback_lock);
1325		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1326				af_callback_keys + newsk->sk_family,
1327				af_family_clock_key_strings[newsk->sk_family]);
1328
1329		newsk->sk_dst_cache	= NULL;
1330		newsk->sk_wmem_queued	= 0;
1331		newsk->sk_forward_alloc = 0;
1332		newsk->sk_send_head	= NULL;
1333		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1334
1335		sock_reset_flag(newsk, SOCK_DONE);
1336		skb_queue_head_init(&newsk->sk_error_queue);
1337
1338		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1339		if (filter != NULL)
1340			sk_filter_charge(newsk, filter);
1341
1342		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1343			/* It is still a raw copy of the parent, so invalidate
1344			 * the destructor and do a plain sk_free() */
1345			newsk->sk_destruct = NULL;
1346			bh_unlock_sock(newsk);
1347			sk_free(newsk);
1348			newsk = NULL;
1349			goto out;
1350		}
1351
1352		newsk->sk_err	   = 0;
1353		newsk->sk_priority = 0;
1354		/*
1355		 * Before updating sk_refcnt, we must commit prior changes to memory
1356		 * (Documentation/RCU/rculist_nulls.txt for details)
1357		 */
1358		smp_wmb();
1359		atomic_set(&newsk->sk_refcnt, 2);
1360
1361		/*
1362		 * Increment the counter in the same struct proto as the master
1363		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1364		 * is the same as sk->sk_prot->socks, as this field was copied
1365		 * with memcpy).
1366		 *
1367		 * This _changes_ the previous behaviour, where
1368		 * tcp_create_openreq_child always was incrementing the
1369		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1370		 * to be taken into account in all callers. -acme
1371		 */
1372		sk_refcnt_debug_inc(newsk);
1373		sk_set_socket(newsk, NULL);
1374		newsk->sk_wq = NULL;
1375
1376		sk_update_clone(sk, newsk);
1377
1378		if (newsk->sk_prot->sockets_allocated)
1379			sk_sockets_allocated_inc(newsk);
1380
1381		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1382			net_enable_timestamp();
1383	}
1384out:
1385	return newsk;
1386}
1387EXPORT_SYMBOL_GPL(sk_clone_lock);
1388
1389void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1390{
1391	__sk_dst_set(sk, dst);
1392	sk->sk_route_caps = dst->dev->features;
1393	if (sk->sk_route_caps & NETIF_F_GSO)
1394		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1395	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1396	if (sk_can_gso(sk)) {
1397		if (dst->header_len) {
1398			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1399		} else {
1400			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1401			sk->sk_gso_max_size = dst->dev->gso_max_size;
1402		}
1403	}
1404}
1405EXPORT_SYMBOL_GPL(sk_setup_caps);
1406
1407void __init sk_init(void)
1408{
1409	if (totalram_pages <= 4096) {
1410		sysctl_wmem_max = 32767;
1411		sysctl_rmem_max = 32767;
1412		sysctl_wmem_default = 32767;
1413		sysctl_rmem_default = 32767;
1414	} else if (totalram_pages >= 131072) {
1415		sysctl_wmem_max = 131071;
1416		sysctl_rmem_max = 131071;
1417	}
1418}
1419
1420/*
1421 *	Simple resource managers for sockets.
1422 */
1423
1424
1425/*
1426 * Write buffer destructor automatically called from kfree_skb.
1427 */
1428void sock_wfree(struct sk_buff *skb)
1429{
1430	struct sock *sk = skb->sk;
1431	unsigned int len = skb->truesize;
1432
1433	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1434		/*
1435		 * Keep a reference on sk_wmem_alloc; it will be released
1436		 * after the sk_write_space() call.
1437		 */
1438		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1439		sk->sk_write_space(sk);
1440		len = 1;
1441	}
1442	/*
1443	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1444	 * could not do because of in-flight packets
1445	 */
1446	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1447		__sk_free(sk);
1448}
1449EXPORT_SYMBOL(sock_wfree);
1450
1451/*
1452 * Read buffer destructor automatically called from kfree_skb.
1453 */
1454void sock_rfree(struct sk_buff *skb)
1455{
1456	struct sock *sk = skb->sk;
1457	unsigned int len = skb->truesize;
1458
1459	atomic_sub(len, &sk->sk_rmem_alloc);
1460	sk_mem_uncharge(sk, len);
1461}
1462EXPORT_SYMBOL(sock_rfree);
1463
1464
1465int sock_i_uid(struct sock *sk)
1466{
1467	int uid;
1468
1469	read_lock_bh(&sk->sk_callback_lock);
1470	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1471	read_unlock_bh(&sk->sk_callback_lock);
1472	return uid;
1473}
1474EXPORT_SYMBOL(sock_i_uid);
1475
1476unsigned long sock_i_ino(struct sock *sk)
1477{
1478	unsigned long ino;
1479
1480	read_lock_bh(&sk->sk_callback_lock);
1481	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1482	read_unlock_bh(&sk->sk_callback_lock);
1483	return ino;
1484}
1485EXPORT_SYMBOL(sock_i_ino);
1486
1487/*
1488 * Allocate a skb from the socket's send buffer.
1489 */
1490struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1491			     gfp_t priority)
1492{
1493	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1494		struct sk_buff *skb = alloc_skb(size, priority);
1495		if (skb) {
1496			skb_set_owner_w(skb, sk);
1497			return skb;
1498		}
1499	}
1500	return NULL;
1501}
1502EXPORT_SYMBOL(sock_wmalloc);
1503
1504/*
1505 * Allocate a skb from the socket's receive buffer.
1506 */
1507struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1508			     gfp_t priority)
1509{
1510	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1511		struct sk_buff *skb = alloc_skb(size, priority);
1512		if (skb) {
1513			skb_set_owner_r(skb, sk);
1514			return skb;
1515		}
1516	}
1517	return NULL;
1518}
1519
1520/*
1521 * Allocate a memory block from the socket's option memory buffer.
1522 */
1523void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1524{
1525	if ((unsigned int)size <= sysctl_optmem_max &&
1526	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1527		void *mem;
1528		/* First do the add, to avoid the race if kmalloc
1529		 * might sleep.
1530		 */
1531		atomic_add(size, &sk->sk_omem_alloc);
1532		mem = kmalloc(size, priority);
1533		if (mem)
1534			return mem;
1535		atomic_sub(size, &sk->sk_omem_alloc);
1536	}
1537	return NULL;
1538}
1539EXPORT_SYMBOL(sock_kmalloc);
1540
1541/*
1542 * Free an option memory block.
1543 */
1544void sock_kfree_s(struct sock *sk, void *mem, int size)
1545{
1546	kfree(mem);
1547	atomic_sub(size, &sk->sk_omem_alloc);
1548}
1549EXPORT_SYMBOL(sock_kfree_s);
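
/* Usage sketch (illustrative, for a hypothetical protocol option handler):
 * sock_kfree_s() must be passed the same size that was charged by
 * sock_kmalloc() so that sk_omem_alloc stays balanced, e.g.
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */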
1550
1551/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1552   I think these locks should be removed for datagram sockets.
1553 */
1554static long sock_wait_for_wmem(struct sock *sk, long timeo)
1555{
1556	DEFINE_WAIT(wait);
1557
1558	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1559	for (;;) {
1560		if (!timeo)
1561			break;
1562		if (signal_pending(current))
1563			break;
1564		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1565		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1566		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1567			break;
1568		if (sk->sk_shutdown & SEND_SHUTDOWN)
1569			break;
1570		if (sk->sk_err)
1571			break;
1572		timeo = schedule_timeout(timeo);
1573	}
1574	finish_wait(sk_sleep(sk), &wait);
1575	return timeo;
1576}
1577
1578
1579/*
1580 *	Generic send/receive buffer handlers
1581 */
1582
1583struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1584				     unsigned long data_len, int noblock,
1585				     int *errcode)
1586{
1587	struct sk_buff *skb;
1588	gfp_t gfp_mask;
1589	long timeo;
1590	int err;
1591
1592	gfp_mask = sk->sk_allocation;
1593	if (gfp_mask & __GFP_WAIT)
1594		gfp_mask |= __GFP_REPEAT;
1595
1596	timeo = sock_sndtimeo(sk, noblock);
1597	while (1) {
1598		err = sock_error(sk);
1599		if (err != 0)
1600			goto failure;
1601
1602		err = -EPIPE;
1603		if (sk->sk_shutdown & SEND_SHUTDOWN)
1604			goto failure;
1605
1606		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1607			skb = alloc_skb(header_len, gfp_mask);
1608			if (skb) {
1609				int npages;
1610				int i;
1611
1612				/* No pages, we're done... */
1613				if (!data_len)
1614					break;
1615
1616				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1617				skb->truesize += data_len;
1618				skb_shinfo(skb)->nr_frags = npages;
1619				for (i = 0; i < npages; i++) {
1620					struct page *page;
1621
1622					page = alloc_pages(sk->sk_allocation, 0);
1623					if (!page) {
1624						err = -ENOBUFS;
1625						skb_shinfo(skb)->nr_frags = i;
1626						kfree_skb(skb);
1627						goto failure;
1628					}
1629
1630					__skb_fill_page_desc(skb, i,
1631							page, 0,
1632							(data_len >= PAGE_SIZE ?
1633							 PAGE_SIZE :
1634							 data_len));
1635					data_len -= PAGE_SIZE;
1636				}
1637
1638				/* Full success... */
1639				break;
1640			}
1641			err = -ENOBUFS;
1642			goto failure;
1643		}
1644		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1645		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1646		err = -EAGAIN;
1647		if (!timeo)
1648			goto failure;
1649		if (signal_pending(current))
1650			goto interrupted;
1651		timeo = sock_wait_for_wmem(sk, timeo);
1652	}
1653
1654	skb_set_owner_w(skb, sk);
1655	return skb;
1656
1657interrupted:
1658	err = sock_intr_errno(timeo);
1659failure:
1660	*errcode = err;
1661	return NULL;
1662}
1663EXPORT_SYMBOL(sock_alloc_send_pskb);
1664
1665struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1666				    int noblock, int *errcode)
1667{
1668	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1669}
1670EXPORT_SYMBOL(sock_alloc_send_skb);
1671
1672static void __lock_sock(struct sock *sk)
1673	__releases(&sk->sk_lock.slock)
1674	__acquires(&sk->sk_lock.slock)
1675{
1676	DEFINE_WAIT(wait);
1677
1678	for (;;) {
1679		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1680					TASK_UNINTERRUPTIBLE);
1681		spin_unlock_bh(&sk->sk_lock.slock);
1682		schedule();
1683		spin_lock_bh(&sk->sk_lock.slock);
1684		if (!sock_owned_by_user(sk))
1685			break;
1686	}
1687	finish_wait(&sk->sk_lock.wq, &wait);
1688}
1689
1690static void __release_sock(struct sock *sk)
1691	__releases(&sk->sk_lock.slock)
1692	__acquires(&sk->sk_lock.slock)
1693{
1694	struct sk_buff *skb = sk->sk_backlog.head;
1695
1696	do {
1697		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1698		bh_unlock_sock(sk);
1699
1700		do {
1701			struct sk_buff *next = skb->next;
1702
1703			WARN_ON_ONCE(skb_dst_is_noref(skb));
1704			skb->next = NULL;
1705			sk_backlog_rcv(sk, skb);
1706
1707			/*
1708			 * We are in process context here with softirqs
1709			 * disabled, use cond_resched_softirq() to preempt.
1710			 * This is safe to do because we've taken the backlog
1711			 * queue private:
1712			 */
1713			cond_resched_softirq();
1714
1715			skb = next;
1716		} while (skb != NULL);
1717
1718		bh_lock_sock(sk);
1719	} while ((skb = sk->sk_backlog.head) != NULL);
1720
1721	/*
1722	 * Doing the zeroing here guarantees we cannot loop forever
1723	 * while a wild producer attempts to flood us.
1724	 */
1725	sk->sk_backlog.len = 0;
1726}
1727
1728/**
1729 * sk_wait_data - wait for data to arrive at sk_receive_queue
1730 * @sk:    sock to wait on
1731 * @timeo: for how long
1732 *
1733 * Now socket state including sk->sk_err is changed only under lock,
1734 * hence we may omit checks after joining the wait queue.
1735 * We check the receive queue before schedule() only as an optimization;
1736 * it is very likely that release_sock() added new data.
1737 */
1738int sk_wait_data(struct sock *sk, long *timeo)
1739{
1740	int rc;
1741	DEFINE_WAIT(wait);
1742
1743	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1744	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1745	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1746	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1747	finish_wait(sk_sleep(sk), &wait);
1748	return rc;
1749}
1750EXPORT_SYMBOL(sk_wait_data);
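
/* Usage sketch (illustrative, not taken from a specific protocol): a
 * recvmsg-style caller typically holds the socket lock and loops until
 * data arrives or the timeout expires, e.g.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * sk_wait_event() releases the socket lock while sleeping and re-acquires
 * it before sk_wait_data() returns.
 */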
1751
1752/**
1753 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1754 *	@sk: socket
1755 *	@size: memory size to allocate
1756 *	@kind: allocation type
1757 *
1758 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1759 *	rmem allocation. This function assumes that protocols which have
1760 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1761 */
1762int __sk_mem_schedule(struct sock *sk, int size, int kind)
1763{
1764	struct proto *prot = sk->sk_prot;
1765	int amt = sk_mem_pages(size);
1766	long allocated;
1767	int parent_status = UNDER_LIMIT;
1768
1769	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1770
1771	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1772
1773	/* Under limit. */
1774	if (parent_status == UNDER_LIMIT &&
1775			allocated <= sk_prot_mem_limits(sk, 0)) {
1776		sk_leave_memory_pressure(sk);
1777		return 1;
1778	}
1779
1780	/* Under pressure. (we or our parents) */
1781	if ((parent_status > SOFT_LIMIT) ||
1782			allocated > sk_prot_mem_limits(sk, 1))
1783		sk_enter_memory_pressure(sk);
1784
1785	/* Over hard limit (we or our parents) */
1786	if ((parent_status == OVER_LIMIT) ||
1787			(allocated > sk_prot_mem_limits(sk, 2)))
1788		goto suppress_allocation;
1789
1790	/* guarantee minimum buffer size under pressure */
1791	if (kind == SK_MEM_RECV) {
1792		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1793			return 1;
1794
1795	} else { /* SK_MEM_SEND */
1796		if (sk->sk_type == SOCK_STREAM) {
1797			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1798				return 1;
1799		} else if (atomic_read(&sk->sk_wmem_alloc) <
1800			   prot->sysctl_wmem[0])
1801				return 1;
1802	}
1803
1804	if (sk_has_memory_pressure(sk)) {
1805		int alloc;
1806
1807		if (!sk_under_memory_pressure(sk))
1808			return 1;
1809		alloc = sk_sockets_allocated_read_positive(sk);
1810		if (sk_prot_mem_limits(sk, 2) > alloc *
1811		    sk_mem_pages(sk->sk_wmem_queued +
1812				 atomic_read(&sk->sk_rmem_alloc) +
1813				 sk->sk_forward_alloc))
1814			return 1;
1815	}
1816
1817suppress_allocation:
1818
1819	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1820		sk_stream_moderate_sndbuf(sk);
1821
1822		/* Fail only if socket is _under_ its sndbuf.
1823		 * In this case we cannot block, so that we have to fail.
1824		 */
1825		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1826			return 1;
1827	}
1828
1829	trace_sock_exceed_buf_limit(sk, prot, allocated);
1830
1831	/* Alas. Undo changes. */
1832	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1833
1834	sk_memory_allocated_sub(sk, amt);
1835
1836	return 0;
1837}
1838EXPORT_SYMBOL(__sk_mem_schedule);
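
/* Worked example (illustrative; assumes 4 KiB pages, so SK_MEM_QUANTUM is
 * 4096): if sk_forward_alloc cannot already cover a 3000-byte receive
 * charge, __sk_mem_schedule(sk, 3000, SK_MEM_RECV) is called;
 * sk_mem_pages(3000) is 1, so sk_forward_alloc grows by 4096 bytes and
 * memory_allocated by one quantum, and the limit checks above are applied
 * to that new total.
 */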
1839
1840/**
1841 *	__sk_mem_reclaim - reclaim memory_allocated
1842 *	@sk: socket
1843 */
1844void __sk_mem_reclaim(struct sock *sk)
1845{
1846	sk_memory_allocated_sub(sk,
1847				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1848	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1849
1850	if (sk_under_memory_pressure(sk) &&
1851	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1852		sk_leave_memory_pressure(sk);
1853}
1854EXPORT_SYMBOL(__sk_mem_reclaim);
1855
1856
1857/*
1858 * Set of default routines for initialising struct proto_ops when
1859 * the protocol does not support a particular function. In certain
1860 * cases where it makes no sense for a protocol to have a "do nothing"
1861 * function, some default processing is provided.
1862 */
1863
1864int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1865{
1866	return -EOPNOTSUPP;
1867}
1868EXPORT_SYMBOL(sock_no_bind);
1869
1870int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1871		    int len, int flags)
1872{
1873	return -EOPNOTSUPP;
1874}
1875EXPORT_SYMBOL(sock_no_connect);
1876
1877int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1878{
1879	return -EOPNOTSUPP;
1880}
1881EXPORT_SYMBOL(sock_no_socketpair);
1882
1883int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1884{
1885	return -EOPNOTSUPP;
1886}
1887EXPORT_SYMBOL(sock_no_accept);
1888
1889int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1890		    int *len, int peer)
1891{
1892	return -EOPNOTSUPP;
1893}
1894EXPORT_SYMBOL(sock_no_getname);
1895
1896unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1897{
1898	return 0;
1899}
1900EXPORT_SYMBOL(sock_no_poll);
1901
1902int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1903{
1904	return -EOPNOTSUPP;
1905}
1906EXPORT_SYMBOL(sock_no_ioctl);
1907
1908int sock_no_listen(struct socket *sock, int backlog)
1909{
1910	return -EOPNOTSUPP;
1911}
1912EXPORT_SYMBOL(sock_no_listen);
1913
1914int sock_no_shutdown(struct socket *sock, int how)
1915{
1916	return -EOPNOTSUPP;
1917}
1918EXPORT_SYMBOL(sock_no_shutdown);
1919
1920int sock_no_setsockopt(struct socket *sock, int level, int optname,
1921		    char __user *optval, unsigned int optlen)
1922{
1923	return -EOPNOTSUPP;
1924}
1925EXPORT_SYMBOL(sock_no_setsockopt);
1926
1927int sock_no_getsockopt(struct socket *sock, int level, int optname,
1928		    char __user *optval, int __user *optlen)
1929{
1930	return -EOPNOTSUPP;
1931}
1932EXPORT_SYMBOL(sock_no_getsockopt);
1933
1934int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1935		    size_t len)
1936{
1937	return -EOPNOTSUPP;
1938}
1939EXPORT_SYMBOL(sock_no_sendmsg);
1940
1941int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1942		    size_t len, int flags)
1943{
1944	return -EOPNOTSUPP;
1945}
1946EXPORT_SYMBOL(sock_no_recvmsg);
1947
1948int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1949{
1950	/* Mirror missing mmap method error code */
1951	return -ENODEV;
1952}
1953EXPORT_SYMBOL(sock_no_mmap);
1954
1955ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1956{
1957	ssize_t res;
1958	struct msghdr msg = {.msg_flags = flags};
1959	struct kvec iov;
1960	char *kaddr = kmap(page);
1961	iov.iov_base = kaddr + offset;
1962	iov.iov_len = size;
1963	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1964	kunmap(page);
1965	return res;
1966}
1967EXPORT_SYMBOL(sock_no_sendpage);
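
/*
 * Illustrative sketch, not used by this file: a hypothetical protocol with
 * no transport operations of its own could build most of its proto_ops from
 * the stubs above.  A real protocol must still supply at least .release and
 * its data-path handlers; "example_stub_ops" is a made-up name.
 */
static const struct proto_ops example_stub_ops __maybe_unused = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};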
1968
1969/*
1970 *	Default Socket Callbacks
1971 */
1972
1973static void sock_def_wakeup(struct sock *sk)
1974{
1975	struct socket_wq *wq;
1976
1977	rcu_read_lock();
1978	wq = rcu_dereference(sk->sk_wq);
1979	if (wq_has_sleeper(wq))
1980		wake_up_interruptible_all(&wq->wait);
1981	rcu_read_unlock();
1982}
1983
1984static void sock_def_error_report(struct sock *sk)
1985{
1986	struct socket_wq *wq;
1987
1988	rcu_read_lock();
1989	wq = rcu_dereference(sk->sk_wq);
1990	if (wq_has_sleeper(wq))
1991		wake_up_interruptible_poll(&wq->wait, POLLERR);
1992	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1993	rcu_read_unlock();
1994}
1995
1996static void sock_def_readable(struct sock *sk, int len)
1997{
1998	struct socket_wq *wq;
1999
2000	rcu_read_lock();
2001	wq = rcu_dereference(sk->sk_wq);
2002	if (wq_has_sleeper(wq))
2003		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2004						POLLRDNORM | POLLRDBAND);
2005	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2006	rcu_read_unlock();
2007}
2008
2009static void sock_def_write_space(struct sock *sk)
2010{
2011	struct socket_wq *wq;
2012
2013	rcu_read_lock();
2014
2015	/* Do not wake up a writer until he can make "significant"
2016	 * progress.  --DaveM
2017	 */
2018	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2019		wq = rcu_dereference(sk->sk_wq);
2020		if (wq_has_sleeper(wq))
2021			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2022						POLLWRNORM | POLLWRBAND);
2023
2024		/* Should agree with poll, otherwise some programs break */
2025		if (sock_writeable(sk))
2026			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2027	}
2028
2029	rcu_read_unlock();
2030}
2031
2032static void sock_def_destruct(struct sock *sk)
2033{
2034	kfree(sk->sk_protinfo);
2035}
2036
2037void sk_send_sigurg(struct sock *sk)
2038{
2039	if (sk->sk_socket && sk->sk_socket->file)
2040		if (send_sigurg(&sk->sk_socket->file->f_owner))
2041			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2042}
2043EXPORT_SYMBOL(sk_send_sigurg);
2044
2045void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2046		    unsigned long expires)
2047{
2048	if (!mod_timer(timer, expires))
2049		sock_hold(sk);
2050}
2051EXPORT_SYMBOL(sk_reset_timer);
2052
2053void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2054{
2055	if (timer_pending(timer) && del_timer(timer))
2056		__sock_put(sk);
2057}
2058EXPORT_SYMBOL(sk_stop_timer);
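
/*
 * Illustrative sketch, not used by this file: the usual pairing for the
 * timer helpers above.  A protocol arms its timer with sk_reset_timer(),
 * which takes a reference on the socket, and the handler drops that
 * reference with sock_put() when it is done.  The handler name is made up.
 */
static void __maybe_unused example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/* protocol-specific timer work would run here */
	}
	bh_unlock_sock(sk);
	sock_put(sk);	/* matches the sock_hold() taken by sk_reset_timer() */
}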
2059
2060void sock_init_data(struct socket *sock, struct sock *sk)
2061{
2062	skb_queue_head_init(&sk->sk_receive_queue);
2063	skb_queue_head_init(&sk->sk_write_queue);
2064	skb_queue_head_init(&sk->sk_error_queue);
2065#ifdef CONFIG_NET_DMA
2066	skb_queue_head_init(&sk->sk_async_wait_queue);
2067#endif
2068
2069	sk->sk_send_head	=	NULL;
2070
2071	init_timer(&sk->sk_timer);
2072
2073	sk->sk_allocation	=	GFP_KERNEL;
2074	sk->sk_rcvbuf		=	sysctl_rmem_default;
2075	sk->sk_sndbuf		=	sysctl_wmem_default;
2076	sk->sk_state		=	TCP_CLOSE;
2077	sk_set_socket(sk, sock);
2078
2079	sock_set_flag(sk, SOCK_ZAPPED);
2080
2081	if (sock) {
2082		sk->sk_type	=	sock->type;
2083		sk->sk_wq	=	sock->wq;
2084		sock->sk	=	sk;
2085	} else
2086		sk->sk_wq	=	NULL;
2087
2088	spin_lock_init(&sk->sk_dst_lock);
2089	rwlock_init(&sk->sk_callback_lock);
2090	lockdep_set_class_and_name(&sk->sk_callback_lock,
2091			af_callback_keys + sk->sk_family,
2092			af_family_clock_key_strings[sk->sk_family]);
2093
2094	sk->sk_state_change	=	sock_def_wakeup;
2095	sk->sk_data_ready	=	sock_def_readable;
2096	sk->sk_write_space	=	sock_def_write_space;
2097	sk->sk_error_report	=	sock_def_error_report;
2098	sk->sk_destruct		=	sock_def_destruct;
2099
2100	sk->sk_sndmsg_page	=	NULL;
2101	sk->sk_sndmsg_off	=	0;
2102	sk->sk_peek_off		=	-1;
2103
2104	sk->sk_peer_pid 	=	NULL;
2105	sk->sk_peer_cred	=	NULL;
2106	sk->sk_write_pending	=	0;
2107	sk->sk_rcvlowat		=	1;
2108	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2109	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2110
2111	sk->sk_stamp = ktime_set(-1L, 0);
2112
2113	/*
2114	 * Before updating sk_refcnt, we must commit prior changes to memory
2115	 * (see Documentation/RCU/rculist_nulls.txt for details)
2116	 */
2117	smp_wmb();
2118	atomic_set(&sk->sk_refcnt, 1);
2119	atomic_set(&sk->sk_drops, 0);
2120}
2121EXPORT_SYMBOL(sock_init_data);
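
/*
 * Illustrative sketch, not used by this file: sock_init_data() is typically
 * called from a protocol family's create routine, right after sk_alloc(),
 * before any protocol-specific fields or callbacks are overridden.  The
 * function name is made up for the example.
 */
static int __maybe_unused example_create(struct net *net, struct socket *sock,
					 struct proto *prot)
{
	struct sock *sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, prot);

	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	/* protocol-specific initialisation and callback overrides go here */
	return 0;
}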
2122
2123void lock_sock_nested(struct sock *sk, int subclass)
2124{
2125	might_sleep();
2126	spin_lock_bh(&sk->sk_lock.slock);
2127	if (sk->sk_lock.owned)
2128		__lock_sock(sk);
2129	sk->sk_lock.owned = 1;
2130	spin_unlock(&sk->sk_lock.slock);
2131	/*
2132	 * The sk_lock has mutex_lock() semantics here:
2133	 */
2134	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2135	local_bh_enable();
2136}
2137EXPORT_SYMBOL(lock_sock_nested);
2138
2139void release_sock(struct sock *sk)
2140{
2141	/*
2142	 * The sk_lock has mutex_unlock() semantics:
2143	 */
2144	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2145
2146	spin_lock_bh(&sk->sk_lock.slock);
2147	if (sk->sk_backlog.tail)
2148		__release_sock(sk);
2149	sk->sk_lock.owned = 0;
2150	if (waitqueue_active(&sk->sk_lock.wq))
2151		wake_up(&sk->sk_lock.wq);
2152	spin_unlock_bh(&sk->sk_lock.slock);
2153}
2154EXPORT_SYMBOL(release_sock);
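
/*
 * Illustrative sketch, not used by this file: the standard bracket around
 * code that may sleep while owning the socket.  lock_sock() is the plain
 * wrapper around lock_sock_nested() above, and release_sock() processes any
 * backlog that accumulated while the lock was held.  The name is made up.
 */
static void __maybe_unused example_locked_op(struct sock *sk)
{
	lock_sock(sk);
	/* socket state is safe from the softirq receive path here */
	release_sock(sk);
}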
2155
2156/**
2157 * lock_sock_fast - fast version of lock_sock
2158 * @sk: socket
2159 *
2160 * This version should be used for very small sections, where the process won't block.
2161 * return false if fast path is taken
2162 *   sk_lock.slock locked, owned = 0, BH disabled
2163 * return true if slow path is taken
2164 *   sk_lock.slock unlocked, owned = 1, BH enabled
2165 */
2166bool lock_sock_fast(struct sock *sk)
2167{
2168	might_sleep();
2169	spin_lock_bh(&sk->sk_lock.slock);
2170
2171	if (!sk->sk_lock.owned)
2172		/*
2173		 * Note: BH must stay disabled here; unlock_sock_fast() re-enables it.
2174		 */
2175		return false;
2176
2177	__lock_sock(sk);
2178	sk->sk_lock.owned = 1;
2179	spin_unlock(&sk->sk_lock.slock);
2180	/*
2181	 * The sk_lock has mutex_lock() semantics here:
2182	 */
2183	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2184	local_bh_enable();
2185	return true;
2186}
2187EXPORT_SYMBOL(lock_sock_fast);
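
/*
 * Illustrative sketch, not used by this file: the intended calling pattern
 * for lock_sock_fast().  The boolean result must be handed back to
 * unlock_sock_fast() so it can undo whichever path was taken.  The function
 * name is made up for the example.
 */
static void __maybe_unused example_fast_locked_op(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-blocking work on the socket goes here */

	unlock_sock_fast(sk, slow);
}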
2188
2189int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2190{
2191	struct timeval tv;
2192	if (!sock_flag(sk, SOCK_TIMESTAMP))
2193		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2194	tv = ktime_to_timeval(sk->sk_stamp);
2195	if (tv.tv_sec == -1)
2196		return -ENOENT;
2197	if (tv.tv_sec == 0) {
2198		sk->sk_stamp = ktime_get_real();
2199		tv = ktime_to_timeval(sk->sk_stamp);
2200	}
2201	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2202}
2203EXPORT_SYMBOL(sock_get_timestamp);
2204
2205int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2206{
2207	struct timespec ts;
2208	if (!sock_flag(sk, SOCK_TIMESTAMP))
2209		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2210	ts = ktime_to_timespec(sk->sk_stamp);
2211	if (ts.tv_sec == -1)
2212		return -ENOENT;
2213	if (ts.tv_sec == 0) {
2214		sk->sk_stamp = ktime_get_real();
2215		ts = ktime_to_timespec(sk->sk_stamp);
2216	}
2217	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2218}
2219EXPORT_SYMBOL(sock_get_timestampns);
2220
2221void sock_enable_timestamp(struct sock *sk, int flag)
2222{
2223	if (!sock_flag(sk, flag)) {
2224		unsigned long previous_flags = sk->sk_flags;
2225
2226		sock_set_flag(sk, flag);
2227		/*
2228		 * We just set one of the two flags that require network time
2229		 * stamping, but time stamping might already have been enabled
2230		 * because of the other flag.
2231		 */
2232		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2233			net_enable_timestamp();
2234	}
2235}
2236
2237/*
2238 *	Get a socket option on a socket.
2239 *
2240 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2241 *	asynchronous errors should be reported by getsockopt. We assume
2242 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2243 */
2244int sock_common_getsockopt(struct socket *sock, int level, int optname,
2245			   char __user *optval, int __user *optlen)
2246{
2247	struct sock *sk = sock->sk;
2248
2249	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2250}
2251EXPORT_SYMBOL(sock_common_getsockopt);
2252
2253#ifdef CONFIG_COMPAT
2254int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2255				  char __user *optval, int __user *optlen)
2256{
2257	struct sock *sk = sock->sk;
2258
2259	if (sk->sk_prot->compat_getsockopt != NULL)
2260		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2261						      optval, optlen);
2262	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2263}
2264EXPORT_SYMBOL(compat_sock_common_getsockopt);
2265#endif
2266
2267int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2268			struct msghdr *msg, size_t size, int flags)
2269{
2270	struct sock *sk = sock->sk;
2271	int addr_len = 0;
2272	int err;
2273
2274	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2275				   flags & ~MSG_DONTWAIT, &addr_len);
2276	if (err >= 0)
2277		msg->msg_namelen = addr_len;
2278	return err;
2279}
2280EXPORT_SYMBOL(sock_common_recvmsg);
2281
2282/*
2283 *	Set socket options on an inet socket.
2284 */
2285int sock_common_setsockopt(struct socket *sock, int level, int optname,
2286			   char __user *optval, unsigned int optlen)
2287{
2288	struct sock *sk = sock->sk;
2289
2290	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2291}
2292EXPORT_SYMBOL(sock_common_setsockopt);
2293
2294#ifdef CONFIG_COMPAT
2295int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2296				  char __user *optval, unsigned int optlen)
2297{
2298	struct sock *sk = sock->sk;
2299
2300	if (sk->sk_prot->compat_setsockopt != NULL)
2301		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2302						      optval, optlen);
2303	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2304}
2305EXPORT_SYMBOL(compat_sock_common_setsockopt);
2306#endif
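
/*
 * Illustrative sketch, not used by this file: protocols whose struct proto
 * implements setsockopt, getsockopt and recvmsg can point their proto_ops
 * at the generic wrappers above instead of writing their own glue.  Only
 * the relevant fields are shown and the table name is made up.
 */
static const struct proto_ops example_common_ops __maybe_unused = {
	.family			= PF_UNSPEC,
	.owner			= THIS_MODULE,
	.setsockopt		= sock_common_setsockopt,
	.getsockopt		= sock_common_getsockopt,
	.recvmsg		= sock_common_recvmsg,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_sock_common_setsockopt,
	.compat_getsockopt	= compat_sock_common_getsockopt,
#endif
};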
2307
2308void sk_common_release(struct sock *sk)
2309{
2310	if (sk->sk_prot->destroy)
2311		sk->sk_prot->destroy(sk);
2312
2313	/*
2314	 * Observation: when sk_common_release() is called, user processes no
2315	 * longer have access to the socket, but the network stack still does.
2316	 * Step one, detach it from networking:
2317	 *
2318	 * A. Remove from hash tables.
2319	 */
2320
2321	sk->sk_prot->unhash(sk);
2322
2323	/*
2324	 * At this point the socket cannot receive new packets, but it is possible
2325	 * that some packets are still in flight because another CPU is running the
2326	 * receiver and did its hash table lookup before we unhashed the socket.
2327	 * Those packets will reach the receive queue and be purged by the destructor.
2328	 *
2329	 * We may also still have packets pending on the receive queue and, probably,
2330	 * our own packets waiting in device queues. The socket destructor will drain
2331	 * the receive queue, but transmitted packets will delay socket destruction
2332	 * until the last reference is released.
2333	 */
2334
2335	sock_orphan(sk);
2336
2337	xfrm_sk_free_policy(sk);
2338
2339	sk_refcnt_debug_release(sk);
2340	sock_put(sk);
2341}
2342EXPORT_SYMBOL(sk_common_release);
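
/*
 * Illustrative sketch, not used by this file: sk_common_release() is
 * normally the tail of a protocol's close handler, after protocol-private
 * teardown.  The function name is made up for the example.
 */
static void __maybe_unused example_close(struct sock *sk, long timeout)
{
	/* protocol-private teardown (purging private queues etc.) goes first */
	sk_common_release(sk);
}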
2343
2344#ifdef CONFIG_PROC_FS
2345#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2346struct prot_inuse {
2347	int val[PROTO_INUSE_NR];
2348};
2349
2350static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2351
2352#ifdef CONFIG_NET_NS
2353void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2354{
2355	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2356}
2357EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2358
2359int sock_prot_inuse_get(struct net *net, struct proto *prot)
2360{
2361	int cpu, idx = prot->inuse_idx;
2362	int res = 0;
2363
2364	for_each_possible_cpu(cpu)
2365		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2366
2367	return res >= 0 ? res : 0;
2368}
2369EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2370
2371static int __net_init sock_inuse_init_net(struct net *net)
2372{
2373	net->core.inuse = alloc_percpu(struct prot_inuse);
2374	return net->core.inuse ? 0 : -ENOMEM;
2375}
2376
2377static void __net_exit sock_inuse_exit_net(struct net *net)
2378{
2379	free_percpu(net->core.inuse);
2380}
2381
2382static struct pernet_operations net_inuse_ops = {
2383	.init = sock_inuse_init_net,
2384	.exit = sock_inuse_exit_net,
2385};
2386
2387static __init int net_inuse_init(void)
2388{
2389	if (register_pernet_subsys(&net_inuse_ops))
2390		panic("Cannot initialize net inuse counters");
2391
2392	return 0;
2393}
2394
2395core_initcall(net_inuse_init);
2396#else
2397static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2398
2399void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2400{
2401	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2402}
2403EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2404
2405int sock_prot_inuse_get(struct net *net, struct proto *prot)
2406{
2407	int cpu, idx = prot->inuse_idx;
2408	int res = 0;
2409
2410	for_each_possible_cpu(cpu)
2411		res += per_cpu(prot_inuse, cpu).val[idx];
2412
2413	return res >= 0 ? res : 0;
2414}
2415EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2416#endif
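
/*
 * Illustrative sketch, not used by this file: protocols update the per-cpu
 * counter from their hash/unhash callbacks so sock_prot_inuse_get() and
 * /proc/net/protocols can report how many sockets are in use.  The function
 * name is made up for the example.
 */
static void __maybe_unused example_unhash(struct sock *sk)
{
	/* removal from the protocol's lookup tables would happen here */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}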
2417
2418static void assign_proto_idx(struct proto *prot)
2419{
2420	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2421
2422	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2423		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2424		return;
2425	}
2426
2427	set_bit(prot->inuse_idx, proto_inuse_idx);
2428}
2429
2430static void release_proto_idx(struct proto *prot)
2431{
2432	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2433		clear_bit(prot->inuse_idx, proto_inuse_idx);
2434}
2435#else
2436static inline void assign_proto_idx(struct proto *prot)
2437{
2438}
2439
2440static inline void release_proto_idx(struct proto *prot)
2441{
2442}
2443#endif
2444
2445int proto_register(struct proto *prot, int alloc_slab)
2446{
2447	if (alloc_slab) {
2448		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2449					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2450					NULL);
2451
2452		if (prot->slab == NULL) {
2453			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2454			       prot->name);
2455			goto out;
2456		}
2457
2458		if (prot->rsk_prot != NULL) {
2459			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2460			if (prot->rsk_prot->slab_name == NULL)
2461				goto out_free_sock_slab;
2462
2463			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2464								 prot->rsk_prot->obj_size, 0,
2465								 SLAB_HWCACHE_ALIGN, NULL);
2466
2467			if (prot->rsk_prot->slab == NULL) {
2468				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2469				       prot->name);
2470				goto out_free_request_sock_slab_name;
2471			}
2472		}
2473
2474		if (prot->twsk_prot != NULL) {
2475			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2476
2477			if (prot->twsk_prot->twsk_slab_name == NULL)
2478				goto out_free_request_sock_slab;
2479
2480			prot->twsk_prot->twsk_slab =
2481				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2482						  prot->twsk_prot->twsk_obj_size,
2483						  0,
2484						  SLAB_HWCACHE_ALIGN |
2485							prot->slab_flags,
2486						  NULL);
2487			if (prot->twsk_prot->twsk_slab == NULL)
2488				goto out_free_timewait_sock_slab_name;
2489		}
2490	}
2491
2492	mutex_lock(&proto_list_mutex);
2493	list_add(&prot->node, &proto_list);
2494	assign_proto_idx(prot);
2495	mutex_unlock(&proto_list_mutex);
2496	return 0;
2497
2498out_free_timewait_sock_slab_name:
2499	kfree(prot->twsk_prot->twsk_slab_name);
2500out_free_request_sock_slab:
2501	if (prot->rsk_prot && prot->rsk_prot->slab) {
2502		kmem_cache_destroy(prot->rsk_prot->slab);
2503		prot->rsk_prot->slab = NULL;
2504	}
2505out_free_request_sock_slab_name:
2506	if (prot->rsk_prot)
2507		kfree(prot->rsk_prot->slab_name);
2508out_free_sock_slab:
2509	kmem_cache_destroy(prot->slab);
2510	prot->slab = NULL;
2511out:
2512	return -ENOBUFS;
2513}
2514EXPORT_SYMBOL(proto_register);
2515
2516void proto_unregister(struct proto *prot)
2517{
2518	mutex_lock(&proto_list_mutex);
2519	release_proto_idx(prot);
2520	list_del(&prot->node);
2521	mutex_unlock(&proto_list_mutex);
2522
2523	if (prot->slab != NULL) {
2524		kmem_cache_destroy(prot->slab);
2525		prot->slab = NULL;
2526	}
2527
2528	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2529		kmem_cache_destroy(prot->rsk_prot->slab);
2530		kfree(prot->rsk_prot->slab_name);
2531		prot->rsk_prot->slab = NULL;
2532	}
2533
2534	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2535		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2536		kfree(prot->twsk_prot->twsk_slab_name);
2537		prot->twsk_prot->twsk_slab = NULL;
2538	}
2539}
2540EXPORT_SYMBOL(proto_unregister);
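
/*
 * Illustrative sketch, not used by this file: module init/exit of a
 * hypothetical protocol using the registration helpers above.  The proto
 * and both functions are made up; a real protocol would also register its
 * socket family with sock_register().
 */
static struct proto example_prot __maybe_unused = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __maybe_unused example_proto_init(void)
{
	/* the second argument asks proto_register() to create a slab cache */
	return proto_register(&example_prot, 1);
}

static void __maybe_unused example_proto_exit(void)
{
	proto_unregister(&example_prot);
}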
2541
2542#ifdef CONFIG_PROC_FS
2543static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2544	__acquires(proto_list_mutex)
2545{
2546	mutex_lock(&proto_list_mutex);
2547	return seq_list_start_head(&proto_list, *pos);
2548}
2549
2550static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2551{
2552	return seq_list_next(v, &proto_list, pos);
2553}
2554
2555static void proto_seq_stop(struct seq_file *seq, void *v)
2556	__releases(proto_list_mutex)
2557{
2558	mutex_unlock(&proto_list_mutex);
2559}
2560
2561static char proto_method_implemented(const void *method)
2562{
2563	return method == NULL ? 'n' : 'y';
2564}
2565static long sock_prot_memory_allocated(struct proto *proto)
2566{
2567	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2568}
2569
2570static char *sock_prot_memory_pressure(struct proto *proto)
2571{
2572	return proto->memory_pressure != NULL ?
2573	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2574}
2575
2576static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2577{
2578
2579	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2580			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2581		   proto->name,
2582		   proto->obj_size,
2583		   sock_prot_inuse_get(seq_file_net(seq), proto),
2584		   sock_prot_memory_allocated(proto),
2585		   sock_prot_memory_pressure(proto),
2586		   proto->max_header,
2587		   proto->slab == NULL ? "no" : "yes",
2588		   module_name(proto->owner),
2589		   proto_method_implemented(proto->close),
2590		   proto_method_implemented(proto->connect),
2591		   proto_method_implemented(proto->disconnect),
2592		   proto_method_implemented(proto->accept),
2593		   proto_method_implemented(proto->ioctl),
2594		   proto_method_implemented(proto->init),
2595		   proto_method_implemented(proto->destroy),
2596		   proto_method_implemented(proto->shutdown),
2597		   proto_method_implemented(proto->setsockopt),
2598		   proto_method_implemented(proto->getsockopt),
2599		   proto_method_implemented(proto->sendmsg),
2600		   proto_method_implemented(proto->recvmsg),
2601		   proto_method_implemented(proto->sendpage),
2602		   proto_method_implemented(proto->bind),
2603		   proto_method_implemented(proto->backlog_rcv),
2604		   proto_method_implemented(proto->hash),
2605		   proto_method_implemented(proto->unhash),
2606		   proto_method_implemented(proto->get_port),
2607		   proto_method_implemented(proto->enter_memory_pressure));
2608}
2609
2610static int proto_seq_show(struct seq_file *seq, void *v)
2611{
2612	if (v == &proto_list)
2613		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2614			   "protocol",
2615			   "size",
2616			   "sockets",
2617			   "memory",
2618			   "press",
2619			   "maxhdr",
2620			   "slab",
2621			   "module",
2622			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2623	else
2624		proto_seq_printf(seq, list_entry(v, struct proto, node));
2625	return 0;
2626}
2627
2628static const struct seq_operations proto_seq_ops = {
2629	.start  = proto_seq_start,
2630	.next   = proto_seq_next,
2631	.stop   = proto_seq_stop,
2632	.show   = proto_seq_show,
2633};
2634
2635static int proto_seq_open(struct inode *inode, struct file *file)
2636{
2637	return seq_open_net(inode, file, &proto_seq_ops,
2638			    sizeof(struct seq_net_private));
2639}
2640
2641static const struct file_operations proto_seq_fops = {
2642	.owner		= THIS_MODULE,
2643	.open		= proto_seq_open,
2644	.read		= seq_read,
2645	.llseek		= seq_lseek,
2646	.release	= seq_release_net,
2647};
2648
2649static __net_init int proto_init_net(struct net *net)
2650{
2651	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2652		return -ENOMEM;
2653
2654	return 0;
2655}
2656
2657static __net_exit void proto_exit_net(struct net *net)
2658{
2659	proc_net_remove(net, "protocols");
2660}
2661
2662
2663static __net_initdata struct pernet_operations proto_net_ops = {
2664	.init = proto_init_net,
2665	.exit = proto_exit_net,
2666};
2667
2668static int __init proto_init(void)
2669{
2670	return register_pernet_subsys(&proto_net_ops);
2671}
2672
2673subsys_initcall(proto_init);
2674
2675#endif /* PROC_FS */
2676