sock.c revision 7cb0240492caea2f6467f827313478f41877e6ef
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *		Steve Whitehouse:	Added default destructor to free
73 *					protocol private data.
74 *		Steve Whitehouse:	Added various other default routines
75 *					common to several socket families.
76 *		Chris Evans	:	Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115#include <linux/user_namespace.h>
116#include <linux/static_key.h>
117#include <linux/memcontrol.h>
118#include <linux/prefetch.h>
119
120#include <asm/uaccess.h>
121
122#include <linux/netdevice.h>
123#include <net/protocol.h>
124#include <linux/skbuff.h>
125#include <net/net_namespace.h>
126#include <net/request_sock.h>
127#include <net/sock.h>
128#include <linux/net_tstamp.h>
129#include <net/xfrm.h>
130#include <linux/ipsec.h>
131#include <net/cls_cgroup.h>
132#include <net/netprio_cgroup.h>
133
134#include <linux/filter.h>
135
136#include <trace/events/sock.h>
137
138#ifdef CONFIG_INET
139#include <net/tcp.h>
140#endif
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144
145#ifdef CONFIG_MEMCG_KMEM
146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147{
148	struct proto *proto;
149	int ret = 0;
150
151	mutex_lock(&proto_list_mutex);
152	list_for_each_entry(proto, &proto_list, node) {
153		if (proto->init_cgroup) {
154			ret = proto->init_cgroup(memcg, ss);
155			if (ret)
156				goto out;
157		}
158	}
159
160	mutex_unlock(&proto_list_mutex);
161	return ret;
162out:
163	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164		if (proto->destroy_cgroup)
165			proto->destroy_cgroup(memcg);
166	mutex_unlock(&proto_list_mutex);
167	return ret;
168}
169
170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171{
172	struct proto *proto;
173
174	mutex_lock(&proto_list_mutex);
175	list_for_each_entry_reverse(proto, &proto_list, node)
176		if (proto->destroy_cgroup)
177			proto->destroy_cgroup(memcg);
178	mutex_unlock(&proto_list_mutex);
179}
180#endif
181
182/*
183 * Each address family might have different locking rules, so we have
184 * one slock key per address family:
185 */
186static struct lock_class_key af_family_keys[AF_MAX];
187static struct lock_class_key af_family_slock_keys[AF_MAX];
188
189struct static_key memcg_socket_limit_enabled;
190EXPORT_SYMBOL(memcg_socket_limit_enabled);
191
192/*
193 * Make lock validator output more readable. (we pre-construct these
194 * strings at build time, so that runtime initialization of socket
195 * locks is fast):
196 */
197static const char *const af_family_key_strings[AF_MAX+1] = {
198  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
199  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
200  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
201  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
202  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
203  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
204  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
205  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
206  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
207  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
208  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
209  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
210  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
211  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
212};
213static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
215  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
216  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
217  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
218  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
219  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
220  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
221  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
222  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
223  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
224  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
225  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
226  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
227  "slock-AF_NFC"   , "slock-AF_MAX"
228};
229static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
231  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
232  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
233  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
234  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
235  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
236  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
237  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
238  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
239  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
240  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
241  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
242  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
243  "clock-AF_NFC"   , "clock-AF_MAX"
244};
245
246/*
247 * sk_callback_lock locking rules are per-address-family,
248 * so split the lock classes by using a per-AF key:
249 */
250static struct lock_class_key af_callback_keys[AF_MAX];
251
252/* Take into consideration the size of the struct sk_buff overhead in the
253 * determination of these values, since that is non-constant across
254 * platforms.  This makes socket queueing behavior and performance
255 * not depend upon such differences.
256 */
257#define _SK_MEM_PACKETS		256
258#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
259#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
261
262/* Run time adjustable parameters. */
263__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264EXPORT_SYMBOL(sysctl_wmem_max);
265__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266EXPORT_SYMBOL(sysctl_rmem_max);
267__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269
270/* Maximal space eaten by iovec or ancillary data plus some space */
271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272EXPORT_SYMBOL(sysctl_optmem_max);
273
274/**
275 * sk_set_memalloc - sets %SOCK_MEMALLOC
276 * @sk: socket to set it on
277 *
278 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
279 * It's the responsibility of the admin to adjust min_free_kbytes
280 * to meet the requirements
281 */
282void sk_set_memalloc(struct sock *sk)
283{
284	sock_set_flag(sk, SOCK_MEMALLOC);
285	sk->sk_allocation |= __GFP_MEMALLOC;
286}
287EXPORT_SYMBOL_GPL(sk_set_memalloc);
288
289void sk_clear_memalloc(struct sock *sk)
290{
291	sock_reset_flag(sk, SOCK_MEMALLOC);
292	sk->sk_allocation &= ~__GFP_MEMALLOC;
293}
294EXPORT_SYMBOL_GPL(sk_clear_memalloc);
295
296#if defined(CONFIG_CGROUPS)
297#if !defined(CONFIG_NET_CLS_CGROUP)
298int net_cls_subsys_id = -1;
299EXPORT_SYMBOL_GPL(net_cls_subsys_id);
300#endif
301#if !defined(CONFIG_NETPRIO_CGROUP)
302int net_prio_subsys_id = -1;
303EXPORT_SYMBOL_GPL(net_prio_subsys_id);
304#endif
305#endif
306
307static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
308{
309	struct timeval tv;
310
311	if (optlen < sizeof(tv))
312		return -EINVAL;
313	if (copy_from_user(&tv, optval, sizeof(tv)))
314		return -EFAULT;
315	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
316		return -EDOM;
317
318	if (tv.tv_sec < 0) {
319		static int warned __read_mostly;
320
321		*timeo_p = 0;
322		if (warned < 10 && net_ratelimit()) {
323			warned++;
324			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
325				__func__, current->comm, task_pid_nr(current));
326		}
327		return 0;
328	}
329	*timeo_p = MAX_SCHEDULE_TIMEOUT;
330	if (tv.tv_sec == 0 && tv.tv_usec == 0)
331		return 0;
332	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
333		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
334	return 0;
335}
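
/* Illustrative arithmetic for the timeval-to-jiffies conversion above
 * (a sketch, assuming HZ == 1000 so one jiffy is 1000 usec):
 *
 *   tv = { .tv_sec = 1, .tv_usec = 500000 }
 *   *timeo_p = 1 * HZ + (500000 + (1000000/HZ - 1)) / (1000000/HZ)
 *            = 1000 + (500000 + 999) / 1000
 *            = 1500 jiffies
 *
 * The sub-second part is rounded up, so a non-zero timeout never
 * truncates to fewer jiffies than requested.
 */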
336
337static void sock_warn_obsolete_bsdism(const char *name)
338{
339	static int warned;
340	static char warncomm[TASK_COMM_LEN];
341	if (strcmp(warncomm, current->comm) && warned < 5) {
342		strcpy(warncomm,  current->comm);
343		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
344			warncomm, name);
345		warned++;
346	}
347}
348
349#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
350
351static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
352{
353	if (sk->sk_flags & flags) {
354		sk->sk_flags &= ~flags;
355		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
356			net_disable_timestamp();
357	}
358}
359
360
361int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
362{
363	int err;
364	int skb_len;
365	unsigned long flags;
366	struct sk_buff_head *list = &sk->sk_receive_queue;
367
368	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
369		atomic_inc(&sk->sk_drops);
370		trace_sock_rcvqueue_full(sk, skb);
371		return -ENOMEM;
372	}
373
374	err = sk_filter(sk, skb);
375	if (err)
376		return err;
377
378	if (!sk_rmem_schedule(sk, skb->truesize)) {
379		atomic_inc(&sk->sk_drops);
380		return -ENOBUFS;
381	}
382
383	skb->dev = NULL;
384	skb_set_owner_r(skb, sk);
385
386	/* Cache the SKB length before we tack it onto the receive
387	 * queue.  Once it is added it no longer belongs to us and
388	 * may be freed by other threads of control pulling packets
389	 * from the queue.
390	 */
391	skb_len = skb->len;
392
393	/* We escape from the RCU-protected region; make sure we don't leak
394	 * a non-refcounted dst.
395	 */
396	skb_dst_force(skb);
397
398	spin_lock_irqsave(&list->lock, flags);
399	skb->dropcount = atomic_read(&sk->sk_drops);
400	__skb_queue_tail(list, skb);
401	spin_unlock_irqrestore(&list->lock, flags);
402
403	if (!sock_flag(sk, SOCK_DEAD))
404		sk->sk_data_ready(sk, skb_len);
405	return 0;
406}
407EXPORT_SYMBOL(sock_queue_rcv_skb);
408
409int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
410{
411	int rc = NET_RX_SUCCESS;
412
413	if (sk_filter(sk, skb))
414		goto discard_and_relse;
415
416	skb->dev = NULL;
417
418	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
419		atomic_inc(&sk->sk_drops);
420		goto discard_and_relse;
421	}
422	if (nested)
423		bh_lock_sock_nested(sk);
424	else
425		bh_lock_sock(sk);
426	if (!sock_owned_by_user(sk)) {
427		/*
428		 * trylock + unlock semantics:
429		 */
430		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
431
432		rc = sk_backlog_rcv(sk, skb);
433
434		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
435	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
436		bh_unlock_sock(sk);
437		atomic_inc(&sk->sk_drops);
438		goto discard_and_relse;
439	}
440
441	bh_unlock_sock(sk);
442out:
443	sock_put(sk);
444	return rc;
445discard_and_relse:
446	kfree_skb(skb);
447	goto out;
448}
449EXPORT_SYMBOL(sk_receive_skb);
450
451void sk_reset_txq(struct sock *sk)
452{
453	sk_tx_queue_clear(sk);
454}
455EXPORT_SYMBOL(sk_reset_txq);
456
457struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
458{
459	struct dst_entry *dst = __sk_dst_get(sk);
460
461	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
462		sk_tx_queue_clear(sk);
463		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
464		dst_release(dst);
465		return NULL;
466	}
467
468	return dst;
469}
470EXPORT_SYMBOL(__sk_dst_check);
471
472struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
473{
474	struct dst_entry *dst = sk_dst_get(sk);
475
476	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
477		sk_dst_reset(sk);
478		dst_release(dst);
479		return NULL;
480	}
481
482	return dst;
483}
484EXPORT_SYMBOL(sk_dst_check);
485
486static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
487{
488	int ret = -ENOPROTOOPT;
489#ifdef CONFIG_NETDEVICES
490	struct net *net = sock_net(sk);
491	char devname[IFNAMSIZ];
492	int index;
493
494	/* Sorry... */
495	ret = -EPERM;
496	if (!capable(CAP_NET_RAW))
497		goto out;
498
499	ret = -EINVAL;
500	if (optlen < 0)
501		goto out;
502
503	/* Bind this socket to a particular device like "eth0",
504	 * as specified in the passed interface name. If the
505	 * name is "" or the option length is zero the socket
506	 * is not bound.
507	 */
508	if (optlen > IFNAMSIZ - 1)
509		optlen = IFNAMSIZ - 1;
510	memset(devname, 0, sizeof(devname));
511
512	ret = -EFAULT;
513	if (copy_from_user(devname, optval, optlen))
514		goto out;
515
516	index = 0;
517	if (devname[0] != '\0') {
518		struct net_device *dev;
519
520		rcu_read_lock();
521		dev = dev_get_by_name_rcu(net, devname);
522		if (dev)
523			index = dev->ifindex;
524		rcu_read_unlock();
525		ret = -ENODEV;
526		if (!dev)
527			goto out;
528	}
529
530	lock_sock(sk);
531	sk->sk_bound_dev_if = index;
532	sk_dst_reset(sk);
533	release_sock(sk);
534
535	ret = 0;
536
537out:
538#endif
539
540	return ret;
541}
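
/* Illustrative (userspace) use of the option handled above -- a minimal
 * sketch, not part of this file, assuming a datagram socket and an
 * existing interface named "eth0":
 *
 *   int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *   if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0",
 *                  strlen("eth0") + 1) < 0)
 *           perror("SO_BINDTODEVICE");   (needs CAP_NET_RAW)
 *
 * Passing an empty name (or a zero option length) removes the binding,
 * matching the devname[0] != '\0' check above.
 */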
542
543static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
544{
545	if (valbool)
546		sock_set_flag(sk, bit);
547	else
548		sock_reset_flag(sk, bit);
549}
550
551/*
552 *	This is meant for all protocols to use and covers goings on
553 *	at the socket level. Everything here is generic.
554 */
555
556int sock_setsockopt(struct socket *sock, int level, int optname,
557		    char __user *optval, unsigned int optlen)
558{
559	struct sock *sk = sock->sk;
560	int val;
561	int valbool;
562	struct linger ling;
563	int ret = 0;
564
565	/*
566	 *	Options without arguments
567	 */
568
569	if (optname == SO_BINDTODEVICE)
570		return sock_bindtodevice(sk, optval, optlen);
571
572	if (optlen < sizeof(int))
573		return -EINVAL;
574
575	if (get_user(val, (int __user *)optval))
576		return -EFAULT;
577
578	valbool = val ? 1 : 0;
579
580	lock_sock(sk);
581
582	switch (optname) {
583	case SO_DEBUG:
584		if (val && !capable(CAP_NET_ADMIN))
585			ret = -EACCES;
586		else
587			sock_valbool_flag(sk, SOCK_DBG, valbool);
588		break;
589	case SO_REUSEADDR:
590		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
591		break;
592	case SO_TYPE:
593	case SO_PROTOCOL:
594	case SO_DOMAIN:
595	case SO_ERROR:
596		ret = -ENOPROTOOPT;
597		break;
598	case SO_DONTROUTE:
599		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
600		break;
601	case SO_BROADCAST:
602		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
603		break;
604	case SO_SNDBUF:
605		/* Don't return an error on this; BSD doesn't, and if you think
606		 * about it, this is right. Otherwise apps have to
607		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
608		 * are treated in BSD as hints.
609		 */
610		val = min_t(u32, val, sysctl_wmem_max);
611set_sndbuf:
612		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
613		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
614		/* Wake up sending tasks if we upped the value. */
615		sk->sk_write_space(sk);
616		break;
617
618	case SO_SNDBUFFORCE:
619		if (!capable(CAP_NET_ADMIN)) {
620			ret = -EPERM;
621			break;
622		}
623		goto set_sndbuf;
624
625	case SO_RCVBUF:
626		/* Don't return an error on this; BSD doesn't, and if you think
627		 * about it, this is right. Otherwise apps have to
628		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
629		 * are treated in BSD as hints.
630		 */
631		val = min_t(u32, val, sysctl_rmem_max);
632set_rcvbuf:
633		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
634		/*
635		 * We double it on the way in to account for
636		 * "struct sk_buff" etc. overhead.   Applications
637		 * assume that the SO_RCVBUF setting they make will
638		 * allow that much actual data to be received on that
639		 * socket.
640		 *
641		 * Applications are unaware that "struct sk_buff" and
642		 * other overheads allocate from the receive buffer
643		 * during socket buffer allocation.
644		 *
645		 * And after considering the possible alternatives,
646		 * returning the value we actually used in getsockopt
647		 * is the most desirable behavior.
648		 */
649		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
650		break;
651
652	case SO_RCVBUFFORCE:
653		if (!capable(CAP_NET_ADMIN)) {
654			ret = -EPERM;
655			break;
656		}
657		goto set_rcvbuf;
658
659	case SO_KEEPALIVE:
660#ifdef CONFIG_INET
661		if (sk->sk_protocol == IPPROTO_TCP)
662			tcp_set_keepalive(sk, valbool);
663#endif
664		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
665		break;
666
667	case SO_OOBINLINE:
668		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
669		break;
670
671	case SO_NO_CHECK:
672		sk->sk_no_check = valbool;
673		break;
674
675	case SO_PRIORITY:
676		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
677			sk->sk_priority = val;
678		else
679			ret = -EPERM;
680		break;
681
682	case SO_LINGER:
683		if (optlen < sizeof(ling)) {
684			ret = -EINVAL;	/* 1003.1g */
685			break;
686		}
687		if (copy_from_user(&ling, optval, sizeof(ling))) {
688			ret = -EFAULT;
689			break;
690		}
691		if (!ling.l_onoff)
692			sock_reset_flag(sk, SOCK_LINGER);
693		else {
694#if (BITS_PER_LONG == 32)
695			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
696				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
697			else
698#endif
699				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
700			sock_set_flag(sk, SOCK_LINGER);
701		}
702		break;
703
704	case SO_BSDCOMPAT:
705		sock_warn_obsolete_bsdism("setsockopt");
706		break;
707
708	case SO_PASSCRED:
709		if (valbool)
710			set_bit(SOCK_PASSCRED, &sock->flags);
711		else
712			clear_bit(SOCK_PASSCRED, &sock->flags);
713		break;
714
715	case SO_TIMESTAMP:
716	case SO_TIMESTAMPNS:
717		if (valbool)  {
718			if (optname == SO_TIMESTAMP)
719				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
720			else
721				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
722			sock_set_flag(sk, SOCK_RCVTSTAMP);
723			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
724		} else {
725			sock_reset_flag(sk, SOCK_RCVTSTAMP);
726			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
727		}
728		break;
729
730	case SO_TIMESTAMPING:
731		if (val & ~SOF_TIMESTAMPING_MASK) {
732			ret = -EINVAL;
733			break;
734		}
735		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
736				  val & SOF_TIMESTAMPING_TX_HARDWARE);
737		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
738				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
739		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
740				  val & SOF_TIMESTAMPING_RX_HARDWARE);
741		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
742			sock_enable_timestamp(sk,
743					      SOCK_TIMESTAMPING_RX_SOFTWARE);
744		else
745			sock_disable_timestamp(sk,
746					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
747		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
748				  val & SOF_TIMESTAMPING_SOFTWARE);
749		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
750				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
751		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
752				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
753		break;
754
755	case SO_RCVLOWAT:
756		if (val < 0)
757			val = INT_MAX;
758		sk->sk_rcvlowat = val ? : 1;
759		break;
760
761	case SO_RCVTIMEO:
762		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
763		break;
764
765	case SO_SNDTIMEO:
766		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
767		break;
768
769	case SO_ATTACH_FILTER:
770		ret = -EINVAL;
771		if (optlen == sizeof(struct sock_fprog)) {
772			struct sock_fprog fprog;
773
774			ret = -EFAULT;
775			if (copy_from_user(&fprog, optval, sizeof(fprog)))
776				break;
777
778			ret = sk_attach_filter(&fprog, sk);
779		}
780		break;
781
782	case SO_DETACH_FILTER:
783		ret = sk_detach_filter(sk);
784		break;
785
786	case SO_PASSSEC:
787		if (valbool)
788			set_bit(SOCK_PASSSEC, &sock->flags);
789		else
790			clear_bit(SOCK_PASSSEC, &sock->flags);
791		break;
792	case SO_MARK:
793		if (!capable(CAP_NET_ADMIN))
794			ret = -EPERM;
795		else
796			sk->sk_mark = val;
797		break;
798
799		/* We implement SO_SNDLOWAT etc. to
800		 * not be settable (1003.1g 5.3). */
801	case SO_RXQ_OVFL:
802		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
803		break;
804
805	case SO_WIFI_STATUS:
806		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
807		break;
808
809	case SO_PEEK_OFF:
810		if (sock->ops->set_peek_off)
811			sock->ops->set_peek_off(sk, val);
812		else
813			ret = -EOPNOTSUPP;
814		break;
815
816	case SO_NOFCS:
817		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
818		break;
819
820	default:
821		ret = -ENOPROTOOPT;
822		break;
823	}
824	release_sock(sk);
825	return ret;
826}
827EXPORT_SYMBOL(sock_setsockopt);
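
/* Illustrative (userspace) view of the SO_RCVBUF doubling performed in
 * sock_setsockopt() -- a minimal sketch, not part of this file:
 *
 *   int req = 65536, eff = 0;
 *   socklen_t len = sizeof(eff);
 *
 *   setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *   getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * Here eff reads back as twice the requested value (after capping by
 * sysctl_rmem_max on the way in), because sk_rcvbuf accounts for
 * struct sk_buff overhead as well. The same doubling applies to
 * SO_SNDBUF/sk_sndbuf.
 */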
828
829
830void cred_to_ucred(struct pid *pid, const struct cred *cred,
831		   struct ucred *ucred)
832{
833	ucred->pid = pid_vnr(pid);
834	ucred->uid = ucred->gid = -1;
835	if (cred) {
836		struct user_namespace *current_ns = current_user_ns();
837
838		ucred->uid = from_kuid(current_ns, cred->euid);
839		ucred->gid = from_kgid(current_ns, cred->egid);
840	}
841}
842EXPORT_SYMBOL_GPL(cred_to_ucred);
843
844int sock_getsockopt(struct socket *sock, int level, int optname,
845		    char __user *optval, int __user *optlen)
846{
847	struct sock *sk = sock->sk;
848
849	union {
850		int val;
851		struct linger ling;
852		struct timeval tm;
853	} v;
854
855	int lv = sizeof(int);
856	int len;
857
858	if (get_user(len, optlen))
859		return -EFAULT;
860	if (len < 0)
861		return -EINVAL;
862
863	memset(&v, 0, sizeof(v));
864
865	switch (optname) {
866	case SO_DEBUG:
867		v.val = sock_flag(sk, SOCK_DBG);
868		break;
869
870	case SO_DONTROUTE:
871		v.val = sock_flag(sk, SOCK_LOCALROUTE);
872		break;
873
874	case SO_BROADCAST:
875		v.val = sock_flag(sk, SOCK_BROADCAST);
876		break;
877
878	case SO_SNDBUF:
879		v.val = sk->sk_sndbuf;
880		break;
881
882	case SO_RCVBUF:
883		v.val = sk->sk_rcvbuf;
884		break;
885
886	case SO_REUSEADDR:
887		v.val = sk->sk_reuse;
888		break;
889
890	case SO_KEEPALIVE:
891		v.val = sock_flag(sk, SOCK_KEEPOPEN);
892		break;
893
894	case SO_TYPE:
895		v.val = sk->sk_type;
896		break;
897
898	case SO_PROTOCOL:
899		v.val = sk->sk_protocol;
900		break;
901
902	case SO_DOMAIN:
903		v.val = sk->sk_family;
904		break;
905
906	case SO_ERROR:
907		v.val = -sock_error(sk);
908		if (v.val == 0)
909			v.val = xchg(&sk->sk_err_soft, 0);
910		break;
911
912	case SO_OOBINLINE:
913		v.val = sock_flag(sk, SOCK_URGINLINE);
914		break;
915
916	case SO_NO_CHECK:
917		v.val = sk->sk_no_check;
918		break;
919
920	case SO_PRIORITY:
921		v.val = sk->sk_priority;
922		break;
923
924	case SO_LINGER:
925		lv		= sizeof(v.ling);
926		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
927		v.ling.l_linger	= sk->sk_lingertime / HZ;
928		break;
929
930	case SO_BSDCOMPAT:
931		sock_warn_obsolete_bsdism("getsockopt");
932		break;
933
934	case SO_TIMESTAMP:
935		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
936				!sock_flag(sk, SOCK_RCVTSTAMPNS);
937		break;
938
939	case SO_TIMESTAMPNS:
940		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
941		break;
942
943	case SO_TIMESTAMPING:
944		v.val = 0;
945		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
946			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
947		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
948			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
949		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
950			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
951		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
952			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
953		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
954			v.val |= SOF_TIMESTAMPING_SOFTWARE;
955		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
956			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
957		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
958			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
959		break;
960
961	case SO_RCVTIMEO:
962		lv = sizeof(struct timeval);
963		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
964			v.tm.tv_sec = 0;
965			v.tm.tv_usec = 0;
966		} else {
967			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
968			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
969		}
970		break;
971
972	case SO_SNDTIMEO:
973		lv = sizeof(struct timeval);
974		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
975			v.tm.tv_sec = 0;
976			v.tm.tv_usec = 0;
977		} else {
978			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
979			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
980		}
981		break;
982
983	case SO_RCVLOWAT:
984		v.val = sk->sk_rcvlowat;
985		break;
986
987	case SO_SNDLOWAT:
988		v.val = 1;
989		break;
990
991	case SO_PASSCRED:
992		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
993		break;
994
995	case SO_PEERCRED:
996	{
997		struct ucred peercred;
998		if (len > sizeof(peercred))
999			len = sizeof(peercred);
1000		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1001		if (copy_to_user(optval, &peercred, len))
1002			return -EFAULT;
1003		goto lenout;
1004	}
1005
1006	case SO_PEERNAME:
1007	{
1008		char address[128];
1009
1010		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1011			return -ENOTCONN;
1012		if (lv < len)
1013			return -EINVAL;
1014		if (copy_to_user(optval, address, len))
1015			return -EFAULT;
1016		goto lenout;
1017	}
1018
1019	/* Dubious BSD thing... Probably nobody even uses it, but
1020	 * the UNIX standard wants it for whatever reason... -DaveM
1021	 */
1022	case SO_ACCEPTCONN:
1023		v.val = sk->sk_state == TCP_LISTEN;
1024		break;
1025
1026	case SO_PASSSEC:
1027		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1028		break;
1029
1030	case SO_PEERSEC:
1031		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1032
1033	case SO_MARK:
1034		v.val = sk->sk_mark;
1035		break;
1036
1037	case SO_RXQ_OVFL:
1038		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1039		break;
1040
1041	case SO_WIFI_STATUS:
1042		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1043		break;
1044
1045	case SO_PEEK_OFF:
1046		if (!sock->ops->set_peek_off)
1047			return -EOPNOTSUPP;
1048
1049		v.val = sk->sk_peek_off;
1050		break;
1051	case SO_NOFCS:
1052		v.val = sock_flag(sk, SOCK_NOFCS);
1053		break;
1054	default:
1055		return -ENOPROTOOPT;
1056	}
1057
1058	if (len > lv)
1059		len = lv;
1060	if (copy_to_user(optval, &v, len))
1061		return -EFAULT;
1062lenout:
1063	if (put_user(len, optlen))
1064		return -EFAULT;
1065	return 0;
1066}
1067
1068/*
1069 * Initialize an sk_lock.
1070 *
1071 * (We also register the sk_lock with the lock validator.)
1072 */
1073static inline void sock_lock_init(struct sock *sk)
1074{
1075	sock_lock_init_class_and_name(sk,
1076			af_family_slock_key_strings[sk->sk_family],
1077			af_family_slock_keys + sk->sk_family,
1078			af_family_key_strings[sk->sk_family],
1079			af_family_keys + sk->sk_family);
1080}
1081
1082/*
1083 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1084 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1085 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1086 */
1087static void sock_copy(struct sock *nsk, const struct sock *osk)
1088{
1089#ifdef CONFIG_SECURITY_NETWORK
1090	void *sptr = nsk->sk_security;
1091#endif
1092	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1093
1094	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1095	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1096
1097#ifdef CONFIG_SECURITY_NETWORK
1098	nsk->sk_security = sptr;
1099	security_sk_clone(osk, nsk);
1100#endif
1101}
1102
1103/*
1104 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1105 * nodes unmodified. Special care is taken when initializing the object to zero.
1106 */
1107static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1108{
1109	if (offsetof(struct sock, sk_node.next) != 0)
1110		memset(sk, 0, offsetof(struct sock, sk_node.next));
1111	memset(&sk->sk_node.pprev, 0,
1112	       size - offsetof(struct sock, sk_node.pprev));
1113}
1114
1115void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1116{
1117	unsigned long nulls1, nulls2;
1118
1119	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1120	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1121	if (nulls1 > nulls2)
1122		swap(nulls1, nulls2);
1123
1124	if (nulls1 != 0)
1125		memset((char *)sk, 0, nulls1);
1126	memset((char *)sk + nulls1 + sizeof(void *), 0,
1127	       nulls2 - nulls1 - sizeof(void *));
1128	memset((char *)sk + nulls2 + sizeof(void *), 0,
1129	       size - nulls2 - sizeof(void *));
1130}
1131EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1132
1133static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1134		int family)
1135{
1136	struct sock *sk;
1137	struct kmem_cache *slab;
1138
1139	slab = prot->slab;
1140	if (slab != NULL) {
1141		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1142		if (!sk)
1143			return sk;
1144		if (priority & __GFP_ZERO) {
1145			if (prot->clear_sk)
1146				prot->clear_sk(sk, prot->obj_size);
1147			else
1148				sk_prot_clear_nulls(sk, prot->obj_size);
1149		}
1150	} else
1151		sk = kmalloc(prot->obj_size, priority);
1152
1153	if (sk != NULL) {
1154		kmemcheck_annotate_bitfield(sk, flags);
1155
1156		if (security_sk_alloc(sk, family, priority))
1157			goto out_free;
1158
1159		if (!try_module_get(prot->owner))
1160			goto out_free_sec;
1161		sk_tx_queue_clear(sk);
1162	}
1163
1164	return sk;
1165
1166out_free_sec:
1167	security_sk_free(sk);
1168out_free:
1169	if (slab != NULL)
1170		kmem_cache_free(slab, sk);
1171	else
1172		kfree(sk);
1173	return NULL;
1174}
1175
1176static void sk_prot_free(struct proto *prot, struct sock *sk)
1177{
1178	struct kmem_cache *slab;
1179	struct module *owner;
1180
1181	owner = prot->owner;
1182	slab = prot->slab;
1183
1184	security_sk_free(sk);
1185	if (slab != NULL)
1186		kmem_cache_free(slab, sk);
1187	else
1188		kfree(sk);
1189	module_put(owner);
1190}
1191
1192#ifdef CONFIG_CGROUPS
1193void sock_update_classid(struct sock *sk)
1194{
1195	u32 classid;
1196
1197	rcu_read_lock();  /* doing current task, which cannot vanish. */
1198	classid = task_cls_classid(current);
1199	rcu_read_unlock();
1200	if (classid && classid != sk->sk_classid)
1201		sk->sk_classid = classid;
1202}
1203EXPORT_SYMBOL(sock_update_classid);
1204
1205void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1206{
1207	if (in_interrupt())
1208		return;
1209
1210	sk->sk_cgrp_prioidx = task_netprioidx(task);
1211}
1212EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1213#endif
1214
1215/**
1216 *	sk_alloc - All socket objects are allocated here
1217 *	@net: the applicable net namespace
1218 *	@family: protocol family
1219 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1220 *	@prot: struct proto associated with this new sock instance
1221 */
1222struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1223		      struct proto *prot)
1224{
1225	struct sock *sk;
1226
1227	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1228	if (sk) {
1229		sk->sk_family = family;
1230		/*
1231		 * See comment in struct sock definition to understand
1232		 * why we need sk_prot_creator -acme
1233		 */
1234		sk->sk_prot = sk->sk_prot_creator = prot;
1235		sock_lock_init(sk);
1236		sock_net_set(sk, get_net(net));
1237		atomic_set(&sk->sk_wmem_alloc, 1);
1238
1239		sock_update_classid(sk);
1240		sock_update_netprioidx(sk, current);
1241	}
1242
1243	return sk;
1244}
1245EXPORT_SYMBOL(sk_alloc);
1246
1247static void __sk_free(struct sock *sk)
1248{
1249	struct sk_filter *filter;
1250
1251	if (sk->sk_destruct)
1252		sk->sk_destruct(sk);
1253
1254	filter = rcu_dereference_check(sk->sk_filter,
1255				       atomic_read(&sk->sk_wmem_alloc) == 0);
1256	if (filter) {
1257		sk_filter_uncharge(sk, filter);
1258		RCU_INIT_POINTER(sk->sk_filter, NULL);
1259	}
1260
1261	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1262
1263	if (atomic_read(&sk->sk_omem_alloc))
1264		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1265			 __func__, atomic_read(&sk->sk_omem_alloc));
1266
1267	if (sk->sk_peer_cred)
1268		put_cred(sk->sk_peer_cred);
1269	put_pid(sk->sk_peer_pid);
1270	put_net(sock_net(sk));
1271	sk_prot_free(sk->sk_prot_creator, sk);
1272}
1273
1274void sk_free(struct sock *sk)
1275{
1276	/*
1277	 * We subtract one from sk_wmem_alloc so we can tell whether
1278	 * some packets are still in some tx queue.
1279	 * If the count is non-zero, sock_wfree() will call __sk_free(sk) later.
1280	 */
1281	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1282		__sk_free(sk);
1283}
1284EXPORT_SYMBOL(sk_free);
1285
1286/*
1287 * The last sock_put should drop the reference to sk->sk_net. It has already
1288 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1289 * is not an option.
1290 * Take a reference to the socket to remove it from the hash while still
1291 * _alive_, and after that destroy it in the context of init_net.
1292 */
1293void sk_release_kernel(struct sock *sk)
1294{
1295	if (sk == NULL || sk->sk_socket == NULL)
1296		return;
1297
1298	sock_hold(sk);
1299	sock_release(sk->sk_socket);
1300	release_net(sock_net(sk));
1301	sock_net_set(sk, get_net(&init_net));
1302	sock_put(sk);
1303}
1304EXPORT_SYMBOL(sk_release_kernel);
1305
1306static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1307{
1308	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1309		sock_update_memcg(newsk);
1310}
1311
1312/**
1313 *	sk_clone_lock - clone a socket, and lock its clone
1314 *	@sk: the socket to clone
1315 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1316 *
1317 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1318 */
1319struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1320{
1321	struct sock *newsk;
1322
1323	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1324	if (newsk != NULL) {
1325		struct sk_filter *filter;
1326
1327		sock_copy(newsk, sk);
1328
1329		/* SANITY */
1330		get_net(sock_net(newsk));
1331		sk_node_init(&newsk->sk_node);
1332		sock_lock_init(newsk);
1333		bh_lock_sock(newsk);
1334		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1335		newsk->sk_backlog.len = 0;
1336
1337		atomic_set(&newsk->sk_rmem_alloc, 0);
1338		/*
1339		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1340		 */
1341		atomic_set(&newsk->sk_wmem_alloc, 1);
1342		atomic_set(&newsk->sk_omem_alloc, 0);
1343		skb_queue_head_init(&newsk->sk_receive_queue);
1344		skb_queue_head_init(&newsk->sk_write_queue);
1345#ifdef CONFIG_NET_DMA
1346		skb_queue_head_init(&newsk->sk_async_wait_queue);
1347#endif
1348
1349		spin_lock_init(&newsk->sk_dst_lock);
1350		rwlock_init(&newsk->sk_callback_lock);
1351		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1352				af_callback_keys + newsk->sk_family,
1353				af_family_clock_key_strings[newsk->sk_family]);
1354
1355		newsk->sk_dst_cache	= NULL;
1356		newsk->sk_wmem_queued	= 0;
1357		newsk->sk_forward_alloc = 0;
1358		newsk->sk_send_head	= NULL;
1359		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1360
1361		sock_reset_flag(newsk, SOCK_DONE);
1362		skb_queue_head_init(&newsk->sk_error_queue);
1363
1364		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1365		if (filter != NULL)
1366			sk_filter_charge(newsk, filter);
1367
1368		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1369			/* It is still a raw copy of the parent, so invalidate
1370			 * the destructor and do a plain sk_free(). */
1371			newsk->sk_destruct = NULL;
1372			bh_unlock_sock(newsk);
1373			sk_free(newsk);
1374			newsk = NULL;
1375			goto out;
1376		}
1377
1378		newsk->sk_err	   = 0;
1379		newsk->sk_priority = 0;
1380		/*
1381		 * Before updating sk_refcnt, we must commit prior changes to memory
1382		 * (Documentation/RCU/rculist_nulls.txt for details)
1383		 */
1384		smp_wmb();
1385		atomic_set(&newsk->sk_refcnt, 2);
1386
1387		/*
1388		 * Increment the counter in the same struct proto as the master
1389		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1390		 * is the same as sk->sk_prot->socks, as this field was copied
1391		 * with memcpy).
1392		 *
1393		 * This _changes_ the previous behaviour, where
1394		 * tcp_create_openreq_child always incremented the
1395		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1396		 * to be taken into account in all callers. -acme
1397		 */
1398		sk_refcnt_debug_inc(newsk);
1399		sk_set_socket(newsk, NULL);
1400		newsk->sk_wq = NULL;
1401
1402		sk_update_clone(sk, newsk);
1403
1404		if (newsk->sk_prot->sockets_allocated)
1405			sk_sockets_allocated_inc(newsk);
1406
1407		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1408			net_enable_timestamp();
1409	}
1410out:
1411	return newsk;
1412}
1413EXPORT_SYMBOL_GPL(sk_clone_lock);
1414
1415void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1416{
1417	__sk_dst_set(sk, dst);
1418	sk->sk_route_caps = dst->dev->features;
1419	if (sk->sk_route_caps & NETIF_F_GSO)
1420		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1421	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1422	if (sk_can_gso(sk)) {
1423		if (dst->header_len) {
1424			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1425		} else {
1426			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1427			sk->sk_gso_max_size = dst->dev->gso_max_size;
1428		}
1429	}
1430}
1431EXPORT_SYMBOL_GPL(sk_setup_caps);
1432
1433void __init sk_init(void)
1434{
1435	if (totalram_pages <= 4096) {
1436		sysctl_wmem_max = 32767;
1437		sysctl_rmem_max = 32767;
1438		sysctl_wmem_default = 32767;
1439		sysctl_rmem_default = 32767;
1440	} else if (totalram_pages >= 131072) {
1441		sysctl_wmem_max = 131071;
1442		sysctl_rmem_max = 131071;
1443	}
1444}
1445
1446/*
1447 *	Simple resource managers for sockets.
1448 */
1449
1450
1451/*
1452 * Write buffer destructor automatically called from kfree_skb.
1453 */
1454void sock_wfree(struct sk_buff *skb)
1455{
1456	struct sock *sk = skb->sk;
1457	unsigned int len = skb->truesize;
1458
1459	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1460		/*
1461		 * Keep a reference on sk_wmem_alloc; it will be released
1462		 * after the sk_write_space() call.
1463		 */
1464		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1465		sk->sk_write_space(sk);
1466		len = 1;
1467	}
1468	/*
1469	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1470	 * could not do because of in-flight packets
1471	 */
1472	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1473		__sk_free(sk);
1474}
1475EXPORT_SYMBOL(sock_wfree);
1476
1477/*
1478 * Read buffer destructor automatically called from kfree_skb.
1479 */
1480void sock_rfree(struct sk_buff *skb)
1481{
1482	struct sock *sk = skb->sk;
1483	unsigned int len = skb->truesize;
1484
1485	atomic_sub(len, &sk->sk_rmem_alloc);
1486	sk_mem_uncharge(sk, len);
1487}
1488EXPORT_SYMBOL(sock_rfree);
1489
1490void sock_edemux(struct sk_buff *skb)
1491{
1492	sock_put(skb->sk);
1493}
1494EXPORT_SYMBOL(sock_edemux);
1495
1496int sock_i_uid(struct sock *sk)
1497{
1498	int uid;
1499
1500	read_lock_bh(&sk->sk_callback_lock);
1501	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1502	read_unlock_bh(&sk->sk_callback_lock);
1503	return uid;
1504}
1505EXPORT_SYMBOL(sock_i_uid);
1506
1507unsigned long sock_i_ino(struct sock *sk)
1508{
1509	unsigned long ino;
1510
1511	read_lock_bh(&sk->sk_callback_lock);
1512	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1513	read_unlock_bh(&sk->sk_callback_lock);
1514	return ino;
1515}
1516EXPORT_SYMBOL(sock_i_ino);
1517
1518/*
1519 * Allocate a skb from the socket's send buffer.
1520 */
1521struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1522			     gfp_t priority)
1523{
1524	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1525		struct sk_buff *skb = alloc_skb(size, priority);
1526		if (skb) {
1527			skb_set_owner_w(skb, sk);
1528			return skb;
1529		}
1530	}
1531	return NULL;
1532}
1533EXPORT_SYMBOL(sock_wmalloc);
1534
1535/*
1536 * Allocate a skb from the socket's receive buffer.
1537 */
1538struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1539			     gfp_t priority)
1540{
1541	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1542		struct sk_buff *skb = alloc_skb(size, priority);
1543		if (skb) {
1544			skb_set_owner_r(skb, sk);
1545			return skb;
1546		}
1547	}
1548	return NULL;
1549}
1550
1551/*
1552 * Allocate a memory block from the socket's option memory buffer.
1553 */
1554void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1555{
1556	if ((unsigned int)size <= sysctl_optmem_max &&
1557	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1558		void *mem;
1559		/* First do the add, to avoid the race if kmalloc
1560		 * might sleep.
1561		 */
1562		atomic_add(size, &sk->sk_omem_alloc);
1563		mem = kmalloc(size, priority);
1564		if (mem)
1565			return mem;
1566		atomic_sub(size, &sk->sk_omem_alloc);
1567	}
1568	return NULL;
1569}
1570EXPORT_SYMBOL(sock_kmalloc);
1571
1572/*
1573 * Free an option memory block.
1574 */
1575void sock_kfree_s(struct sock *sk, void *mem, int size)
1576{
1577	kfree(mem);
1578	atomic_sub(size, &sk->sk_omem_alloc);
1579}
1580EXPORT_SYMBOL(sock_kfree_s);
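
/* Illustrative pairing of the option-memory helpers above -- a sketch of
 * how a protocol might use them (struct foo_opt and the 128-byte size are
 * placeholders, not real users):
 *
 *   struct foo_opt *opt;
 *
 *   opt = sock_kmalloc(sk, 128, GFP_KERNEL);
 *   if (!opt)
 *           return -ENOBUFS;
 *   ...
 *   sock_kfree_s(sk, opt, 128);
 *
 * The same size must be passed to sock_kfree_s() so that the
 * sk_omem_alloc charge taken in sock_kmalloc() is fully returned.
 */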
1581
1582/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1583   I think these locks should be removed for datagram sockets.
1584 */
1585static long sock_wait_for_wmem(struct sock *sk, long timeo)
1586{
1587	DEFINE_WAIT(wait);
1588
1589	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1590	for (;;) {
1591		if (!timeo)
1592			break;
1593		if (signal_pending(current))
1594			break;
1595		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1596		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1597		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1598			break;
1599		if (sk->sk_shutdown & SEND_SHUTDOWN)
1600			break;
1601		if (sk->sk_err)
1602			break;
1603		timeo = schedule_timeout(timeo);
1604	}
1605	finish_wait(sk_sleep(sk), &wait);
1606	return timeo;
1607}
1608
1609
1610/*
1611 *	Generic send/receive buffer handlers
1612 */
1613
1614struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1615				     unsigned long data_len, int noblock,
1616				     int *errcode)
1617{
1618	struct sk_buff *skb;
1619	gfp_t gfp_mask;
1620	long timeo;
1621	int err;
1622	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1623
1624	err = -EMSGSIZE;
1625	if (npages > MAX_SKB_FRAGS)
1626		goto failure;
1627
1628	gfp_mask = sk->sk_allocation;
1629	if (gfp_mask & __GFP_WAIT)
1630		gfp_mask |= __GFP_REPEAT;
1631
1632	timeo = sock_sndtimeo(sk, noblock);
1633	while (1) {
1634		err = sock_error(sk);
1635		if (err != 0)
1636			goto failure;
1637
1638		err = -EPIPE;
1639		if (sk->sk_shutdown & SEND_SHUTDOWN)
1640			goto failure;
1641
1642		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1643			skb = alloc_skb(header_len, gfp_mask);
1644			if (skb) {
1645				int i;
1646
1647				/* No pages, we're done... */
1648				if (!data_len)
1649					break;
1650
1651				skb->truesize += data_len;
1652				skb_shinfo(skb)->nr_frags = npages;
1653				for (i = 0; i < npages; i++) {
1654					struct page *page;
1655
1656					page = alloc_pages(sk->sk_allocation, 0);
1657					if (!page) {
1658						err = -ENOBUFS;
1659						skb_shinfo(skb)->nr_frags = i;
1660						kfree_skb(skb);
1661						goto failure;
1662					}
1663
1664					__skb_fill_page_desc(skb, i,
1665							page, 0,
1666							(data_len >= PAGE_SIZE ?
1667							 PAGE_SIZE :
1668							 data_len));
1669					data_len -= PAGE_SIZE;
1670				}
1671
1672				/* Full success... */
1673				break;
1674			}
1675			err = -ENOBUFS;
1676			goto failure;
1677		}
1678		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1679		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1680		err = -EAGAIN;
1681		if (!timeo)
1682			goto failure;
1683		if (signal_pending(current))
1684			goto interrupted;
1685		timeo = sock_wait_for_wmem(sk, timeo);
1686	}
1687
1688	skb_set_owner_w(skb, sk);
1689	return skb;
1690
1691interrupted:
1692	err = sock_intr_errno(timeo);
1693failure:
1694	*errcode = err;
1695	return NULL;
1696}
1697EXPORT_SYMBOL(sock_alloc_send_pskb);
1698
1699struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1700				    int noblock, int *errcode)
1701{
1702	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1703}
1704EXPORT_SYMBOL(sock_alloc_send_skb);
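
/* Illustrative use of sock_alloc_send_skb() by a datagram protocol -- a
 * minimal sketch; "hlen" and "dlen" are placeholder sizes and the error
 * handling is abbreviated:
 *
 *   struct sk_buff *skb;
 *   int err;
 *
 *   skb = sock_alloc_send_skb(sk, hlen + dlen,
 *                             msg->msg_flags & MSG_DONTWAIT, &err);
 *   if (!skb)
 *           return err;
 *   skb_reserve(skb, hlen);
 *   ...copy dlen bytes of payload, build headers, transmit...
 *
 * The skb is charged to sk_wmem_alloc via skb_set_owner_w(), so the
 * send-buffer limit and sock_wait_for_wmem() apply automatically.
 */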
1705
1706static void __lock_sock(struct sock *sk)
1707	__releases(&sk->sk_lock.slock)
1708	__acquires(&sk->sk_lock.slock)
1709{
1710	DEFINE_WAIT(wait);
1711
1712	for (;;) {
1713		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1714					TASK_UNINTERRUPTIBLE);
1715		spin_unlock_bh(&sk->sk_lock.slock);
1716		schedule();
1717		spin_lock_bh(&sk->sk_lock.slock);
1718		if (!sock_owned_by_user(sk))
1719			break;
1720	}
1721	finish_wait(&sk->sk_lock.wq, &wait);
1722}
1723
1724static void __release_sock(struct sock *sk)
1725	__releases(&sk->sk_lock.slock)
1726	__acquires(&sk->sk_lock.slock)
1727{
1728	struct sk_buff *skb = sk->sk_backlog.head;
1729
1730	do {
1731		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1732		bh_unlock_sock(sk);
1733
1734		do {
1735			struct sk_buff *next = skb->next;
1736
1737			prefetch(next);
1738			WARN_ON_ONCE(skb_dst_is_noref(skb));
1739			skb->next = NULL;
1740			sk_backlog_rcv(sk, skb);
1741
1742			/*
1743			 * We are in process context here with softirqs
1744			 * disabled, so use cond_resched_softirq() to allow rescheduling.
1745			 * This is safe to do because we've taken the backlog
1746			 * queue private:
1747			 */
1748			cond_resched_softirq();
1749
1750			skb = next;
1751		} while (skb != NULL);
1752
1753		bh_lock_sock(sk);
1754	} while ((skb = sk->sk_backlog.head) != NULL);
1755
1756	/*
1757	 * Doing the zeroing here guarantees we cannot loop forever
1758	 * while a wild producer attempts to flood us.
1759	 */
1760	sk->sk_backlog.len = 0;
1761}
1762
1763/**
1764 * sk_wait_data - wait for data to arrive at sk_receive_queue
1765 * @sk:    sock to wait on
1766 * @timeo: for how long
1767 *
1768 * Now socket state including sk->sk_err is changed only under lock,
1769 * hence we may omit checks after joining the wait queue.
1770 * We check the receive queue before schedule() only as an optimization;
1771 * it is very likely that release_sock() added new data.
1772 */
1773int sk_wait_data(struct sock *sk, long *timeo)
1774{
1775	int rc;
1776	DEFINE_WAIT(wait);
1777
1778	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1779	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1780	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1781	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1782	finish_wait(sk_sleep(sk), &wait);
1783	return rc;
1784}
1785EXPORT_SYMBOL(sk_wait_data);
1786
1787/**
1788 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1789 *	@sk: socket
1790 *	@size: memory size to allocate
1791 *	@kind: allocation type
1792 *
1793 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1794 *	rmem allocation. This function assumes that protocols which have
1795 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1796 */
1797int __sk_mem_schedule(struct sock *sk, int size, int kind)
1798{
1799	struct proto *prot = sk->sk_prot;
1800	int amt = sk_mem_pages(size);
1801	long allocated;
1802	int parent_status = UNDER_LIMIT;
1803
1804	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1805
1806	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1807
1808	/* Under limit. */
1809	if (parent_status == UNDER_LIMIT &&
1810			allocated <= sk_prot_mem_limits(sk, 0)) {
1811		sk_leave_memory_pressure(sk);
1812		return 1;
1813	}
1814
1815	/* Under pressure. (we or our parents) */
1816	if ((parent_status > SOFT_LIMIT) ||
1817			allocated > sk_prot_mem_limits(sk, 1))
1818		sk_enter_memory_pressure(sk);
1819
1820	/* Over hard limit (we or our parents) */
1821	if ((parent_status == OVER_LIMIT) ||
1822			(allocated > sk_prot_mem_limits(sk, 2)))
1823		goto suppress_allocation;
1824
1825	/* guarantee minimum buffer size under pressure */
1826	if (kind == SK_MEM_RECV) {
1827		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1828			return 1;
1829
1830	} else { /* SK_MEM_SEND */
1831		if (sk->sk_type == SOCK_STREAM) {
1832			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1833				return 1;
1834		} else if (atomic_read(&sk->sk_wmem_alloc) <
1835			   prot->sysctl_wmem[0])
1836				return 1;
1837	}
1838
1839	if (sk_has_memory_pressure(sk)) {
1840		int alloc;
1841
1842		if (!sk_under_memory_pressure(sk))
1843			return 1;
1844		alloc = sk_sockets_allocated_read_positive(sk);
1845		if (sk_prot_mem_limits(sk, 2) > alloc *
1846		    sk_mem_pages(sk->sk_wmem_queued +
1847				 atomic_read(&sk->sk_rmem_alloc) +
1848				 sk->sk_forward_alloc))
1849			return 1;
1850	}
1851
1852suppress_allocation:
1853
1854	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1855		sk_stream_moderate_sndbuf(sk);
1856
1857		/* Fail only if socket is _under_ its sndbuf.
1858		 * In this case we cannot block, so we have to fail.
1859		 */
1860		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1861			return 1;
1862	}
1863
1864	trace_sock_exceed_buf_limit(sk, prot, allocated);
1865
1866	/* Alas. Undo changes. */
1867	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1868
1869	sk_memory_allocated_sub(sk, amt);
1870
1871	return 0;
1872}
1873EXPORT_SYMBOL(__sk_mem_schedule);
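
/* Worked example of the charging above -- a sketch assuming
 * SK_MEM_QUANTUM == PAGE_SIZE == 4096:
 *
 *   size = 6000 bytes
 *   amt  = sk_mem_pages(6000) = 2 quanta
 *   sk_forward_alloc += 2 * 4096 = 8192
 *   memory_allocated += 2
 *
 * The 2192 bytes not consumed by this allocation stay in
 * sk_forward_alloc and can satisfy later charges without touching the
 * protocol-wide counter again.
 */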
1874
1875/**
1876 *	__sk_mem_reclaim - reclaim memory_allocated
1877 *	@sk: socket
1878 */
1879void __sk_mem_reclaim(struct sock *sk)
1880{
1881	sk_memory_allocated_sub(sk,
1882				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1883	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1884
1885	if (sk_under_memory_pressure(sk) &&
1886	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1887		sk_leave_memory_pressure(sk);
1888}
1889EXPORT_SYMBOL(__sk_mem_reclaim);
1890
1891
1892/*
1893 * Set of default routines for initialising struct proto_ops when
1894 * the protocol does not support a particular function. In certain
1895 * cases where it makes no sense for a protocol to have a "do nothing"
1896 * function, some default processing is provided.
1897 */
1898
1899int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1900{
1901	return -EOPNOTSUPP;
1902}
1903EXPORT_SYMBOL(sock_no_bind);
1904
1905int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1906		    int len, int flags)
1907{
1908	return -EOPNOTSUPP;
1909}
1910EXPORT_SYMBOL(sock_no_connect);
1911
1912int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1913{
1914	return -EOPNOTSUPP;
1915}
1916EXPORT_SYMBOL(sock_no_socketpair);
1917
1918int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1919{
1920	return -EOPNOTSUPP;
1921}
1922EXPORT_SYMBOL(sock_no_accept);
1923
1924int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1925		    int *len, int peer)
1926{
1927	return -EOPNOTSUPP;
1928}
1929EXPORT_SYMBOL(sock_no_getname);
1930
1931unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1932{
1933	return 0;
1934}
1935EXPORT_SYMBOL(sock_no_poll);
1936
1937int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1938{
1939	return -EOPNOTSUPP;
1940}
1941EXPORT_SYMBOL(sock_no_ioctl);
1942
1943int sock_no_listen(struct socket *sock, int backlog)
1944{
1945	return -EOPNOTSUPP;
1946}
1947EXPORT_SYMBOL(sock_no_listen);
1948
1949int sock_no_shutdown(struct socket *sock, int how)
1950{
1951	return -EOPNOTSUPP;
1952}
1953EXPORT_SYMBOL(sock_no_shutdown);
1954
1955int sock_no_setsockopt(struct socket *sock, int level, int optname,
1956		    char __user *optval, unsigned int optlen)
1957{
1958	return -EOPNOTSUPP;
1959}
1960EXPORT_SYMBOL(sock_no_setsockopt);
1961
1962int sock_no_getsockopt(struct socket *sock, int level, int optname,
1963		    char __user *optval, int __user *optlen)
1964{
1965	return -EOPNOTSUPP;
1966}
1967EXPORT_SYMBOL(sock_no_getsockopt);
1968
1969int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1970		    size_t len)
1971{
1972	return -EOPNOTSUPP;
1973}
1974EXPORT_SYMBOL(sock_no_sendmsg);
1975
1976int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1977		    size_t len, int flags)
1978{
1979	return -EOPNOTSUPP;
1980}
1981EXPORT_SYMBOL(sock_no_recvmsg);
1982
1983int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1984{
1985	/* Mirror missing mmap method error code */
1986	return -ENODEV;
1987}
1988EXPORT_SYMBOL(sock_no_mmap);
1989
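/*
 * Generic sendpage fallback: map the page and push it through the socket's
 * sendmsg path via kernel_sendmsg(), so protocols without a zero-copy
 * ->sendpage implementation still work.
 */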
1990ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1991{
1992	ssize_t res;
1993	struct msghdr msg = {.msg_flags = flags};
1994	struct kvec iov;
1995	char *kaddr = kmap(page);
1996	iov.iov_base = kaddr + offset;
1997	iov.iov_len = size;
1998	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1999	kunmap(page);
2000	return res;
2001}
2002EXPORT_SYMBOL(sock_no_sendpage);
2003
2004/*
2005 *	Default Socket Callbacks
2006 */
2007
2008static void sock_def_wakeup(struct sock *sk)
2009{
2010	struct socket_wq *wq;
2011
2012	rcu_read_lock();
2013	wq = rcu_dereference(sk->sk_wq);
2014	if (wq_has_sleeper(wq))
2015		wake_up_interruptible_all(&wq->wait);
2016	rcu_read_unlock();
2017}
2018
2019static void sock_def_error_report(struct sock *sk)
2020{
2021	struct socket_wq *wq;
2022
2023	rcu_read_lock();
2024	wq = rcu_dereference(sk->sk_wq);
2025	if (wq_has_sleeper(wq))
2026		wake_up_interruptible_poll(&wq->wait, POLLERR);
2027	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2028	rcu_read_unlock();
2029}
2030
2031static void sock_def_readable(struct sock *sk, int len)
2032{
2033	struct socket_wq *wq;
2034
2035	rcu_read_lock();
2036	wq = rcu_dereference(sk->sk_wq);
2037	if (wq_has_sleeper(wq))
2038		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2039						POLLRDNORM | POLLRDBAND);
2040	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2041	rcu_read_unlock();
2042}
2043
2044static void sock_def_write_space(struct sock *sk)
2045{
2046	struct socket_wq *wq;
2047
2048	rcu_read_lock();
2049
2050	/* Do not wake up a writer until he can make "significant"
2051	 * progress.  --DaveM
2052	 */
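	/* i.e. wake writers only once at least half of sk_sndbuf is free. */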
2053	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2054		wq = rcu_dereference(sk->sk_wq);
2055		if (wq_has_sleeper(wq))
2056			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2057						POLLWRNORM | POLLWRBAND);
2058
2059		/* Should agree with poll, otherwise some programs break */
2060		if (sock_writeable(sk))
2061			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2062	}
2063
2064	rcu_read_unlock();
2065}
2066
2067static void sock_def_destruct(struct sock *sk)
2068{
2069	kfree(sk->sk_protinfo);
2070}
2071
2072void sk_send_sigurg(struct sock *sk)
2073{
2074	if (sk->sk_socket && sk->sk_socket->file)
2075		if (send_sigurg(&sk->sk_socket->file->f_owner))
2076			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2077}
2078EXPORT_SYMBOL(sk_send_sigurg);
2079
2080void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2081		    unsigned long expires)
2082{
2083	if (!mod_timer(timer, expires))
2084		sock_hold(sk);
2085}
2086EXPORT_SYMBOL(sk_reset_timer);
2087
2088void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2089{
2090	if (timer_pending(timer) && del_timer(timer))
2091		__sock_put(sk);
2092}
2093EXPORT_SYMBOL(sk_stop_timer);
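/*
 * Both helpers pair timer state with the socket refcount: sk_reset_timer()
 * takes a reference whenever the timer was not already pending, and
 * sk_stop_timer() drops it again if it manages to cancel the timer, so the
 * sock cannot be freed under a queued or running timer. A rough sketch of
 * the usual pattern (hypothetical names, for illustration only):
 *
 *	static void example_timer(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *
 *		bh_lock_sock(sk);
 *		... protocol work, possibly rearming via sk_reset_timer() ...
 *		bh_unlock_sock(sk);
 *		sock_put(sk);		(balances the hold taken on arming)
 *	}
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	(on teardown)
 */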
2094
2095void sock_init_data(struct socket *sock, struct sock *sk)
2096{
2097	skb_queue_head_init(&sk->sk_receive_queue);
2098	skb_queue_head_init(&sk->sk_write_queue);
2099	skb_queue_head_init(&sk->sk_error_queue);
2100#ifdef CONFIG_NET_DMA
2101	skb_queue_head_init(&sk->sk_async_wait_queue);
2102#endif
2103
2104	sk->sk_send_head	=	NULL;
2105
2106	init_timer(&sk->sk_timer);
2107
2108	sk->sk_allocation	=	GFP_KERNEL;
2109	sk->sk_rcvbuf		=	sysctl_rmem_default;
2110	sk->sk_sndbuf		=	sysctl_wmem_default;
2111	sk->sk_state		=	TCP_CLOSE;
2112	sk_set_socket(sk, sock);
2113
2114	sock_set_flag(sk, SOCK_ZAPPED);
2115
2116	if (sock) {
2117		sk->sk_type	=	sock->type;
2118		sk->sk_wq	=	sock->wq;
2119		sock->sk	=	sk;
2120	} else
2121		sk->sk_wq	=	NULL;
2122
2123	spin_lock_init(&sk->sk_dst_lock);
2124	rwlock_init(&sk->sk_callback_lock);
2125	lockdep_set_class_and_name(&sk->sk_callback_lock,
2126			af_callback_keys + sk->sk_family,
2127			af_family_clock_key_strings[sk->sk_family]);
2128
2129	sk->sk_state_change	=	sock_def_wakeup;
2130	sk->sk_data_ready	=	sock_def_readable;
2131	sk->sk_write_space	=	sock_def_write_space;
2132	sk->sk_error_report	=	sock_def_error_report;
2133	sk->sk_destruct		=	sock_def_destruct;
2134
2135	sk->sk_sndmsg_page	=	NULL;
2136	sk->sk_sndmsg_off	=	0;
2137	sk->sk_peek_off		=	-1;
2138
2139	sk->sk_peer_pid 	=	NULL;
2140	sk->sk_peer_cred	=	NULL;
2141	sk->sk_write_pending	=	0;
2142	sk->sk_rcvlowat		=	1;
2143	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2144	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2145
2146	sk->sk_stamp = ktime_set(-1L, 0);
2147
2148	/*
2149	 * Before updating sk_refcnt, we must commit prior changes to memory
2150	 * (Documentation/RCU/rculist_nulls.txt for details)
2151	 */
2152	smp_wmb();
2153	atomic_set(&sk->sk_refcnt, 1);
2154	atomic_set(&sk->sk_drops, 0);
2155}
2156EXPORT_SYMBOL(sock_init_data);
2157
2158void lock_sock_nested(struct sock *sk, int subclass)
2159{
2160	might_sleep();
2161	spin_lock_bh(&sk->sk_lock.slock);
2162	if (sk->sk_lock.owned)
2163		__lock_sock(sk);
2164	sk->sk_lock.owned = 1;
2165	spin_unlock(&sk->sk_lock.slock);
2166	/*
2167	 * The sk_lock has mutex_lock() semantics here:
2168	 */
2169	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2170	local_bh_enable();
2171}
2172EXPORT_SYMBOL(lock_sock_nested);
2173
2174void release_sock(struct sock *sk)
2175{
2176	/*
2177	 * The sk_lock has mutex_unlock() semantics:
2178	 */
2179	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2180
2181	spin_lock_bh(&sk->sk_lock.slock);
2182	if (sk->sk_backlog.tail)
2183		__release_sock(sk);
2184
2185	if (sk->sk_prot->release_cb)
2186		sk->sk_prot->release_cb(sk);
2187
2188	sk->sk_lock.owned = 0;
2189	if (waitqueue_active(&sk->sk_lock.wq))
2190		wake_up(&sk->sk_lock.wq);
2191	spin_unlock_bh(&sk->sk_lock.slock);
2192}
2193EXPORT_SYMBOL(release_sock);
2194
2195/**
2196 * lock_sock_fast - fast version of lock_sock
2197 * @sk: socket
2198 *
 * This version should be used for very small sections, where the process
 * won't block.
 * Returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 * Either way the return value must be handed back to unlock_sock_fast()
 * (see the usage sketch after this function).
2204 */
2205bool lock_sock_fast(struct sock *sk)
2206{
2207	might_sleep();
2208	spin_lock_bh(&sk->sk_lock.slock);
2209
2210	if (!sk->sk_lock.owned)
		/*
		 * Note: BH stays disabled here (taken by the
		 * spin_lock_bh() above), as the fast path promises.
		 */
2214		return false;
2215
2216	__lock_sock(sk);
2217	sk->sk_lock.owned = 1;
2218	spin_unlock(&sk->sk_lock.slock);
2219	/*
2220	 * The sk_lock has mutex_lock() semantics here:
2221	 */
2222	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2223	local_bh_enable();
2224	return true;
2225}
2226EXPORT_SYMBOL(lock_sock_fast);
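/*
 * Typical use, as a sketch; the returned value tells unlock_sock_fast()
 * whether to spin_unlock_bh() (fast path) or release_sock() (slow path):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */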
2227
2228int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2229{
2230	struct timeval tv;
2231	if (!sock_flag(sk, SOCK_TIMESTAMP))
2232		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2233	tv = ktime_to_timeval(sk->sk_stamp);
2234	if (tv.tv_sec == -1)
2235		return -ENOENT;
2236	if (tv.tv_sec == 0) {
2237		sk->sk_stamp = ktime_get_real();
2238		tv = ktime_to_timeval(sk->sk_stamp);
2239	}
2240	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2241}
2242EXPORT_SYMBOL(sock_get_timestamp);
2243
2244int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2245{
2246	struct timespec ts;
2247	if (!sock_flag(sk, SOCK_TIMESTAMP))
2248		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2249	ts = ktime_to_timespec(sk->sk_stamp);
2250	if (ts.tv_sec == -1)
2251		return -ENOENT;
2252	if (ts.tv_sec == 0) {
2253		sk->sk_stamp = ktime_get_real();
2254		ts = ktime_to_timespec(sk->sk_stamp);
2255	}
2256	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2257}
2258EXPORT_SYMBOL(sock_get_timestampns);
2259
2260void sock_enable_timestamp(struct sock *sk, int flag)
2261{
2262	if (!sock_flag(sk, flag)) {
2263		unsigned long previous_flags = sk->sk_flags;
2264
2265		sock_set_flag(sk, flag);
2266		/*
		 * We just set one of the two flags that require net
		 * time stamping, but time stamping might already have
		 * been enabled because of the other one.
2270		 */
2271		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2272			net_enable_timestamp();
2273	}
2274}
2275
2276/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2282 */
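/*
 * From user space that convention looks roughly like this (illustrative
 * snippet, not kernel code): once an asynchronous failure has been
 * signalled, e.g. by poll() reporting POLLERR, the pending error is
 * fetched and cleared with
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	(err now holds the saved errno value, e.g. ECONNREFUSED)
 */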
2283int sock_common_getsockopt(struct socket *sock, int level, int optname,
2284			   char __user *optval, int __user *optlen)
2285{
2286	struct sock *sk = sock->sk;
2287
2288	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2289}
2290EXPORT_SYMBOL(sock_common_getsockopt);
2291
2292#ifdef CONFIG_COMPAT
2293int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2294				  char __user *optval, int __user *optlen)
2295{
2296	struct sock *sk = sock->sk;
2297
2298	if (sk->sk_prot->compat_getsockopt != NULL)
2299		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2300						      optval, optlen);
2301	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2302}
2303EXPORT_SYMBOL(compat_sock_common_getsockopt);
2304#endif
2305
2306int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2307			struct msghdr *msg, size_t size, int flags)
2308{
2309	struct sock *sk = sock->sk;
2310	int addr_len = 0;
2311	int err;
2312
2313	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2314				   flags & ~MSG_DONTWAIT, &addr_len);
2315	if (err >= 0)
2316		msg->msg_namelen = addr_len;
2317	return err;
2318}
2319EXPORT_SYMBOL(sock_common_recvmsg);
2320
2321/*
2322 *	Set socket options on an inet socket.
2323 */
2324int sock_common_setsockopt(struct socket *sock, int level, int optname,
2325			   char __user *optval, unsigned int optlen)
2326{
2327	struct sock *sk = sock->sk;
2328
2329	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2330}
2331EXPORT_SYMBOL(sock_common_setsockopt);
2332
2333#ifdef CONFIG_COMPAT
2334int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2335				  char __user *optval, unsigned int optlen)
2336{
2337	struct sock *sk = sock->sk;
2338
2339	if (sk->sk_prot->compat_setsockopt != NULL)
2340		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2341						      optval, optlen);
2342	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2343}
2344EXPORT_SYMBOL(compat_sock_common_setsockopt);
2345#endif
2346
2347void sk_common_release(struct sock *sk)
2348{
2349	if (sk->sk_prot->destroy)
2350		sk->sk_prot->destroy(sk);
2351
2352	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket any more, but the network stack still does.
2355	 * Step one, detach it from networking:
2356	 *
2357	 * A. Remove from hash tables.
2358	 */
2359
2360	sk->sk_prot->unhash(sk);
2361
2362	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are still in flight because another CPU
	 * may have done the hash table lookup before we unhashed the socket.
	 * Such packets will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * We also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
2372	 */
2373
2374	sock_orphan(sk);
2375
2376	xfrm_sk_free_policy(sk);
2377
2378	sk_refcnt_debug_release(sk);
2379	sock_put(sk);
2380}
2381EXPORT_SYMBOL(sk_common_release);
2382
2383#ifdef CONFIG_PROC_FS
2384#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2385struct prot_inuse {
2386	int val[PROTO_INUSE_NR];
2387};
2388
2389static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2390
2391#ifdef CONFIG_NET_NS
2392void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2393{
2394	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2395}
2396EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2397
2398int sock_prot_inuse_get(struct net *net, struct proto *prot)
2399{
2400	int cpu, idx = prot->inuse_idx;
2401	int res = 0;
2402
2403	for_each_possible_cpu(cpu)
2404		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2405
2406	return res >= 0 ? res : 0;
2407}
2408EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2409
2410static int __net_init sock_inuse_init_net(struct net *net)
2411{
2412	net->core.inuse = alloc_percpu(struct prot_inuse);
2413	return net->core.inuse ? 0 : -ENOMEM;
2414}
2415
2416static void __net_exit sock_inuse_exit_net(struct net *net)
2417{
2418	free_percpu(net->core.inuse);
2419}
2420
2421static struct pernet_operations net_inuse_ops = {
2422	.init = sock_inuse_init_net,
2423	.exit = sock_inuse_exit_net,
2424};
2425
2426static __init int net_inuse_init(void)
2427{
2428	if (register_pernet_subsys(&net_inuse_ops))
2429		panic("Cannot initialize net inuse counters");
2430
2431	return 0;
2432}
2433
2434core_initcall(net_inuse_init);
2435#else
2436static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2437
2438void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2439{
2440	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2441}
2442EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2443
2444int sock_prot_inuse_get(struct net *net, struct proto *prot)
2445{
2446	int cpu, idx = prot->inuse_idx;
2447	int res = 0;
2448
2449	for_each_possible_cpu(cpu)
2450		res += per_cpu(prot_inuse, cpu).val[idx];
2451
2452	return res >= 0 ? res : 0;
2453}
2454EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2455#endif
2456
2457static void assign_proto_idx(struct proto *prot)
2458{
2459	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2460
2461	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2462		pr_err("PROTO_INUSE_NR exhausted\n");
2463		return;
2464	}
2465
2466	set_bit(prot->inuse_idx, proto_inuse_idx);
2467}
2468
2469static void release_proto_idx(struct proto *prot)
2470{
2471	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2472		clear_bit(prot->inuse_idx, proto_inuse_idx);
2473}
2474#else
2475static inline void assign_proto_idx(struct proto *prot)
2476{
2477}
2478
2479static inline void release_proto_idx(struct proto *prot)
2480{
2481}
2482#endif
2483
2484int proto_register(struct proto *prot, int alloc_slab)
2485{
2486	if (alloc_slab) {
2487		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2488					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2489					NULL);
2490
2491		if (prot->slab == NULL) {
2492			pr_crit("%s: Can't create sock SLAB cache!\n",
2493				prot->name);
2494			goto out;
2495		}
2496
2497		if (prot->rsk_prot != NULL) {
2498			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2499			if (prot->rsk_prot->slab_name == NULL)
2500				goto out_free_sock_slab;
2501
2502			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2503								 prot->rsk_prot->obj_size, 0,
2504								 SLAB_HWCACHE_ALIGN, NULL);
2505
2506			if (prot->rsk_prot->slab == NULL) {
2507				pr_crit("%s: Can't create request sock SLAB cache!\n",
2508					prot->name);
2509				goto out_free_request_sock_slab_name;
2510			}
2511		}
2512
2513		if (prot->twsk_prot != NULL) {
2514			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2515
2516			if (prot->twsk_prot->twsk_slab_name == NULL)
2517				goto out_free_request_sock_slab;
2518
2519			prot->twsk_prot->twsk_slab =
2520				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2521						  prot->twsk_prot->twsk_obj_size,
2522						  0,
2523						  SLAB_HWCACHE_ALIGN |
2524							prot->slab_flags,
2525						  NULL);
2526			if (prot->twsk_prot->twsk_slab == NULL)
2527				goto out_free_timewait_sock_slab_name;
2528		}
2529	}
2530
2531	mutex_lock(&proto_list_mutex);
2532	list_add(&prot->node, &proto_list);
2533	assign_proto_idx(prot);
2534	mutex_unlock(&proto_list_mutex);
2535	return 0;
2536
2537out_free_timewait_sock_slab_name:
2538	kfree(prot->twsk_prot->twsk_slab_name);
2539out_free_request_sock_slab:
2540	if (prot->rsk_prot && prot->rsk_prot->slab) {
2541		kmem_cache_destroy(prot->rsk_prot->slab);
2542		prot->rsk_prot->slab = NULL;
2543	}
2544out_free_request_sock_slab_name:
2545	if (prot->rsk_prot)
2546		kfree(prot->rsk_prot->slab_name);
2547out_free_sock_slab:
2548	kmem_cache_destroy(prot->slab);
2549	prot->slab = NULL;
2550out:
2551	return -ENOBUFS;
2552}
2553EXPORT_SYMBOL(proto_register);
2554
2555void proto_unregister(struct proto *prot)
2556{
2557	mutex_lock(&proto_list_mutex);
2558	release_proto_idx(prot);
2559	list_del(&prot->node);
2560	mutex_unlock(&proto_list_mutex);
2561
2562	if (prot->slab != NULL) {
2563		kmem_cache_destroy(prot->slab);
2564		prot->slab = NULL;
2565	}
2566
2567	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2568		kmem_cache_destroy(prot->rsk_prot->slab);
2569		kfree(prot->rsk_prot->slab_name);
2570		prot->rsk_prot->slab = NULL;
2571	}
2572
2573	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2574		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2575		kfree(prot->twsk_prot->twsk_slab_name);
2576		prot->twsk_prot->twsk_slab = NULL;
2577	}
2578}
2579EXPORT_SYMBOL(proto_unregister);
2580
2581#ifdef CONFIG_PROC_FS
2582static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2583	__acquires(proto_list_mutex)
2584{
2585	mutex_lock(&proto_list_mutex);
2586	return seq_list_start_head(&proto_list, *pos);
2587}
2588
2589static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2590{
2591	return seq_list_next(v, &proto_list, pos);
2592}
2593
2594static void proto_seq_stop(struct seq_file *seq, void *v)
2595	__releases(proto_list_mutex)
2596{
2597	mutex_unlock(&proto_list_mutex);
2598}
2599
2600static char proto_method_implemented(const void *method)
2601{
2602	return method == NULL ? 'n' : 'y';
2603}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ?
	       proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	       (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
}
2614
2615static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2616{
2617
2618	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2619			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2620		   proto->name,
2621		   proto->obj_size,
2622		   sock_prot_inuse_get(seq_file_net(seq), proto),
2623		   sock_prot_memory_allocated(proto),
2624		   sock_prot_memory_pressure(proto),
2625		   proto->max_header,
2626		   proto->slab == NULL ? "no" : "yes",
2627		   module_name(proto->owner),
2628		   proto_method_implemented(proto->close),
2629		   proto_method_implemented(proto->connect),
2630		   proto_method_implemented(proto->disconnect),
2631		   proto_method_implemented(proto->accept),
2632		   proto_method_implemented(proto->ioctl),
2633		   proto_method_implemented(proto->init),
2634		   proto_method_implemented(proto->destroy),
2635		   proto_method_implemented(proto->shutdown),
2636		   proto_method_implemented(proto->setsockopt),
2637		   proto_method_implemented(proto->getsockopt),
2638		   proto_method_implemented(proto->sendmsg),
2639		   proto_method_implemented(proto->recvmsg),
2640		   proto_method_implemented(proto->sendpage),
2641		   proto_method_implemented(proto->bind),
2642		   proto_method_implemented(proto->backlog_rcv),
2643		   proto_method_implemented(proto->hash),
2644		   proto_method_implemented(proto->unhash),
2645		   proto_method_implemented(proto->get_port),
2646		   proto_method_implemented(proto->enter_memory_pressure));
2647}
2648
2649static int proto_seq_show(struct seq_file *seq, void *v)
2650{
2651	if (v == &proto_list)
2652		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2653			   "protocol",
2654			   "size",
2655			   "sockets",
2656			   "memory",
2657			   "press",
2658			   "maxhdr",
2659			   "slab",
2660			   "module",
2661			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2662	else
2663		proto_seq_printf(seq, list_entry(v, struct proto, node));
2664	return 0;
2665}
2666
2667static const struct seq_operations proto_seq_ops = {
2668	.start  = proto_seq_start,
2669	.next   = proto_seq_next,
2670	.stop   = proto_seq_stop,
2671	.show   = proto_seq_show,
2672};
2673
2674static int proto_seq_open(struct inode *inode, struct file *file)
2675{
2676	return seq_open_net(inode, file, &proto_seq_ops,
2677			    sizeof(struct seq_net_private));
2678}
2679
2680static const struct file_operations proto_seq_fops = {
2681	.owner		= THIS_MODULE,
2682	.open		= proto_seq_open,
2683	.read		= seq_read,
2684	.llseek		= seq_lseek,
2685	.release	= seq_release_net,
2686};
2687
2688static __net_init int proto_init_net(struct net *net)
2689{
2690	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2691		return -ENOMEM;
2692
2693	return 0;
2694}
2695
2696static __net_exit void proto_exit_net(struct net *net)
2697{
2698	proc_net_remove(net, "protocols");
2699}
2700
2701
2702static __net_initdata struct pernet_operations proto_net_ops = {
2703	.init = proto_init_net,
2704	.exit = proto_exit_net,
2705};
2706
2707static int __init proto_init(void)
2708{
2709	return register_pernet_subsys(&proto_net_ops);
2710}
2711
2712subsys_initcall(proto_init);
2713
2714#endif /* PROC_FS */
2715